From 1b2dc1c796377e94dd9f34828cc6d7dbfdcc80b0 Mon Sep 17 00:00:00 2001 From: Michal Witkowski Date: Thu, 2 Jul 2015 15:03:39 +0100 Subject: [PATCH] metrics: add `events` metrics in etcdhttp. --- Documentation/metrics.md | 71 ++++++++++++------------- error/error.go | 4 +- error/error_test.go | 4 +- etcdserver/etcdhttp/client.go | 11 ++-- etcdserver/etcdhttp/metrics.go | 96 ++++++++++++++++++++++++++++++++++ 5 files changed, 140 insertions(+), 46 deletions(-) create mode 100644 etcdserver/etcdhttp/metrics.go diff --git a/Documentation/metrics.md b/Documentation/metrics.md index 2a185a2ca..fc096693d 100644 --- a/Documentation/metrics.md +++ b/Documentation/metrics.md @@ -30,45 +30,6 @@ Pending proposal (`pending_proposal_total`) gives you an idea about how many pro Failed proposals (`proposal_failed_total`) are normally related to two issues: temporary failures related to a leader election or longer duration downtime caused by a loss of quorum in the cluster. - -### store - -These metrics describe the accesses into the data store of etcd members that exist in the cluster. They -are useful to count what kind of actions are taken by users. It is also useful to see and whether all etcd members -"see" the same set of data mutations, and whether reads and watches (which are local) are equally distributed. - -All these metrics are prefixed with `etcd_store_`. - -| Name | Description | Type | -|---------------------------|------------------------------------------------------------------------------------------|--------------------| -| reads_total | Total number of reads from store, should differ among etcd members (local reads). | Counter(action) | -| writes_total | Total number of writes to store, should be same among all etcd members. | Counter(action) | -| reads_failed_total | Number of failed reads from store (e.g. key missing) on local reads. | Counter(action) | -| writes_failed_total | Number of failed writes to store (e.g. failed compare and swap). | Counter(action) | -| expires_total | Total number of expired keys (due to TTL).   | Counter | -| watch_requests_totals | Total number of incoming watch requests to this etcd member (local watches). | Counter | -| watchers | Current count of active watchers on this etcd member. | Gauge | - -Both `reads_total` and `writes_total` count both successful and failed requests. `reads_failed_total` and -`writes_failed_total` count failed requests. A lot of failed writes indicate possible contentions on keys (e.g. when -doing `compareAndSet`), and read failures indicate that some clients try to access keys that don't exist. - -Example Prometheus queries that may be useful from these metrics (across all etcd members): - - * `sum(rate(etcd_store_reads_total{job="etcd"}[1m])) by (action)` - `max(rate(etcd_store_writes_total{job="etcd"}[1m])) by (action)` - - Rate of reads and writes by action, across all servers across a time window of `1m`. The reason why `max` is used - for writes as opposed to `sum` for reads is because all of etcd nodes in the cluster apply all writes to their stores. - Shows the rate of successful readonly/write queries across all servers, across a time window of `1m`. - * `sum(rate(etcd_store_watch_requests_total{job="etcd"}[1m]))` - - Shows rate of new watch requests per second. Likely driven by how often watched keys change. - * `sum(etcd_store_watchers{job="etcd"})` - - Number of active watchers across all etcd servers. - - ### wal | Name | Description | Type | @@ -78,6 +39,38 @@ Example Prometheus queries that may be useful from these metrics (across all etc Abnormally high fsync duration (`fsync_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable. + +### http requests + +These metrics describe the serving of requests (non-watch events) served by etcd members in non-proxy mode: total +incoming requests, request failures and processing latency (inc. raft rounds for storage). They are useful for tracking + user-generated traffic hitting the etcd cluster . + +All these metrics are prefixed with `etcd_http_` + +| Name | Description | Type | +|--------------------------------|-----------------------------------------------------------------------------------------|--------------------| +| received_total | Total number of events after parsing and auth. | Counter(method) | +| failed_total | Total number of failed events.   | Counter(method,error) | +| successful_duration_second | Bucketed handling times of the requests, including raft rounds for writes. | Histogram(method) | + + +Example Prometheus queries that may be useful from these metrics (across all etcd members): + + * `sum(rate(etcd_http_failed_total{job="etcd"}[1m]) by (method) / sum(rate(etcd_http_events_received_total{job="etcd"})[1m]) by (method)` + + Shows the fraction of events that failed by HTTP method across all members, across a time window of `1m`. + + * `sum(rate(etcd_http_received_total{job="etcd",method="GET})[1m]) by (method)` + `sum(rate(etcd_http_received_total{job="etcd",method~="GET})[1m]) by (method)` + + Shows the rate of successful readonly/write queries across all servers, across a time window of `1m`. + + * `histogram_quantile(0.9, sum(increase(etcd_http_successful_processing_seconds{job="etcd",method="GET"}[5m]) ) by (le))` + `histogram_quantile(0.9, sum(increase(etcd_http_successful_processing_seconds{job="etcd",method!="GET"}[5m]) ) by (le))` + + Show the 0.90-tile latency (in seconds) of read/write (respectively) event handling across all members, with a window of `5m`. + ### snapshot | Name | Description | Type | diff --git a/error/error.go b/error/error.go index 92f2b8b9a..56facdc01 100644 --- a/error/error.go +++ b/error/error.go @@ -143,7 +143,7 @@ func (e Error) toJsonString() string { return string(b) } -func (e Error) statusCode() int { +func (e Error) StatusCode() int { status, ok := errorStatus[e.ErrorCode] if !ok { status = http.StatusBadRequest @@ -154,6 +154,6 @@ func (e Error) statusCode() int { func (e Error) WriteTo(w http.ResponseWriter) { w.Header().Add("X-Etcd-Index", fmt.Sprint(e.Index)) w.Header().Set("Content-Type", "application/json") - w.WriteHeader(e.statusCode()) + w.WriteHeader(e.StatusCode()) fmt.Fprintln(w, e.toJsonString()) } diff --git a/error/error_test.go b/error/error_test.go index 1fdfacb75..9251f428b 100644 --- a/error/error_test.go +++ b/error/error_test.go @@ -28,8 +28,8 @@ func TestErrorWriteTo(t *testing.T) { rr := httptest.NewRecorder() err.WriteTo(rr) - if err.statusCode() != rr.Code { - t.Errorf("HTTP status code %d, want %d", rr.Code, err.statusCode()) + if err.StatusCode() != rr.Code { + t.Errorf("HTTP status code %d, want %d", rr.Code, err.StatusCode()) } gbody := strings.TrimSuffix(rr.Body.String(), "\n") diff --git a/etcdserver/etcdhttp/client.go b/etcdserver/etcdhttp/client.go index 7553392c5..e56f7b4e9 100644 --- a/etcdserver/etcdhttp/client.go +++ b/etcdserver/etcdhttp/client.go @@ -128,8 +128,9 @@ func (h *keysHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { ctx, cancel := context.WithTimeout(context.Background(), h.timeout) defer cancel() - - rr, err := parseKeyRequest(r, clockwork.NewRealClock()) + clock := clockwork.NewRealClock() + startTime := clock.Now() + rr, err := parseKeyRequest(r, clock) if err != nil { writeKeyError(w, err) return @@ -139,11 +140,14 @@ func (h *keysHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { writeKeyNoAuth(w) return } - + if !rr.Wait { + reportRequestReceived(rr) + } resp, err := h.server.Do(ctx, rr) if err != nil { err = trimErrorPrefix(err, etcdserver.StoreKeysPrefix) writeKeyError(w, err) + reportRequestFailed(rr, err) return } switch { @@ -152,6 +156,7 @@ func (h *keysHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { // Should never be reached plog.Errorf("error writing event (%v)", err) } + reportRequestCompleted(rr, resp, startTime) case resp.Watcher != nil: ctx, cancel := context.WithTimeout(context.Background(), defaultWatchTimeout) defer cancel() diff --git a/etcdserver/etcdhttp/metrics.go b/etcdserver/etcdhttp/metrics.go new file mode 100644 index 000000000..f037d3c33 --- /dev/null +++ b/etcdserver/etcdhttp/metrics.go @@ -0,0 +1,96 @@ +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package etcdhttp + +import ( + "strconv" + "time" + + "net/http" + + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/client_golang/prometheus" + etcdErr "github.com/coreos/etcd/error" + "github.com/coreos/etcd/etcdserver" + "github.com/coreos/etcd/etcdserver/etcdhttp/httptypes" + "github.com/coreos/etcd/etcdserver/etcdserverpb" +) + +var ( + incomingEvents = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "http", + Name: "received_total", + Help: "Counter of requests received into the system (successfully parsed and authd).", + }, []string{"method"}) + + failedEvents = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "http", + Name: "failed_total", + Help: "Counter of handle failures of requests (non-watches), by method (GET/PUT etc.) and code (400, 500 etc.).", + }, []string{"method", "code"}) + + successfulEventsHandlingTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "etcd", + Subsystem: "http", + Name: "successful_duration_second", + Help: "Bucketed histogram of processing time (s) of successfully handled requests (non-watches), by method (GET/PUT etc.).", + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + }, []string{"method"}) +) + +func init() { + prometheus.MustRegister(incomingEvents) + prometheus.MustRegister(failedEvents) + prometheus.MustRegister(successfulEventsHandlingTime) +} + +func reportRequestReceived(request etcdserverpb.Request) { + incomingEvents.WithLabelValues(methodFromRequest(request)).Inc() +} + +func reportRequestCompleted(request etcdserverpb.Request, response etcdserver.Response, startTime time.Time) { + method := methodFromRequest(request) + successfulEventsHandlingTime.WithLabelValues(method).Observe(time.Since(startTime).Seconds()) +} + +func reportRequestFailed(request etcdserverpb.Request, err error) { + method := methodFromRequest(request) + failedEvents.WithLabelValues(method, strconv.Itoa(codeFromError(err))).Inc() +} + +func methodFromRequest(request etcdserverpb.Request) string { + if request.Method == "GET" && request.Quorum { + return "QGET" + } + return request.Method +} + +func codeFromError(err error) int { + if err == nil { + return http.StatusInternalServerError + } + switch e := err.(type) { + case *etcdErr.Error: + return (*etcdErr.Error)(e).StatusCode() + case *httptypes.HTTPError: + return (*httptypes.HTTPError)(e).Code + default: + return http.StatusInternalServerError + } +}