diff --git a/Documentation/op-guide/etcd3_alert.rules b/Documentation/op-guide/etcd3_alert.rules
new file mode 100644
index 000000000..90c3770e8
--- /dev/null
+++ b/Documentation/op-guide/etcd3_alert.rules
@@ -0,0 +1,206 @@
+# general cluster availability
+
+# alert if another failed member will result in an unavailable cluster
+ALERT InsufficientMembers
+IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+FOR 3m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "etcd cluster insufficient members",
+  description = "If one more etcd member goes down the cluster will be unavailable",
+}
+
+# etcd leader alerts
+# ==================
+
+# alert if any etcd instance has no leader
+ALERT NoLeader
+IF etcd_server_has_leader{job="etcd"} == 0
+FOR 1m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "etcd member has no leader",
+  description = "etcd member {{ $labels.instance }} has no leader",
+}
+
+# alert if there are lots of leader changes
+ALERT HighNumberOfLeaderChanges
+IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of leader changes within the etcd cluster are happening",
+  description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
+}
+
+# gRPC request alerts
+# ===================
+
+# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
+ALERT HighNumberOfFailedGRPCRequests
+IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of gRPC requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
+ALERT HighNumberOfFailedGRPCRequests
+IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
+FOR 5m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "a high number of gRPC requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if the 99th percentile of gRPC method calls take more than 150ms
+ALERT GRPCRequestsSlow
+IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "slow gRPC requests",
+  description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
+}
+
+# HTTP requests alerts
+# ====================
+
+# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
+ALERT HighNumberOfFailedHTTPRequests
+IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
+ALERT HighNumberOfFailedHTTPRequests
+IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
+FOR 5m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if the 99th percentile of HTTP requests take more than 150ms
+ALERT HTTPRequestsSlow
+IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "slow HTTP requests",
+  description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+}
+
+# file descriptor alerts
+# ======================
+
+instance:fd_utilization = process_open_fds / process_max_fds
+
+# alert if file descriptors are likely to exhaust within the next 4 hours
+ALERT FdExhaustionClose
+IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "file descriptors soon exhausted",
+  description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
+}
+
+# alert if file descriptors are likely to exhaust within the next hour
+ALERT FdExhaustionClose
+IF predict_linear(instance:fd_utilization[10m], 3600) > 1
+FOR 10m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "file descriptors soon exhausted",
+  description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
+}
+
+# etcd member communication alerts
+# ================================
+
+# alert if the 99th percentile of round trips take more than 150ms
+ALERT EtcdMemberCommunicationSlow
+IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "etcd member communication is slow",
+  description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
+}
+
+# etcd proposal alerts
+# ====================
+
+# alert if there are several failed proposals within an hour
+ALERT HighNumberOfFailedProposals
+IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of proposals within the etcd cluster are failing",
+  description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+}
+
+# etcd disk io latency alerts
+# ===========================
+
+# alert if the 99th percentile of fsync durations is higher than 500ms
+ALERT HighFsyncDurations
+IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high fsync durations",
+  description = "etcd instance {{ $labels.instance }} fsync durations are high",
+}
+
+# alert if the 99th percentile of commit durations is higher than 250ms
+ALERT HighCommitDurations
+IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high commit durations",
+  description = "etcd instance {{ $labels.instance }} commit durations are high",
+}
diff --git a/Documentation/op-guide/monitoring.md b/Documentation/op-guide/monitoring.md
index 9ed354ebf..7a9812706 100644
--- a/Documentation/op-guide/monitoring.md
+++ b/Documentation/op-guide/monitoring.md
@@ -56,6 +56,12 @@ nohup /tmp/prometheus \
 
 Now Prometheus will scrape etcd metrics every 10 seconds.
 
+## Alerting
+
+There is a [set of default alerts for etcd v3 clusters](./etcd3_alert.rules).
+
+> Note: `job` labels may need to be adjusted to fit a particular need. The rules were written to apply to a single cluster, so it is recommended to choose labels unique to a cluster.
+
 ## Grafana
 
 [Grafana][grafana] has built-in Prometheus support; just add a Prometheus data source:
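The `Alerting` note above links to the rules file but does not show how Prometheus loads it. Since the rules use the Prometheus 1.x `ALERT` syntax, a minimal sketch of the wiring is a `rule_files` entry in the Prometheus configuration; the file path, scrape targets, and interval below are illustrative assumptions, not part of this patch:

```yaml
# prometheus.yml (sketch) -- load the etcd alert rules alongside the scrape config
global:
  scrape_interval: 10s        # matches the 10-second scrape mentioned in monitoring.md

rule_files:
  - "etcd3_alert.rules"       # path relative to prometheus.yml; adjust to the real location

scrape_configs:
  - job_name: etcd            # must match the job="etcd" selector used by the rules
    static_configs:
      - targets: ["10.240.0.32:2379", "10.240.0.33:2379"]   # example etcd client endpoints
```

For the alerts to be delivered anywhere, Prometheus 1.x also needs to be pointed at an Alertmanager (the `-alertmanager.url` flag).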
diff --git a/Documentation/v2/etcd_alert.rules b/Documentation/v2/etcd_alert.rules
new file mode 100644
index 000000000..1793cad49
--- /dev/null
+++ b/Documentation/v2/etcd_alert.rules
@@ -0,0 +1,121 @@
+### General cluster availability ###
+
+# alert if another failed member will result in an unavailable cluster
+ALERT InsufficientMembers
+  IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+  FOR 3m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "etcd cluster insufficient members",
+    description = "If one more etcd member goes down the cluster will be unavailable",
+  }
+
+### HTTP requests alerts ###
+
+# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
+ALERT HighNumberOfFailedHTTPRequests
+  IF sum by(method) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
+    / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "a high number of HTTP requests are failing",
+    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+  }
+
+# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
+ALERT HighNumberOfFailedHTTPRequests
+  IF sum by(method) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
+    / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
+  FOR 5m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "a high number of HTTP requests are failing",
+    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+  }
+
+# alert if more than 50% of requests get a 4xx response
+ALERT HighNumberOfFailedHTTPRequests
+  IF sum by(method) (rate(etcd_http_failed_total{job="etcd", code=~"4[0-9]{2}"}[5m]))
+    / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.5
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "a high number of HTTP requests are failing",
+    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
+  }
+
+# alert if the 99th percentile of HTTP requests take more than 150ms
+ALERT HTTPRequestsSlow
+  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "slow HTTP requests",
+    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+  }
+
+### File descriptor alerts ###
+
+instance:fd_utilization = process_open_fds / process_max_fds
+
+# alert if file descriptors are likely to exhaust within the next 4 hours
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
+  }
+
+# alert if file descriptors are likely to exhaust within the next hour
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
+  }
+
+### etcd proposal alerts ###
+
+# alert if there are several failed proposals within an hour
+ALERT HighNumberOfFailedProposals
+  IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "a high number of proposals within the etcd cluster are failing",
+    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+  }
+
+### etcd disk io latency alerts ###
+
+# alert if the 99th percentile of fsync durations is higher than 500ms
+ALERT HighFsyncDurations
+  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "high fsync durations",
+    description = "etcd instance {{ $labels.instance }} fsync durations are high",
+  }
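Both rule files open with the same `InsufficientMembers` availability alert, whose threshold expression is easy to misread. As a worked example, assume a hypothetical five-member cluster (the member count is purely illustrative):

```
# count(up{job="etcd"})              = 5    members scraped by Prometheus
# count(up{job="etcd"}) / 2 - 1      = 1.5  alert threshold
# count(up{job="etcd"} == 0) > 1.5   =>     fires once 2 members are down
#
# with 2 of 5 members down, losing one more leaves only 2 alive, below the
# quorum of 3 -- exactly the "one more failure" case the alert warns about
```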