# Prometheus alerting and recording rules for etcd, written in the
# Prometheus 1.x rule syntax (ALERT ... IF ... FOR ... LABELS ... ANNOTATIONS).
#
# Every selector that carries a job matcher expects the scrape job to be
# named "etcd". '#' starts a comment that runs to the end of the line, so
# each rule clause must stay on its own line.

### General cluster availability ###

# Alert if another failed member will result in an unavailable cluster:
# fires when the number of down members exceeds (members / 2 - 1) for 3m.
# When no member is down the left-hand side is empty and nothing fires.
ALERT InsufficientMembers
  IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
  FOR 3m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "etcd cluster insufficient members",
    description = "If one more etcd member goes down the cluster will be unavailable",
  }

### HTTP requests alerts ###

# The three rules below share one alert name and differ by threshold and
# severity. Each expression is scaled by 100 so that {{ $value }} really is
# the percentage printed in the description. The ratio is aggregated per
# HTTP method only, so no instance label is available to the templates.

# Alert if more than 1% of requests for an HTTP method have failed with a
# non-4xx response over the last 5m, sustained for 10m.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (
        sum by(method) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m]))
     ) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd",
  }

# Alert if more than 5% of requests for an HTTP method have failed with a
# non-4xx response over the last 5m, sustained for 5m.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (
        sum by(method) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m]))
     ) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd",
  }

# Alert if more than 50% of requests for an HTTP method got a 4xx response
# over the last 5m, sustained for 10m.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (
        sum by(method) (rate(etcd_http_failed_total{job="etcd", code=~"4[0-9]{2}"}[5m]))
      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m]))
     ) > 50
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd",
  }

# Alert if the 99th percentile of successful HTTP request durations stays
# above 150ms for 10m. No aggregation is applied, so the series keep their
# labels and the description can reference them.
# NOTE(review): the histogram metric name differs between etcd releases;
# confirm it against the /metrics output of the deployed version.
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }

### File descriptor alerts ###

# Recording rule: fraction of the file descriptor limit currently in use.
# It has no job matcher, so it is recorded for every series exposing
# process_open_fds and process_max_fds.
instance:fd_utilization = process_open_fds / process_max_fds

# Alert if a linear extrapolation of the last 1h of fd utilization exceeds
# the limit within the next 4 hours, sustained for 10m.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }

# Alert if a linear extrapolation of the last 10m of fd utilization exceeds
# the limit within the next hour, sustained for 10m.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }

### etcd proposal alerts ###

# Alert if a member has seen more than 5 failed proposals within an hour.
# There is no FOR clause, so the alert is not held pending for any duration.
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of proposals within the etcd cluster are failing",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }

### etcd disk io latency alerts ###

# Alert if the 99th percentile of WAL fsync durations stays above 500ms
# for 10m.
# NOTE(review): the histogram metric name differs between etcd releases;
# confirm it against the /metrics output of the deployed version.
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }