From 358cc1a8fa966ebef21545daa8a09e3b61ab8f14 Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Thu, 8 Nov 2018 15:15:04 +0300 Subject: [PATCH 1/3] doc: sync prometheus rules with prometheus-operator version (and remove non-etcd specific FdExhaustionClose) https://github.com/coreos/prometheus-operator/blob/master/helm/exporter-kube-etcd/templates/etcd3.rules.yaml sync etcd alert rules with libsonnet Signed-off-by: Dmitry Verkhoturov --- Documentation/etcd-mixin/mixin.libsonnet | 32 ++-- Documentation/op-guide/etcd3_alert.rules.yml | 158 +++++++++++-------- 2 files changed, 111 insertions(+), 79 deletions(-) diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet index f8c79caf7..4f9f53c7c 100644 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ b/Documentation/etcd-mixin/mixin.libsonnet @@ -149,33 +149,45 @@ }, }, { - record: 'instance:fd_utilization', - expr: 'process_open_fds / process_max_fds', - }, - { - alert: 'FdExhaustionClose', + alert: 'EtcdHighNumberOfFailedHTTPRequests', expr: ||| - predict_linear(instance:fd_utilization{%(etcd_selector)s}[1h], 3600 * 4) > 1 + sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) + BY (method) > 0.01 ||| % $._config, 'for': '10m', labels: { severity: 'warning', }, annotations: { - message: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon', + message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}', }, }, { - alert: 'FdExhaustionClose', + alert: 'EtcdHighNumberOfFailedHTTPRequests', expr: ||| - predict_linear(instance:fd_utilization{%(etcd_selector)s}[10m], 3600) > 1 + sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) + BY (method) > 0.05 ||| % $._config, 'for': '10m', labels: { severity: 'critical', }, annotations: { - description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon', + message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.', + }, + }, + { + alert: 'EtcdHTTPRequestsSlow', + expr: ||| + histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.', }, }, ], diff --git a/Documentation/op-guide/etcd3_alert.rules.yml b/Documentation/op-guide/etcd3_alert.rules.yml index b6caee674..83cc5e0df 100644 --- a/Documentation/op-guide/etcd3_alert.rules.yml +++ b/Documentation/op-guide/etcd3_alert.rules.yml @@ -1,113 +1,133 @@ groups: -- name: etcd3_alert.rules +- name: etcd rules: - - alert: InsufficientMembers - expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + - alert: EtcdInsufficientMembers + annotations: + message: 'Etcd cluster "{{ $labels.job }}": insufficient members ({{ $value + }}).' + expr: | + sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) for: 3m labels: severity: critical + - alert: EtcdNoLeader annotations: - description: If one more etcd member goes down the cluster will be unavailable - summary: etcd cluster insufficient members - - alert: NoLeader - expr: etcd_server_has_leader{job="etcd"} == 0 + message: 'Etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has + no leader.' + expr: | + etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical + - alert: EtcdHighNumberOfLeaderChanges annotations: - description: etcd member {{ $labels.instance }} has no leader - summary: etcd member has no leader - - alert: HighNumberOfLeaderChanges - expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + message: 'Etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} + has seen {{ $value }} leader changes within the last hour.' + expr: | + rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 + for: 15m labels: severity: warning + - alert: EtcdHighNumberOfFailedGRPCRequests annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader - changes within the last hour - summary: a high number of leader changes within the etcd cluster are happening - - alert: HighNumberOfFailedGRPCRequests - expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)) > 1 + message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ + $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' + expr: | + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) + > 1 for: 10m labels: severity: warning + - alert: EtcdHighNumberOfFailedGRPCRequests annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: HighNumberOfFailedGRPCRequests - expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)) > 5 + message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ + $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' + expr: | + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) + > 5 for: 5m labels: severity: critical + - alert: EtcdGRPCRequestsSlow annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: GRPCRequestsSlow - expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) + message: 'Etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method + }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' + expr: | + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) > 0.15 for: 10m labels: severity: critical - annotations: - description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method - }} are slow - summary: slow gRPC requests - - record: instance:fd_utilization - expr: process_open_fds / process_max_fds - - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust - its file descriptors soon' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust - its file descriptors soon' - summary: file descriptors soon exhausted - alert: EtcdMemberCommunicationSlow - expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) + annotations: + message: 'Etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To + }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' + expr: | + histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m labels: severity: warning + - alert: EtcdHighNumberOfFailedProposals annotations: - description: etcd instance {{ $labels.instance }} member communication with - {{ $labels.To }} is slow - summary: etcd member communication is slow - - alert: HighNumberOfFailedProposals - expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + message: 'Etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within + the last hour on etcd instance {{ $labels.instance }}.' + expr: | + rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m labels: severity: warning + - alert: EtcdHighFsyncDurations annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal - failures within the last hour - summary: a high number of proposals within the etcd cluster are failing - - alert: HighFsyncDurations - expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + message: 'Etcd cluster "{{ $labels.job }}": 99th percentile fync durations are + {{ $value }}s on etcd instance {{ $labels.instance }}.' + expr: | + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m labels: severity: warning + - alert: EtcdHighCommitDurations annotations: - description: etcd instance {{ $labels.instance }} fync durations are high - summary: high fsync durations - - alert: HighCommitDurations - expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + message: 'Etcd cluster "{{ $labels.job }}": 99th percentile commit durations + {{ $value }}s on etcd instance {{ $labels.instance }}.' + expr: | + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m labels: severity: warning + - alert: EtcdHighNumberOfFailedHTTPRequests annotations: - description: etcd instance {{ $labels.instance }} commit durations are high - summary: high commit durations + message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + expr: | + sum(rate(etcd_http_failed_total{job=~".*etcd.*"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + - alert: EtcdHighNumberOfFailedHTTPRequests + annotations: + message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}.' + expr: | + sum(rate(etcd_http_failed_total{job=~".*etcd.*"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + BY (method) > 0.05 + for: 10m + labels: + severity: critical + - alert: EtcdHTTPRequestsSlow + annotations: + message: Etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow. + expr: | + histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning From 830d064903be5c8e23170aa7d738584003fdec37 Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Thu, 15 Nov 2018 12:06:01 +0300 Subject: [PATCH 2/3] doc: convert etcd to lower-case everywhere --- Documentation/etcd-mixin/mixin.libsonnet | 48 +++++++++---------- Documentation/op-guide/etcd3_alert.rules.yml | 49 ++++++++++---------- 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet index 4f9f53c7c..13e0dcfa4 100644 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ b/Documentation/etcd-mixin/mixin.libsonnet @@ -9,7 +9,7 @@ name: 'etcd', rules: [ { - alert: 'EtcdInsufficientMembers', + alert: 'etcdInsufficientMembers', expr: ||| sum(up{%(etcd_selector)s} == bool 1) by (job) < ((count(up{%(etcd_selector)s}) by (job) + 1) / 2) ||| % $._config, @@ -18,11 +18,11 @@ severity: 'critical', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).', + message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).', }, }, { - alert: 'EtcdNoLeader', + alert: 'etcdNoLeader', expr: ||| etcd_server_has_leader{%(etcd_selector)s} == 0 ||| % $._config, @@ -31,11 +31,11 @@ severity: 'critical', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.', + message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.', }, }, { - alert: 'EtcdHighNumberOfLeaderChanges', + alert: 'etcdHighNumberOfLeaderChanges', expr: ||| rate(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3 ||| % $._config, @@ -44,11 +44,11 @@ severity: 'warning', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.', + message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.', }, }, { - alert: 'EtcdHighNumberOfFailedGRPCRequests', + alert: 'etcdHighNumberOfFailedGRPCRequests', expr: ||| 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) / @@ -60,11 +60,11 @@ severity: 'warning', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', }, }, { - alert: 'EtcdHighNumberOfFailedGRPCRequests', + alert: 'etcdHighNumberOfFailedGRPCRequests', expr: ||| 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) / @@ -76,11 +76,11 @@ severity: 'critical', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', }, }, { - alert: 'EtcdGRPCRequestsSlow', + alert: 'etcdGRPCRequestsSlow', expr: ||| histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) > 0.15 @@ -90,11 +90,11 @@ severity: 'critical', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.', }, }, { - alert: 'EtcdMemberCommunicationSlow', + alert: 'etcdMemberCommunicationSlow', expr: ||| histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m])) > 0.15 @@ -104,11 +104,11 @@ severity: 'warning', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.', }, }, { - alert: 'EtcdHighNumberOfFailedProposals', + alert: 'etcdHighNumberOfFailedProposals', expr: ||| rate(etcd_server_proposals_failed_total{%(etcd_selector)s}[15m]) > 5 ||| % $._config, @@ -117,11 +117,11 @@ severity: 'warning', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.', + message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.', }, }, { - alert: 'EtcdHighFsyncDurations', + alert: 'etcdHighFsyncDurations', expr: ||| histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) > 0.5 @@ -131,11 +131,11 @@ severity: 'warning', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', + message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', }, }, { - alert: 'EtcdHighCommitDurations', + alert: 'etcdHighCommitDurations', expr: ||| histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m])) > 0.25 @@ -145,11 +145,11 @@ severity: 'warning', }, annotations: { - message: 'Etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.', + message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.', }, }, { - alert: 'EtcdHighNumberOfFailedHTTPRequests', + alert: 'etcdHighNumberOfFailedHTTPRequests', expr: ||| sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (method) > 0.01 @@ -163,7 +163,7 @@ }, }, { - alert: 'EtcdHighNumberOfFailedHTTPRequests', + alert: 'etcdHighNumberOfFailedHTTPRequests', expr: ||| sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (method) > 0.05 @@ -177,7 +177,7 @@ }, }, { - alert: 'EtcdHTTPRequestsSlow', + alert: 'etcdHTTPRequestsSlow', expr: ||| histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 @@ -187,7 +187,7 @@ severity: 'warning', }, annotations: { - message: 'Etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.', + message: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.', }, }, ], diff --git a/Documentation/op-guide/etcd3_alert.rules.yml b/Documentation/op-guide/etcd3_alert.rules.yml index 83cc5e0df..deaa89b6f 100644 --- a/Documentation/op-guide/etcd3_alert.rules.yml +++ b/Documentation/op-guide/etcd3_alert.rules.yml @@ -1,36 +1,37 @@ +# these rules synced manually from https://github.com/etcd-io/etcd/blob/master/Documentation/etcd-mixin/mixin.libsonnet groups: - name: etcd rules: - - alert: EtcdInsufficientMembers + - alert: etcdInsufficientMembers annotations: - message: 'Etcd cluster "{{ $labels.job }}": insufficient members ({{ $value + message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' expr: | sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) for: 3m labels: severity: critical - - alert: EtcdNoLeader + - alert: etcdNoLeader annotations: - message: 'Etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has + message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' expr: | etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical - - alert: EtcdHighNumberOfLeaderChanges + - alert: etcdHighNumberOfLeaderChanges annotations: - message: 'Etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} + message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.' expr: | rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 for: 15m labels: severity: warning - - alert: EtcdHighNumberOfFailedGRPCRequests + - alert: etcdHighNumberOfFailedGRPCRequests annotations: - message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ + message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' expr: | 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) @@ -40,9 +41,9 @@ groups: for: 10m labels: severity: warning - - alert: EtcdHighNumberOfFailedGRPCRequests + - alert: etcdHighNumberOfFailedGRPCRequests annotations: - message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ + message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' expr: | 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) @@ -52,9 +53,9 @@ groups: for: 5m labels: severity: critical - - alert: EtcdGRPCRequestsSlow + - alert: etcdGRPCRequestsSlow annotations: - message: 'Etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method + message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' expr: | histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) @@ -62,9 +63,9 @@ groups: for: 10m labels: severity: critical - - alert: EtcdMemberCommunicationSlow + - alert: etcdMemberCommunicationSlow annotations: - message: 'Etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To + message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' expr: | histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) @@ -72,18 +73,18 @@ groups: for: 10m labels: severity: warning - - alert: EtcdHighNumberOfFailedProposals + - alert: etcdHighNumberOfFailedProposals annotations: - message: 'Etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within + message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.' expr: | rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: severity: warning - - alert: EtcdHighFsyncDurations + - alert: etcdHighFsyncDurations annotations: - message: 'Etcd cluster "{{ $labels.job }}": 99th percentile fync durations are + message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' expr: | histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) @@ -91,9 +92,9 @@ groups: for: 10m labels: severity: warning - - alert: EtcdHighCommitDurations + - alert: etcdHighCommitDurations annotations: - message: 'Etcd cluster "{{ $labels.job }}": 99th percentile commit durations + message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' expr: | histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) @@ -101,7 +102,7 @@ groups: for: 10m labels: severity: warning - - alert: EtcdHighNumberOfFailedHTTPRequests + - alert: etcdHighNumberOfFailedHTTPRequests annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' @@ -111,7 +112,7 @@ groups: for: 10m labels: severity: warning - - alert: EtcdHighNumberOfFailedHTTPRequests + - alert: etcdHighNumberOfFailedHTTPRequests annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' @@ -121,9 +122,9 @@ groups: for: 10m labels: severity: critical - - alert: EtcdHTTPRequestsSlow + - alert: etcdHTTPRequestsSlow annotations: - message: Etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. expr: | histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) From 09290808341647d7e21e59fe6d1d68fd99395f4f Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Sat, 17 Nov 2018 23:30:24 +0300 Subject: [PATCH 3/3] doc: exclude 404 error because kubelet generating false positive --- Documentation/etcd-mixin/mixin.libsonnet | 4 ++-- Documentation/op-guide/etcd3_alert.rules.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet index 13e0dcfa4..b6cc471ff 100644 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ b/Documentation/etcd-mixin/mixin.libsonnet @@ -151,7 +151,7 @@ { alert: 'etcdHighNumberOfFailedHTTPRequests', expr: ||| - sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) + sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (method) > 0.01 ||| % $._config, 'for': '10m', @@ -165,7 +165,7 @@ { alert: 'etcdHighNumberOfFailedHTTPRequests', expr: ||| - sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) + sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (method) > 0.05 ||| % $._config, 'for': '10m', diff --git a/Documentation/op-guide/etcd3_alert.rules.yml b/Documentation/op-guide/etcd3_alert.rules.yml index deaa89b6f..17287172b 100644 --- a/Documentation/op-guide/etcd3_alert.rules.yml +++ b/Documentation/op-guide/etcd3_alert.rules.yml @@ -107,7 +107,7 @@ groups: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' expr: | - sum(rate(etcd_http_failed_total{job=~".*etcd.*"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.01 for: 10m labels: @@ -117,7 +117,7 @@ groups: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' expr: | - sum(rate(etcd_http_failed_total{job=~".*etcd.*"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.05 for: 10m labels: