Documentation/etcd-mixin: Fix etcdHighNumberOfLeaderChanges (#11448)

The `etcdHighNumberOfLeaderChanges` alert picked up a copy-and-paste error when it was converted from docs to mixin in #10244: the expression changed from "increase over 15m > 3" to "rate over 15m > 3", which is not equivalent (rate is measured per second, so it should have been "rate over 15m > (3 / 60 / 15)"). As part of fixing that, we also need to catch the case where Prometheus starts, or a new etcd cluster is first scraped, with an already-high leader change count - i.e. if you start a new etcd cluster and at the moment Prometheus first scrapes it you are already at 5 leader changes, we should fire on that transition. This alert is also now more responsive, so if you get a quick burst of 3 leader changes we'll alert within 5m rather than 15m.
2019-12-13 19:00:11 -05:00 · 2019-12-13 19:00:11 -05:00 · 322c38e169
parent 7f3dd59d22
commit 322c38e169
2 changed files with 33 additions and 3 deletions
--- a/Documentation/etcd-mixin/mixin.libsonnet
+++ b/Documentation/etcd-mixin/mixin.libsonnet
@ -57,14 +57,14 @@
          {
            alert: 'etcdHighNumberOfLeaderChanges',
            expr: |||
-              rate(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3
+              increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3
            ||| % $._config,
-            'for': '15m',
+            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
-              message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes.',
+              message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
            },
          },
          {
--- a/Documentation/etcd-mixin/test.yaml
+++ b/Documentation/etcd-mixin/test.yaml
@ -83,3 +83,33 @@ tests:
              severity: critical
            exp_annotations:
              message: 'etcd cluster "etcd": members are down (1).'
  - interval: 1m
    input_series:
      - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
        values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0'
      - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
        values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
      - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
        values: '0 0 0 0 0 0 0 0'
    alert_rule_test:
      - eval_time: 10m
        alertname: etcdHighNumberOfLeaderChanges
        exp_alerts:
          - exp_labels:
              job: etcd
              severity: warning
            exp_annotations:
              message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
  - interval: 1m
    input_series:
      - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
        values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0'
      - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
        values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
      - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
        values: '0 0 0 0 0 0 0 0'
    alert_rule_test:
      - eval_time: 10m
        alertname: etcdHighNumberOfLeaderChanges
        exp_alerts: