# Patch summary (commentary only; the diff itself starts at the "diff --git" line below).
#
# Documentation/etcd-mixin/mixin.libsonnet -- alert 'etcdHighNumberOfLeaderChanges':
#   * expr: replaces `rate(etcd_server_leader_changes_seen_total{...}[15m]) > 3` with
#     `increase((...)[15m:1m]) >= 3`, where the inner expression is `max by (job)` over
#     etcd_server_leader_changes_seen_total, `or`-ed with `0*absent(...)` of the same selector.
#   * 'for': shortened from '15m' to '5m'.
#   * message: no longer references {{ $labels.instance }} (the new expr aggregates by job),
#     says "last 15 minutes" instead of "last 30 minutes", and adds troubleshooting guidance.
#
# Documentation/etcd-mixin/test.yaml -- appends two test cases (interval 1m, evaluated at 10m)
# for alertname etcdHighNumberOfLeaderChanges:
#   * case 1 expects one alert with labels job=etcd, severity=warning and a message
#     reporting 3 leader changes.
#   * case 2 differs only in the first series' values and leaves exp_alerts empty.
#
# NOTE(review): the sample values fall back to 0 after rising (e.g. '0 0 2 0 0 1 ...');
#   presumably increase() is meant to read these drops as counter resets, giving
#   2 + 1 = 3 for case 1 and 2 for case 2 -- confirm.
# NOTE(review): the third input series in each case has 8 samples while the other two
#   have 16 -- confirm this is intentional.
# NOTE(review): `0*absent(...)` presumably supplies a zero sample while the metric is
#   missing; whether its label set lines up with the `max by (job)` result depends on
#   the configured etcd_selector -- confirm.
# NOTE(review): several diff lines appear to be joined onto single physical lines below;
#   the original line breaks and indentation cannot be recovered from this text, so the
#   content is left untouched -- restore it from the originating commit before applying.
diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet index 0653c8d0e..3d0c4b339 100644 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ b/Documentation/etcd-mixin/mixin.libsonnet @@ -57,14 +57,14 @@ { alert: 'etcdHighNumberOfLeaderChanges', expr: ||| - rate(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3 + increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3 ||| % $._config, - 'for': '15m', + 'for': '5m', labels: { severity: 'warning', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes.', + message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.', }, }, { diff --git a/Documentation/etcd-mixin/test.yaml b/Documentation/etcd-mixin/test.yaml index 408cfd972..56ee613b1 100644 --- a/Documentation/etcd-mixin/test.yaml +++ b/Documentation/etcd-mixin/test.yaml @@ -83,3 +83,33 @@ tests: severity: critical exp_annotations: message: 'etcd cluster "etcd": members are down (1).' 
+ - interval: 1m + input_series: + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' + values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' + values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdHighNumberOfLeaderChanges + exp_alerts: + - exp_labels: + job: etcd + severity: warning + exp_annotations: + message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + - interval: 1m + input_series: + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' + values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' + values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdHighNumberOfLeaderChanges + exp_alerts: +