Documentation/etcd-mixin: Fix etcdHighNumberOfLeaderChanges (#11448)

The `etcdHighNumberOfLeaderChanges` alert had a copy-and-paste
error when it was converted from docs to mixin in #10244 - we moved
from "increase over 15m > 3" to "rate over 15m > 3", which is not
the same (rate is measured per second, so it should have been
"rate over 15m > (3 / 60 / 15)"). As part of fixing that, we
need to handle the cases where prometheus starts, or where a new
etcd cluster is first scraped, with an already-high leader change
count - i.e. if you start a new etcd cluster and at the moment
prometheus first scrapes it you are already at 5 leader changes,
we should fire on that transition.

This alert is also now more responsive, so if you get a quick
burst of 3 leader changes we'll alert within 5m rather than 15m.
release-3.5
Clayton Coleman 2019-12-13 19:00:11 -05:00 committed by Xiang Li
parent 7f3dd59d22
commit 322c38e169
2 changed files with 33 additions and 3 deletions

View File

@ -57,14 +57,14 @@
{
alert: 'etcdHighNumberOfLeaderChanges',
expr: |||
rate(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3
increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3
||| % $._config,
'for': '15m',
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes.',
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
},
},
{

View File

@ -83,3 +83,33 @@ tests:
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (1).'
- interval: 1m
input_series:
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0'
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
values: '0 0 0 0 0 0 0 0'
alert_rule_test:
- eval_time: 10m
alertname: etcdHighNumberOfLeaderChanges
exp_alerts:
- exp_labels:
job: etcd
severity: warning
exp_annotations:
message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
- interval: 1m
input_series:
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
values: '0 0 0 0 0 0 0 0'
alert_rule_test:
- eval_time: 10m
alertname: etcdHighNumberOfLeaderChanges
exp_alerts: