Documentation/etcd-mixin: Fix etcdHighNumberOfLeaderChanges (#11448)
The `etcdHighNumberOfLeaderChanges` alert had a copy-and-paste error when it was converted from docs to mixin in #10244: we moved from "increase over 15m > 3" to "rate over 15m > 3", which is not the same (rate is measured per second, so it should have been "rate over 15m > (3 / 60 / 15)"). As part of fixing that, we need to capture when Prometheus starts or when new etcd clusters are first scraped with a high leader change count - i.e. if you start a new etcd cluster and at the moment Prometheus first scrapes it you are already at 5 leader changes, we should fire on that transition. This alert is also now more responsive, so if you get a quick burst of 3 leader changes we'll alert within 5m rather than 15m.
release-3.5
parent
7f3dd59d22
commit
322c38e169
|
@ -57,14 +57,14 @@
|
||||||
{
|
{
|
||||||
alert: 'etcdHighNumberOfLeaderChanges',
|
alert: 'etcdHighNumberOfLeaderChanges',
|
||||||
expr: |||
|
expr: |||
|
||||||
rate(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3
|
increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '15m',
|
'for': '5m',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes.',
|
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -83,3 +83,33 @@ tests:
|
||||||
severity: critical
|
severity: critical
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: 'etcd cluster "etcd": members are down (1).'
|
message: 'etcd cluster "etcd": members are down (1).'
|
||||||
|
- interval: 1m
|
||||||
|
input_series:
|
||||||
|
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
|
||||||
|
values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0'
|
||||||
|
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
|
||||||
|
values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||||
|
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
|
||||||
|
values: '0 0 0 0 0 0 0 0'
|
||||||
|
alert_rule_test:
|
||||||
|
- eval_time: 10m
|
||||||
|
alertname: etcdHighNumberOfLeaderChanges
|
||||||
|
exp_alerts:
|
||||||
|
- exp_labels:
|
||||||
|
job: etcd
|
||||||
|
severity: warning
|
||||||
|
exp_annotations:
|
||||||
|
message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||||
|
- interval: 1m
|
||||||
|
input_series:
|
||||||
|
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
|
||||||
|
values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||||
|
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
|
||||||
|
values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||||
|
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
|
||||||
|
values: '0 0 0 0 0 0 0 0'
|
||||||
|
alert_rule_test:
|
||||||
|
- eval_time: 10m
|
||||||
|
alertname: etcdHighNumberOfLeaderChanges
|
||||||
|
exp_alerts:
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue