Merge pull request #12177 from ironcladlou/etcdmembersdown-tweak

Documentation: Further improve etcdMembersDown alert
release-3.5
Sam Batschelet 2020-07-31 15:06:52 -04:00 committed by GitHub
commit 1af6d61a1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 17 deletions

View File

@ -7,6 +7,9 @@
// instances are deployed on K8s, you will likely want to change // instances are deployed on K8s, you will likely want to change
// this to 'instance, pod'. // this to 'instance, pod'.
etcd_instance_labels: 'instance', etcd_instance_labels: 'instance',
// scrape_interval_seconds is the global scrape interval which can be
// used to dynamically adjust rate windows as a function of the interval.
scrape_interval_seconds: 30,
}, },
prometheusAlerts+:: { prometheusAlerts+:: {
@ -21,12 +24,12 @@
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0) sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
or or
count without (To) ( count without (To) (
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01 sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
) )
) )
> 0 > 0
||| % $._config, ||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4},
'for': '3m', 'for': '10m',
labels: { labels: {
severity: 'critical', severity: 'critical',
}, },

View File

@ -17,16 +17,16 @@ tests:
alertname: etcdInsufficientMembers alertname: etcdInsufficientMembers
- eval_time: 5m - eval_time: 5m
alertname: etcdInsufficientMembers alertname: etcdInsufficientMembers
- eval_time: 5m - eval_time: 12m
alertname: etcdMembersDown alertname: etcdMembersDown
- eval_time: 7m - eval_time: 14m
alertname: etcdMembersDown alertname: etcdMembersDown
exp_alerts: exp_alerts:
- exp_labels: - exp_labels:
job: etcd job: etcd
severity: critical severity: critical
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": members are down (1).' message: 'etcd cluster "etcd": members are down (3).'
- eval_time: 7m - eval_time: 7m
alertname: etcdInsufficientMembers alertname: etcdInsufficientMembers
- eval_time: 11m - eval_time: 11m
@ -49,33 +49,31 @@ tests:
- interval: 1m - interval: 1m
input_series: input_series:
- series: 'up{job="etcd",instance="10.10.10.0"}' - series: 'up{job="etcd",instance="10.10.10.0"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.1"}' - series: 'up{job="etcd",instance="10.10.10.1"}'
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.2"}' - series: 'up{job="etcd",instance="10.10.10.2"}'
values: '1 1 1 1 0 0 0 0' values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
alert_rule_test: alert_rule_test:
- eval_time: 10m - eval_time: 14m
alertname: etcdMembersDown alertname: etcdMembersDown
exp_alerts: exp_alerts:
- exp_labels: - exp_labels:
job: etcd job: etcd
severity: critical severity: critical
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": members are down (2).' message: 'etcd cluster "etcd": members are down (3).'
- interval: 1m - interval: 1m
input_series: input_series:
- series: 'up{job="etcd",instance="10.10.10.0"}' - series: 'up{job="etcd",instance="10.10.10.0"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.1"}' - series: 'up{job="etcd",instance="10.10.10.1"}'
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}' - series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
values: '0 0 1 2 3 4 5 6 7 8 9 10' values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18'
alert_rule_test: alert_rule_test:
- eval_time: 4m - eval_time: 13m
alertname: etcdMembersDown
- eval_time: 6m
alertname: etcdMembersDown alertname: etcdMembersDown
exp_alerts: exp_alerts:
- exp_labels: - exp_labels: