Merge pull request #12177 from ironcladlou/etcdmembersdown-tweak
Documentation: Further improve etcdMembersDown alert
release-3.5
commit
1af6d61a1c
|
@ -7,6 +7,9 @@
|
||||||
// instances are deployed on K8s, you will likely want to change
|
// instances are deployed on K8s, you will likely want to change
|
||||||
// this to 'instance, pod'.
|
// this to 'instance, pod'.
|
||||||
etcd_instance_labels: 'instance',
|
etcd_instance_labels: 'instance',
|
||||||
|
// scrape_interval_seconds is the global scrape interval which can be
|
||||||
|
// used to dynamically adjust rate windows as a function of the interval.
|
||||||
|
scrape_interval_seconds: 30,
|
||||||
},
|
},
|
||||||
|
|
||||||
prometheusAlerts+:: {
|
prometheusAlerts+:: {
|
||||||
|
@ -21,12 +24,12 @@
|
||||||
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
|
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
|
||||||
or
|
or
|
||||||
count without (To) (
|
count without (To) (
|
||||||
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01
|
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
> 0
|
> 0
|
||||||
||| % $._config,
|
||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4},
|
||||||
'for': '3m',
|
'for': '10m',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
|
|
|
@ -17,16 +17,16 @@ tests:
|
||||||
alertname: etcdInsufficientMembers
|
alertname: etcdInsufficientMembers
|
||||||
- eval_time: 5m
|
- eval_time: 5m
|
||||||
alertname: etcdInsufficientMembers
|
alertname: etcdInsufficientMembers
|
||||||
- eval_time: 5m
|
- eval_time: 12m
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
- eval_time: 7m
|
- eval_time: 14m
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
job: etcd
|
job: etcd
|
||||||
severity: critical
|
severity: critical
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: 'etcd cluster "etcd": members are down (1).'
|
message: 'etcd cluster "etcd": members are down (3).'
|
||||||
- eval_time: 7m
|
- eval_time: 7m
|
||||||
alertname: etcdInsufficientMembers
|
alertname: etcdInsufficientMembers
|
||||||
- eval_time: 11m
|
- eval_time: 11m
|
||||||
|
@ -49,33 +49,31 @@ tests:
|
||||||
- interval: 1m
|
- interval: 1m
|
||||||
input_series:
|
input_series:
|
||||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||||
- series: 'up{job="etcd",instance="10.10.10.2"}'
|
- series: 'up{job="etcd",instance="10.10.10.2"}'
|
||||||
values: '1 1 1 1 0 0 0 0'
|
values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||||
alert_rule_test:
|
alert_rule_test:
|
||||||
- eval_time: 10m
|
- eval_time: 14m
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
job: etcd
|
job: etcd
|
||||||
severity: critical
|
severity: critical
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: 'etcd cluster "etcd": members are down (2).'
|
message: 'etcd cluster "etcd": members are down (3).'
|
||||||
|
|
||||||
- interval: 1m
|
- interval: 1m
|
||||||
input_series:
|
input_series:
|
||||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
||||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||||
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
|
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
|
||||||
values: '0 0 1 2 3 4 5 6 7 8 9 10'
|
values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18'
|
||||||
alert_rule_test:
|
alert_rule_test:
|
||||||
- eval_time: 4m
|
- eval_time: 13m
|
||||||
alertname: etcdMembersDown
|
|
||||||
- eval_time: 6m
|
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
|
|
Loading…
Reference in New Issue