// etcd/contrib/mixin/mixin.libsonnet
//
// 1446 lines
// 47 KiB
// Plaintext

{
// Mixin-wide settings. The alert rules and the dashboard in this file read
// them through $._config.
_config+:: {
// etcd_selector is the PromQL label matcher interpolated into alert
// expressions to select the etcd series.
etcd_selector: 'job=~".*etcd.*"',
// etcd_instance_labels are the label names that uniquely identify an
// instance and need to be aggregated away for alerts that are about an
// etcd cluster as a whole. For example, if etcd instances are deployed
// on K8s, you will likely want to change this to 'instance, pod'.
etcd_instance_labels: 'instance',
// scrape_interval_seconds is the global scrape interval, which can be
// used to dynamically adjust rate windows as a function of the interval.
scrape_interval_seconds: 30,
// Dashboard variable refresh option on Grafana (https://grafana.com/docs/grafana/latest/datasources/prometheus/).
// 0 : Never (dashboard variable values are never refreshed)
// 1 : On Dashboard Load (dashboard variables are refreshed when the dashboard is loaded)
// 2 : On Time Range Change (dashboard variables are refreshed when the time range changes)
dashboard_var_refresh: 2,
// clusterLabel is the label used to identify a cluster. It is interpolated
// into alert descriptions and into the dashboard queries and variable.
clusterLabel: 'job',
},
prometheusAlerts+:: {
groups+: [
{
name: 'etcd',
rules: [
// Critical alert, held for 10m: per cluster (instance labels aggregated
// away), either at least one scrape target has `up == 0`, or at least one
// peer (`To`) shows a send-failure rate above 0.01/s. The failure-rate
// window is four scrape intervals (network_failure_range).
{
alert: 'etcdMembersDown',
expr: |||
max without (endpoint) (
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
or
count without (To) (
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
)
)
> 0
||| % { etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds * 4 },
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": members are down ({{ $value }}).' % $._config.clusterLabel,
summary: 'etcd cluster members are down.',
},
},
// Critical alert, held for 3m: per cluster, the number of targets with
// `up == 1` is below (number of targets + 1) / 2.
{
alert: 'etcdInsufficientMembers',
expr: |||
sum(up{%(etcd_selector)s} == bool 1) without (%(etcd_instance_labels)s) < ((count(up{%(etcd_selector)s}) without (%(etcd_instance_labels)s) + 1) / 2)
||| % $._config,
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": insufficient members ({{ $value }}).' % $._config.clusterLabel,
summary: 'etcd cluster has insufficient number of members.',
},
},
// Critical alert, held for 1m: a member reports etcd_server_has_leader == 0.
// Evaluated per series, so the description can name the instance.
{
alert: 'etcdNoLeader',
expr: |||
etcd_server_has_leader{%(etcd_selector)s} == 0
||| % $._config,
'for': '1m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": member {{ $labels.instance }} has no leader.' % $._config.clusterLabel,
summary: 'etcd cluster has no leader.',
},
},
// Warning alert, held for 5m: the per-cluster maximum of
// etcd_server_leader_changes_seen_total increased by 4 or more over a
// 15m subquery sampled every 1m. The `or 0*absent(...)` branch supplies a
// 0 sample when the metric is absent.
{
alert: 'etcdHighNumberOfLeaderChanges',
expr: |||
increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4
||| % $._config,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' % $._config.clusterLabel,
summary: 'etcd cluster has high number of leader changes.',
},
},
// Warning tier of the failed-gRPC alert, held for 10m: more than 1% of
// handled requests (5m rate) finished with one of the listed error codes.
// Aggregation drops only grpc_type and grpc_code, so the result stays per
// remaining label set (the description uses grpc_method and instance).
// `%%` in the description is a literal percent sign escaped for the `%` format.
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
> 1
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
summary: 'etcd cluster has high number of failed grpc requests.',
},
},
// Critical tier of the failed-gRPC alert, held for 5m: same ratio as the
// warning tier, but the threshold is 5% of handled requests.
// `%%` in the description is a literal percent sign escaped for the `%` format.
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
> 5
||| % $._config,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
summary: 'etcd cluster has high number of failed grpc requests.',
},
},
// Critical alert, held for 10m: the 0.99 quantile of unary gRPC handling
// time (5m rate, Defragment excluded) is above 0.15. Only grpc_type is
// aggregated away, so the quantile stays per remaining label set.
{
alert: 'etcdGRPCRequestsSlow',
expr: |||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.' % $._config.clusterLabel,
summary: 'etcd grpc requests are slow',
},
},
// Warning alert, held for 10m: the 0.99 quantile of the peer round-trip
// time histogram (5m rate) is above 0.15. No aggregation is applied, so
// the result keeps the series labels (the description uses To and instance).
{
alert: 'etcdMemberCommunicationSlow',
expr: |||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m]))
> 0.15
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
summary: 'etcd cluster member communication is slow.',
},
},
// Warning alert, held for 15m: the per-second rate of failed proposals,
// averaged over a 15m window, is above 5 on a member.
//
// The description reports `{{ $value }}` as what the expression actually
// yields (a per-second rate over 15m). It previously called the value a
// count of failures "within the last 30 minutes", which rate() does not
// produce.
// NOTE(review): the expression and threshold are unchanged. Confirm that a
// threshold of 5 failures per second (rather than 5 failures per window,
// which would need increase()) is what is intended.
{
  alert: 'etcdHighNumberOfFailedProposals',
  expr: |||
    rate(etcd_server_proposals_failed_total{%(etcd_selector)s}[15m]) > 5
  ||| % $._config,
  'for': '15m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} proposal failures per second (15 minute rate) on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
    summary: 'etcd cluster has high number of proposal failures.',
  },
},
// Warning tier of the WAL fsync alert, held for 10m: the 0.99 quantile of
// etcd_disk_wal_fsync_duration_seconds (5m rate) is above 0.5, per series.
{
alert: 'etcdHighFsyncDurations',
expr: |||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
> 0.5
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
summary: 'etcd cluster 99th percentile fsync durations are too high.',
},
},
// Critical tier of the WAL fsync alert, held for 10m: same quantile as the
// warning tier, but the threshold is 1.
{
alert: 'etcdHighFsyncDurations',
expr: |||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
> 1
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
summary: 'etcd cluster 99th percentile fsync durations are too high.',
},
},
// Warning alert, held for 10m: the 0.99 quantile of
// etcd_disk_backend_commit_duration_seconds (5m rate) is above 0.25, per series.
{
alert: 'etcdHighCommitDurations',
expr: |||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m]))
> 0.25
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
summary: 'etcd cluster 99th percentile commit durations are too high.',
},
},
// Critical alert, held for 10m: the most recent database size (within 5m)
// is more than 95% of the most recent backend quota (within 5m), per series.
//
// Both metrics are restricted with %(etcd_selector)s, like every other rule
// in this group. The expression was already formatted with $._config but
// contained no placeholder, so a customised etcd_selector was ignored and
// the rule matched these metric names from any job.
{
  alert: 'etcdDatabaseQuotaLowSpace',
  expr: |||
    (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 95
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'critical',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel,
    summary: 'etcd cluster database is running full.',
  },
},
// Warning alert, held for 10m: a linear extrapolation of the database size
// over the last 4h, projected 4h (4*60*60 seconds) ahead, exceeds the
// backend quota, per series.
//
// Both metrics are restricted with %(etcd_selector)s, like every other rule
// in this group. The expression was already formatted with $._config but
// contained no placeholder, so a customised etcd_selector was ignored.
{
  alert: 'etcdExcessiveDatabaseGrowth',
  expr: |||
    predict_linear(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[4h], 4*60*60) > etcd_server_quota_backend_bytes{%(etcd_selector)s}
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.' % $._config.clusterLabel,
    summary: 'etcd cluster database growing very fast.',
  },
},
// Warning alert, held for 10m: the in-use size is less than half of the
// total database size, and the in-use size is above 104857600 bytes
// (100 MiB), so small databases do not trigger it.
//
// All three selectors are restricted with %(etcd_selector)s, like every
// other rule in this group. The expression was already formatted with
// $._config but contained no placeholder, so a customised etcd_selector
// was ignored.
{
  alert: 'etcdDatabaseHighFragmentationRatio',
  expr: |||
    (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{%(etcd_selector)s} > 104857600
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.' % $._config.clusterLabel,
    summary: 'etcd database size in use is less than 50% of the actual allocated storage.',
    runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation',
  },
},
],
},
],
},
grafanaDashboards+:: {
'etcd.json': {
// Dashboard identity and display defaults. The uid is the MD5 of the
// dashboard file name, so it is stable across builds.
uid: std.md5('etcd.json'),
title: 'etcd',
description: 'etcd sample Grafana dashboard with Prometheus',
tags: ['etcd-mixin'],
style: 'dark',
timezone: 'browser',
editable: true,
hideControls: false,
sharedCrosshair: false,
rows: [
{
collapse: false,
editable: true,
height: '250px',
panels: [
// Singlestat panel "Up" (id 28, span 3): displays
// sum(etcd_server_has_leader) for the selected $cluster.
// Presumably this equals the number of members that currently see a leader
// (the metric is compared against 0 in the etcdNoLeader alert) — confirm.
{
cacheTimeout: null,
colorBackground: false,
colorValue: false,
colors: [
'rgba(245, 54, 54, 0.9)',
'rgba(237, 129, 40, 0.89)',
'rgba(50, 172, 45, 0.97)',
],
datasource: '$datasource',
editable: true,
'error': false,
format: 'none',
gauge: {
maxValue: 100,
minValue: 0,
show: false,
thresholdLabels: false,
thresholdMarkers: true,
},
id: 28,
interval: null,
isNew: true,
links: [],
mappingType: 1,
mappingTypes: [
{
name: 'value to text',
value: 1,
},
{
name: 'range to text',
value: 2,
},
],
maxDataPoints: 100,
nullPointMode: 'connected',
nullText: null,
postfix: '',
postfixFontSize: '50%',
prefix: '',
prefixFontSize: '50%',
// A null range is rendered as "N/A".
rangeMaps: [{
from: 'null',
text: 'N/A',
to: 'null',
}],
span: 3,
sparkline: {
fillColor: 'rgba(31, 118, 189, 0.18)',
full: false,
lineColor: 'rgb(31, 120, 193)',
show: false,
},
targets: [{
expr: 'sum(etcd_server_has_leader{%s="$cluster"})' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: '',
metric: 'etcd_server_has_leader',
refId: 'A',
step: 20,
}],
thresholds: '',
title: 'Up',
type: 'singlestat',
valueFontSize: '200%',
// A null value is rendered as "N/A".
valueMaps: [{
op: '=',
text: 'N/A',
value: 'null',
}],
valueName: 'avg',
},
// Graph panel "RPC Rate" (id 23, span 5): cluster-wide rate of started
// unary RPCs (target A) and of unary RPCs handled with one of the listed
// error codes (target B), for the selected $cluster.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 0,
id: 23,
isNew: true,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 5,
stack: false,
steppedLine: false,
targets: [
{
expr: 'sum(rate(grpc_server_started_total{%s="$cluster",grpc_type="unary"}[$__rate_interval]))' % $._config.clusterLabel,
format: 'time_series',
intervalFactor: 2,
legendFormat: 'RPC Rate',
metric: 'grpc_server_started_total',
refId: 'A',
step: 2,
},
{
// Same error-code list as the etcdHighNumberOfFailedGRPCRequests alerts.
expr: 'sum(rate(grpc_server_handled_total{%s="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[$__rate_interval]))' % $._config.clusterLabel,
format: 'time_series',
intervalFactor: 2,
legendFormat: 'RPC Failed Rate',
metric: 'grpc_server_handled_total',
refId: 'B',
step: 2,
},
],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'RPC Rate',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'ops',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
// Graph panel "Active Streams" (id 41, span 4, stacked): for the Watch and
// Lease services, plots started minus handled bidi_stream RPCs, i.e. the
// streams that have been started but not yet handled.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 0,
id: 41,
isNew: true,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 4,
stack: true,
steppedLine: false,
targets: [
{
// Formatted with the whole $._config object; only clusterLabel is referenced.
expr: 'sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})' % $._config,
intervalFactor: 2,
legendFormat: 'Watch Streams',
metric: 'grpc_server_handled_total',
refId: 'A',
step: 4,
},
{
expr: 'sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})' % $._config,
intervalFactor: 2,
legendFormat: 'Lease Streams',
metric: 'grpc_server_handled_total',
refId: 'B',
step: 4,
},
],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Active Streams',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'short',
label: '',
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
],
showTitle: false,
title: 'Row',
},
{
collapse: false,
editable: true,
height: '250px',
panels: [
// Graph panel "DB Size" (id 1, span 4): one series per instance of
// etcd_mvcc_db_total_size_in_bytes for the selected $cluster, on a bytes axis.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
decimals: null,
editable: true,
'error': false,
fill: 0,
grid: {},
id: 1,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 4,
stack: false,
steppedLine: false,
targets: [{
expr: 'etcd_mvcc_db_total_size_in_bytes{%s="$cluster"}' % $._config.clusterLabel,
hide: false,
interval: '',
intervalFactor: 2,
legendFormat: '{{instance}} DB Size',
metric: '',
refId: 'A',
step: 4,
}],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'DB Size',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'cumulative',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'bytes',
logBase: 1,
max: null,
min: null,
show: true,
},
// The right-hand axis is hidden on this panel.
{
format: 'short',
logBase: 1,
max: null,
min: null,
show: false,
},
],
},
// Graph panel "Disk Sync Duration" (id 3, span 4, stepped line): per
// instance 0.99 quantile of the WAL fsync histogram (target A) and of the
// backend commit histogram (target B, shown in the legend as "DB fsync").
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 0,
grid: {},
id: 3,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 1,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 4,
stack: false,
steppedLine: true,
targets: [
{
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{%s="$cluster"}[$__rate_interval])) by (instance, le))' % $._config.clusterLabel,
hide: false,
intervalFactor: 2,
legendFormat: '{{instance}} WAL fsync',
metric: 'etcd_disk_wal_fsync_duration_seconds_bucket',
refId: 'A',
step: 4,
},
{
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{%s="$cluster"}[$__rate_interval])) by (instance, le))' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: '{{instance}} DB fsync',
metric: 'etcd_disk_backend_commit_duration_seconds_bucket',
refId: 'B',
step: 4,
},
],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Disk Sync Duration',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'cumulative',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 's',
logBase: 1,
max: null,
min: null,
show: true,
},
// The right-hand axis is hidden on this panel.
{
format: 'short',
logBase: 1,
max: null,
min: null,
show: false,
},
],
},
// Graph panel "Memory" (id 29, span 4): one series per instance of
// process_resident_memory_bytes for the selected $cluster, on a bytes axis.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 0,
id: 29,
isNew: true,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 4,
stack: false,
steppedLine: false,
targets: [{
expr: 'process_resident_memory_bytes{%s="$cluster"}' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: '{{instance}} Resident Memory',
metric: 'process_resident_memory_bytes',
refId: 'A',
step: 4,
}],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Memory',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'bytes',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
],
title: 'New row',
},
{
collapse: false,
editable: true,
height: '250px',
panels: [
// Graph panel "Client Traffic In" (id 22, span 3, stacked, filled): rate of
// etcd_network_client_grpc_received_bytes_total per series for the selected
// $cluster, on a bytes-per-second axis.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 5,
id: 22,
isNew: true,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 3,
stack: true,
steppedLine: false,
targets: [{
expr: 'rate(etcd_network_client_grpc_received_bytes_total{%s="$cluster"}[$__rate_interval])' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: '{{instance}} Client Traffic In',
metric: 'etcd_network_client_grpc_received_bytes_total',
refId: 'A',
step: 4,
}],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Client Traffic In',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'Bps',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
// Graph panel "Client Traffic Out" (id 21, span 3, stacked, filled): rate of
// etcd_network_client_grpc_sent_bytes_total per series for the selected
// $cluster, on a bytes-per-second axis.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 5,
id: 21,
isNew: true,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 3,
stack: true,
steppedLine: false,
targets: [{
expr: 'rate(etcd_network_client_grpc_sent_bytes_total{%s="$cluster"}[$__rate_interval])' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: '{{instance}} Client Traffic Out',
metric: 'etcd_network_client_grpc_sent_bytes_total',
refId: 'A',
step: 4,
}],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Client Traffic Out',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'Bps',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
// Graph panel "Peer Traffic In" (id 20, span 3): rate of
// etcd_network_peer_received_bytes_total summed by instance for the
// selected $cluster, on a bytes-per-second axis.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 0,
id: 20,
isNew: true,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 3,
stack: false,
steppedLine: false,
targets: [{
expr: 'sum(rate(etcd_network_peer_received_bytes_total{%s="$cluster"}[$__rate_interval])) by (instance)' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: '{{instance}} Peer Traffic In',
metric: 'etcd_network_peer_received_bytes_total',
refId: 'A',
step: 4,
}],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Peer Traffic In',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'Bps',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
// Graph panel "Peer Traffic Out" (id 16, span 3): rate of
// etcd_network_peer_sent_bytes_total summed by instance for the selected
// $cluster, on a bytes-per-second axis.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
decimals: null,
editable: true,
'error': false,
fill: 0,
grid: {},
id: 16,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 3,
stack: false,
steppedLine: false,
targets: [{
expr: 'sum(rate(etcd_network_peer_sent_bytes_total{%s="$cluster"}[$__rate_interval])) by (instance)' % $._config.clusterLabel,
hide: false,
interval: '',
intervalFactor: 2,
legendFormat: '{{instance}} Peer Traffic Out',
metric: 'etcd_network_peer_sent_bytes_total',
refId: 'A',
step: 4,
}],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Peer Traffic Out',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'cumulative',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'Bps',
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
],
title: 'New row',
},
{
collapse: false,
editable: true,
height: '250px',
panels: [
// Graph panel "Raft Proposals" (id 40, span 6): cluster-wide sums for the
// selected $cluster of the proposal failure rate (A), the pending proposal
// gauge (B), the commit rate (C) and the apply rate (D).
{
aliasColors: {},
bars: false,
datasource: '$datasource',
editable: true,
'error': false,
fill: 0,
id: 40,
isNew: true,
legend: {
avg: false,
current: false,
max: false,
min: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 6,
stack: false,
steppedLine: false,
targets: [
{
expr: 'sum(rate(etcd_server_proposals_failed_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: 'Proposal Failure Rate',
metric: 'etcd_server_proposals_failed_total',
refId: 'A',
step: 2,
},
{
// Pending proposals are summed directly, without rate().
expr: 'sum(etcd_server_proposals_pending{%s="$cluster"})' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: 'Proposal Pending Total',
metric: 'etcd_server_proposals_pending',
refId: 'B',
step: 2,
},
{
expr: 'sum(rate(etcd_server_proposals_committed_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: 'Proposal Commit Rate',
metric: 'etcd_server_proposals_committed_total',
refId: 'C',
step: 2,
},
{
// NOTE(review): unlike targets A-C, this target has no `metric` key —
// confirm whether that omission is intentional.
expr: 'sum(rate(etcd_server_proposals_applied_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: 'Proposal Apply Rate',
refId: 'D',
step: 2,
},
],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Raft Proposals',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'short',
label: '',
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
// Graph panel "Total Leader Elections Per Day" (id 19, span 6): per series,
// the number of value changes of etcd_server_leader_changes_seen_total over
// a fixed 1d window, shown with 0 decimals.
{
aliasColors: {},
bars: false,
datasource: '$datasource',
decimals: 0,
editable: true,
'error': false,
fill: 0,
id: 19,
isNew: true,
legend: {
alignAsTable: false,
avg: false,
current: false,
max: false,
min: false,
rightSide: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
percentage: false,
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
span: 6,
stack: false,
steppedLine: false,
targets: [{
// The window is a fixed 1d, not $__rate_interval.
expr: 'changes(etcd_server_leader_changes_seen_total{%s="$cluster"}[1d])' % $._config.clusterLabel,
intervalFactor: 2,
legendFormat: '{{instance}} Total Leader Elections Per Day',
metric: 'etcd_server_leader_changes_seen_total',
refId: 'A',
step: 2,
}],
thresholds: [],
timeFrom: null,
timeShift: null,
title: 'Total Leader Elections Per Day',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
{
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
},
// Graph panel "Peer round trip time" (id 42): per instance 0.99 quantile of
// the peer round-trip time histogram for the selected $cluster, on a
// seconds axis.
// Unlike the other panels in this dashboard it carries gridPos, fieldConfig,
// options and pluginVersion fields and has no `span`.
{
aliasColors: {},
bars: false,
dashLength: 10,
dashes: false,
datasource: '$datasource',
decimals: 0,
editable: true,
'error': false,
fieldConfig: {
defaults: {
custom: {},
},
overrides: [],
},
fill: 0,
fillGradient: 0,
// NOTE(review): the surrounding dashboard lays panels out with rows/span;
// confirm whether this gridPos is honoured or is leftover from an export.
gridPos: {
h: 7,
w: 12,
x: 0,
y: 28,
},
hiddenSeries: false,
id: 42,
isNew: true,
legend: {
alignAsTable: false,
avg: false,
current: false,
max: false,
min: false,
rightSide: false,
show: false,
total: false,
values: false,
},
lines: true,
linewidth: 2,
links: [],
nullPointMode: 'connected',
options: {
alertThreshold: true,
},
percentage: false,
pluginVersion: '7.4.3',
pointradius: 5,
points: false,
renderer: 'flot',
seriesOverrides: [],
spaceLength: 10,
stack: false,
steppedLine: false,
targets: [
{
expr: 'histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{%s="$cluster"}[$__rate_interval])))' % $._config.clusterLabel,
interval: '',
intervalFactor: 2,
legendFormat: '{{instance}} Peer round trip time',
metric: 'etcd_network_peer_round_trip_time_seconds_bucket',
refId: 'A',
step: 2,
},
],
thresholds: [],
timeFrom: null,
timeRegions: [],
timeShift: null,
title: 'Peer round trip time',
tooltip: {
msResolution: false,
shared: true,
sort: 0,
value_type: 'individual',
},
type: 'graph',
xaxis: {
buckets: null,
mode: 'time',
name: null,
show: true,
values: [],
},
yaxes: [
{
// NOTE(review): the '$$hashKey' entries look like residue from a UI
// export rather than dashboard settings — confirm before removing.
'$$hashKey': 'object:925',
decimals: null,
format: 's',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
{
'$$hashKey': 'object:926',
format: 'short',
label: null,
logBase: 1,
max: null,
min: null,
show: true,
},
],
yaxis: {
align: false,
alignLevel: null,
},
},
],
title: 'New row',
},
],
// Default time range of the dashboard: the last 15 minutes up to now.
time: {
from: 'now-15m',
to: 'now',
},
// Choices offered by the time picker: auto-refresh intervals and
// quick time-range options.
timepicker: {
now: true,
refresh_intervals: [
'5s',
'10s',
'30s',
'1m',
'5m',
'15m',
'30m',
'1h',
'2h',
'1d',
],
time_options: [
'5m',
'15m',
'1h',
'6h',
'12h',
'24h',
'2d',
'7d',
'30d',
],
},
// Dashboard variables. Panels reference them as $datasource and $cluster.
templating: {
list: [
// $datasource: data source picker restricted to the 'prometheus' type.
{
current: {
text: 'Prometheus',
value: 'Prometheus',
},
hide: 0,
label: 'Data Source',
name: 'datasource',
options: [],
query: 'prometheus',
refresh: 1,
regex: '',
type: 'datasource',
},
// $cluster: single-value query variable filled with the values of the
// configured clusterLabel on etcd_server_has_leader. Its refresh mode
// comes from $._config.dashboard_var_refresh.
{
allValue: null,
current: {
text: 'prod',
value: 'prod',
},
datasource: '$datasource',
hide: 0,
includeAll: false,
label: 'cluster',
multi: false,
name: 'cluster',
options: [],
query: 'label_values(etcd_server_has_leader, %s)' % $._config.clusterLabel,
refresh: $._config.dashboard_var_refresh,
regex: '',
sort: 2,
tagValuesQuery: '',
tags: [],
tagsQuery: '',
type: 'query',
useTags: false,
},
],
},
// Remaining dashboard-level settings: no annotation queries, a 10s
// auto-refresh, and the schema/version numbers stored with the dashboard.
annotations: {
list: [],
},
refresh: '10s',
schemaVersion: 13,
version: 215,
links: [],
gnetId: null,
},
},
}