raft: use HistogramVec for message_sent_latency
parent c1e4e647eb
commit 964f6050ee
@@ -24,7 +24,7 @@ etcd now exposes the following metrics:

 High file descriptor usage (`file_descriptors_used_total` near the process's file descriptor limit) indicates potential file descriptor exhaustion, which might cause etcd to fail to create new WAL files and panic.

-[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you an histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.

 Pending proposals (`pending_proposal_total`) give you an idea of how many proposals are queued and waiting for commit. An increasing pending count indicates a high client load or an unstable cluster.
@@ -82,13 +82,13 @@ Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indi

 ### rafthttp

-| Name                              | Description                                | Type    | Labels                         |
-|-----------------------------------|--------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds | The latency distributions of messages sent | Summary | sendingType, msgType, remoteID |
-| message_sent_failed_total         | The total number of failed messages sent   | Summary | sendingType, msgType, remoteID |
+| Name                              | Description                                | Type         | Labels                         |
+|-----------------------------------|--------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds      | The latency distributions of messages sent | HistogramVec | sendingType, msgType, remoteID |
+| message_sent_failed_total         | The total number of failed messages sent   | Summary      | sendingType, msgType, remoteID |

-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.

 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
@@ -23,12 +23,17 @@ import (
 )

 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// snapshot sending metric. snapshot can be large and
+	// take a long time to send. So it needs a different
+	// time range than other type of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)
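The bucket layout starts at 0.5 ms and doubles twelve times, topping out at 2.048 s; anything slower lands in the implicit +Inf bucket, which is exactly why the TODO above singles out snapshots, since sending one can take far longer than two seconds. A quick sketch to print the boundaries produced by `prometheus.ExponentialBuckets(0.0005, 2, 13)`:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Prints 0.0005, 0.001, 0.002, ..., 1.024, 2.048 (upper bounds in seconds).
	for _, upper := range prometheus.ExponentialBuckets(0.0005, 2, 13) {
		fmt.Printf("%gs\n", upper)
	}
}
```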
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }

 func reportSentFailure(sendingType string, m raftpb.Message) {
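The observation change is worth spelling out: the old code truncated to whole microseconds with integer division, while the new expression divides the raw nanosecond count as a float, yielding seconds (the Prometheus base unit) and matching `time.Duration`'s `Seconds` method. A small sketch of the two values side by side, using a made-up sample duration:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	d := 1500 * time.Microsecond // hypothetical send latency

	// Old observed value: whole microseconds; integer division truncates.
	oldVal := float64(d.Nanoseconds() / int64(time.Microsecond)) // 1500

	// New observed value: seconds as a float, identical to d.Seconds().
	newVal := float64(d) / float64(time.Second) // 0.0015

	fmt.Println(oldVal, newVal, d.Seconds())
}
```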