etcdserver: Improve some debug metrics.
The _total suffix is by convention for counters, don't use it on a gauge. Clarify help string. Tweak metric name so it'll sort with related metrics, and be a little more understandable. Remove open file descriptor metric, as Prometheus client_golang provides that out of the box as process_open_fds which is also more up to date. Both only support Linux, so there's no loss of platform support. Fixes #5229
release-3.0
parent
00d6f104b5
commit
ea1d0f3e0d
|
@ -79,16 +79,13 @@ The metrics under the `etcd_debugging` prefix are for debugging. They are very i
|
||||||
|
|
||||||
| Name | Description | Type |
|
| Name | Description | Type |
|
||||||
|-----------------------------------------|--------------------------------------------------|-----------|
|
|-----------------------------------------|--------------------------------------------------|-----------|
|
||||||
| file_descriptors_used_total | The total number of file descriptors used | Gauge |
|
|
||||||
| proposal_durations_seconds | The latency distributions of committing proposal | Histogram |
|
| proposal_durations_seconds | The latency distributions of committing proposal | Histogram |
|
||||||
| pending_proposal_total | The total number of pending proposals | Gauge |
|
| proposals_pending | The current number of pending proposals | Gauge |
|
||||||
| proposal_failed_total | The total number of failed proposals | Counter |
|
| proposal_failed_total | The total number of failed proposals | Counter |
|
||||||
|
|
||||||
Heavy file descriptor (`file_descriptors_used_total`) usage (i.e., near the process's file descriptor limit) indicates a potential file descriptor exhaustion issue. If the file descriptors are exhausted, etcd may panic because it cannot create new WAL files.
|
|
||||||
|
|
||||||
[Proposal][glossary-proposal] durations (`proposal_durations_seconds`) provides a proposal commit latency histogram. The reported latency reflects network and disk IO delays in etcd.
|
[Proposal][glossary-proposal] durations (`proposal_durations_seconds`) provides a proposal commit latency histogram. The reported latency reflects network and disk IO delays in etcd.
|
||||||
|
|
||||||
Pending proposal (`pending_proposal_total`) indicates how many proposals are queued for commit. A rising pending proposal total suggests there is a high client load or the cluster is unstable.
|
Proposals pending (`proposals_pending`) indicates how many proposals are queued for commit. A rising number of pending proposals suggests there is a high client load or the cluster is unstable.
|
||||||
|
|
||||||
Failed proposals (`proposal_failed_total`) are normally related to two issues: temporary failures related to a leader election or longer duration downtime caused by a loss of quorum in the cluster.
|
Failed proposals (`proposal_failed_total`) are normally related to two issues: temporary failures related to a leader election or longer duration downtime caused by a loss of quorum in the cluster.
|
||||||
|
|
||||||
|
@ -127,6 +124,17 @@ Label `msgType` is the type of raft message. `MsgApp` is log replication message
|
||||||
|
|
||||||
Label `remoteID` is the member ID of the message destination.
|
Label `remoteID` is the member ID of the message destination.
|
||||||
|
|
||||||
|
## Prometheus supplied metrics
|
||||||
|
|
||||||
|
The Prometheus client library provides a number of metrics under the `go` and `process` namespaces. There are a few that are particularly interesting.
|
||||||
|
|
||||||
|
| Name | Description | Type |
|
||||||
|
|-----------------------------------|--------------------------------------------|--------------|
|
||||||
|
| process_open_fds | Number of open file descriptors. | Gauge |
|
||||||
|
| process_max_fds | Maximum number of open file descriptors. | Gauge |
|
||||||
|
|
||||||
|
Heavy file descriptor (`process_open_fds`) usage (i.e., near the process's file descriptor limit, `process_max_fds`) indicates a potential file descriptor exhaustion issue. If the file descriptors are exhausted, etcd may panic because it cannot create new WAL files.
|
||||||
|
|
||||||
[glossary-proposal]: glossary.md#proposal
|
[glossary-proposal]: glossary.md#proposal
|
||||||
[prometheus]: http://prometheus.io/
|
[prometheus]: http://prometheus.io/
|
||||||
[prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/
|
[prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/
|
||||||
|
|
|
@ -33,8 +33,8 @@ var (
|
||||||
proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
|
proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
Namespace: "etcd_debugging",
|
Namespace: "etcd_debugging",
|
||||||
Subsystem: "server",
|
Subsystem: "server",
|
||||||
Name: "pending_proposal_total",
|
Name: "proposals_pending",
|
||||||
Help: "The total number of pending proposals.",
|
Help: "The current number of pending proposals.",
|
||||||
})
|
})
|
||||||
// This is number of proposal failed in client's view.
|
// This is number of proposal failed in client's view.
|
||||||
// The proposal might be later got committed in raft.
|
// The proposal might be later got committed in raft.
|
||||||
|
@ -44,20 +44,12 @@ var (
|
||||||
Name: "proposal_failed_total",
|
Name: "proposal_failed_total",
|
||||||
Help: "The total number of failed proposals.",
|
Help: "The total number of failed proposals.",
|
||||||
})
|
})
|
||||||
|
|
||||||
fileDescriptorUsed = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
||||||
Namespace: "etcd_debugging",
|
|
||||||
Subsystem: "server",
|
|
||||||
Name: "file_descriptors_used_total",
|
|
||||||
Help: "The total number of file descriptors used.",
|
|
||||||
})
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
prometheus.MustRegister(proposeDurations)
|
prometheus.MustRegister(proposeDurations)
|
||||||
prometheus.MustRegister(proposePending)
|
prometheus.MustRegister(proposePending)
|
||||||
prometheus.MustRegister(proposeFailed)
|
prometheus.MustRegister(proposeFailed)
|
||||||
prometheus.MustRegister(fileDescriptorUsed)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func monitorFileDescriptor(done <-chan struct{}) {
|
func monitorFileDescriptor(done <-chan struct{}) {
|
||||||
|
@ -69,7 +61,6 @@ func monitorFileDescriptor(done <-chan struct{}) {
|
||||||
plog.Errorf("cannot monitor file descriptor usage (%v)", err)
|
plog.Errorf("cannot monitor file descriptor usage (%v)", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
fileDescriptorUsed.Set(float64(used))
|
|
||||||
limit, err := runtime.FDLimit()
|
limit, err := runtime.FDLimit()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
plog.Errorf("cannot monitor file descriptor usage (%v)", err)
|
plog.Errorf("cannot monitor file descriptor usage (%v)", err)
|
||||||
|
|
Loading…
Reference in New Issue