etcdserver: add OS level FD metrics

Similar counts are exposed via Prometheus.
This adds the one that are perceived by etcd server.

e.g.

os_fd_limit 120000
os_fd_used 14
process_cpu_seconds_total 0.31
process_max_fds 120000
process_open_fds 17

Signed-off-by: Gyuho Lee <leegyuho@amazon.com>
release-3.5
Gyuho Lee 2020-08-12 10:23:22 -07:00
parent 53fdcdc5a2
commit 421df2ecbb
1 changed files with 17 additions and 1 deletions

View File

@ -151,6 +151,19 @@ var (
Help: "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
},
[]string{"server_id"})
fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "os",
Subsystem: "fd",
Name: "used",
Help: "The number of used file descriptors.",
})
fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "os",
Subsystem: "fd",
Name: "limit",
Help: "The file descriptor limit.",
})
)
func init() {
@ -174,6 +187,8 @@ func init() {
prometheus.MustRegister(isLearner)
prometheus.MustRegister(learnerPromoteSucceed)
prometheus.MustRegister(learnerPromoteFailed)
prometheus.MustRegister(fdUsed)
prometheus.MustRegister(fdLimit)
currentVersion.With(prometheus.Labels{
"server_version": version.Version,
@ -184,7 +199,6 @@ func init() {
}
func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
// This ticker will check File Descriptor Requirements ,and count all fds in used.
// And recorded some logs when in used >= limit/5*4. Just recorded message.
// If fds was more than 10K,It's low performance due to FDUsage() works.
@ -198,11 +212,13 @@ func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
lg.Warn("failed to get file descriptor usage", zap.Error(err))
return
}
fdUsed.Set(float64(used))
limit, err := runtime.FDLimit()
if err != nil {
lg.Warn("failed to get file descriptor limit", zap.Error(err))
return
}
fdLimit.Set(float64(limit))
if used >= limit/5*4 {
lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit))
}