diff --git a/CHANGELOG-3.5.md b/CHANGELOG-3.5.md index 2bed69fd5..0800b0827 100644 --- a/CHANGELOG-3.5.md +++ b/CHANGELOG-3.5.md @@ -134,6 +134,7 @@ Note that any `etcd_debugging_*` metrics are experimental and subject to change. - Improve [mvcc.watchResponse channel Memory Usage](https://github.com/etcd-io/etcd/pull/11987). - Log [expensive request info in UnaryInterceptor](https://github.com/etcd-io/etcd/pull/12086). - [Fix invalid Go type in etcdserverpb](https://github.com/etcd-io/etcd/pull/12000). +- [Improve healthcheck by using v3 range request and its corresponding timeout](https://github.com/etcd-io/etcd/pull/12195). ### Package `embed` diff --git a/embed/etcd.go b/embed/etcd.go index 815468e1b..ec86319f0 100644 --- a/embed/etcd.go +++ b/embed/etcd.go @@ -632,6 +632,7 @@ func (e *Etcd) serveClients() (err error) { } else { mux := http.NewServeMux() etcdhttp.HandleBasic(e.cfg.logger, mux, e.Server) + etcdhttp.HandleMetricsHealthForV3(e.cfg.logger, mux, e.Server) h = mux } @@ -666,7 +667,7 @@ func (e *Etcd) serveMetrics() (err error) { if len(e.cfg.ListenMetricsUrls) > 0 { metricsMux := http.NewServeMux() - etcdhttp.HandleMetricsHealth(e.cfg.logger, metricsMux, e.Server) + etcdhttp.HandleMetricsHealthForV3(e.cfg.logger, metricsMux, e.Server) for _, murl := range e.cfg.ListenMetricsUrls { tlsInfo := &e.cfg.ClientTLSInfo diff --git a/etcdserver/api/etcdhttp/base.go b/etcdserver/api/etcdhttp/base.go index 991e759f5..5a1dfa64d 100644 --- a/etcdserver/api/etcdhttp/base.go +++ b/etcdserver/api/etcdhttp/base.go @@ -37,12 +37,7 @@ const ( // HandleBasic adds handlers to a mux for serving JSON etcd client requests // that do not access the v2 store. 
func HandleBasic(lg *zap.Logger, mux *http.ServeMux, server etcdserver.ServerPeer) { - if lg == nil { - lg = zap.NewNop() - } mux.HandleFunc(varsPath, serveVars) - - HandleMetricsHealth(lg, mux, server) mux.HandleFunc(versionPath, versionHandler(server.Cluster(), serveVersion)) } diff --git a/etcdserver/api/etcdhttp/metrics.go b/etcdserver/api/etcdhttp/metrics.go index 84d27c791..96fd189d5 100644 --- a/etcdserver/api/etcdhttp/metrics.go +++ b/etcdserver/api/etcdhttp/metrics.go @@ -17,6 +17,7 @@ package etcdhttp import ( "context" "encoding/json" + "fmt" "net/http" "time" @@ -38,7 +39,14 @@ const ( // HandleMetricsHealth registers metrics and health handlers. func HandleMetricsHealth(lg *zap.Logger, mux *http.ServeMux, srv etcdserver.ServerV2) { mux.Handle(PathMetrics, promhttp.Handler()) - mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkHealth(lg, srv) })) + mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV2Health(lg, srv) })) +} + +// HandleMetricsHealthForV3 registers metrics and health handlers. It checks health by using a v3 range request +// and its corresponding timeout. +func HandleMetricsHealthForV3(lg *zap.Logger, mux *http.ServeMux, srv *etcdserver.EtcdServer) { + mux.Handle(PathMetrics, promhttp.Handler()) + mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV3Health(lg, srv) })) } // HandlePrometheus registers prometheus handler on '/metrics'. 
@@ -56,6 +64,13 @@ func NewHealthHandler(lg *zap.Logger, hfunc func() Health) http.HandlerFunc { return } h := hfunc() + defer func() { + if h.Health == "true" { + healthSuccess.Inc() + } else { + healthFailed.Inc() + } + }() d, _ := json.Marshal(h) if h.Health != "true" { http.Error(w, string(d), http.StatusServiceUnavailable) @@ -97,17 +112,9 @@ type Health struct { // TODO: server NOSPACE, etcdserver.ErrNoLeader in health API -func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) { +func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health { + h := Health{} h.Health = "true" - - defer func() { - if h.Health == "true" { - healthSuccess.Inc() - } else { - healthFailed.Inc() - } - }() - as := srv.Alarms() if len(as) > 0 { h.Health = "false" @@ -122,25 +129,48 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) { } lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String())) } - return + return h } if uint64(srv.Leader()) == raft.None { h.Health = "false" h.Reason = "RAFT NO LEADER" lg.Warn("serving /health false; no leader") + return h + } + return h +} + +func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) { + if h = checkHealth(lg, srv); h.Health != "true" { return } - ctx, cancel := context.WithTimeout(context.Background(), time.Second) _, err := srv.Do(ctx, etcdserverpb.Request{Method: "QGET"}) cancel() if err != nil { h.Health = "false" - h.Reason = "QGET ERROR" + h.Reason = fmt.Sprintf("QGET ERROR:%s", err) lg.Warn("serving /health false; QGET fails", zap.Error(err)) + return + } + lg.Info("serving /health true") + return +} + +func checkV3Health(lg *zap.Logger, srv *etcdserver.EtcdServer) (h Health) { + if h = checkHealth(lg, srv); h.Health != "true" { + return + } + ctx, cancel := context.WithTimeout(context.Background(), srv.Cfg.ReqTimeout()) + _, err := srv.Range(ctx, &etcdserverpb.RangeRequest{KeysOnly: true, Limit: 1}) + cancel() + if err != nil { + h.Health = "false" + 
h.Reason = fmt.Sprintf("RANGE ERROR:%s", err) + lg.Warn("serving /health false; Range fails", zap.Error(err)) + return } - lg.Info("serving /health true") return } diff --git a/etcdserver/api/v2http/client.go b/etcdserver/api/v2http/client.go index 5eabab906..d758bd49c 100644 --- a/etcdserver/api/v2http/client.go +++ b/etcdserver/api/v2http/client.go @@ -58,6 +58,7 @@ func NewClientHandler(lg *zap.Logger, server etcdserver.ServerPeer, timeout time } mux := http.NewServeMux() etcdhttp.HandleBasic(lg, mux, server) + etcdhttp.HandleMetricsHealth(lg, mux, server) handleV2(lg, mux, server, timeout) return requestLogger(lg, mux) }