fix(server): reduce the screaming heartbeat logs
Currently the only way we know that a peer isn't getting a heartbeat is an edge triggered event from go raft on every missed heartbeat. This means that we need to do some book keeping in order to do exponential backoff. The upside is that instead of screaming thousands of log lines before a machine hits the default removal of 30 minutes it is only ~100.release-0.4
parent
dfeecd2537
commit
1c958f8fc3
|
@ -23,6 +23,10 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
// MaxHeartbeatTimeoutBackoff is the maximum number of seconds before we warn
|
||||||
|
// the user again about a peer not accepting heartbeats.
|
||||||
|
MaxHeartbeatTimeoutBackoff = 15 * time.Second
|
||||||
|
|
||||||
// ThresholdMonitorTimeout is the time between log notifications that the
|
// ThresholdMonitorTimeout is the time between log notifications that the
|
||||||
// Raft heartbeat is too close to the election timeout.
|
// Raft heartbeat is too close to the election timeout.
|
||||||
ThresholdMonitorTimeout = 5 * time.Second
|
ThresholdMonitorTimeout = 5 * time.Second
|
||||||
|
@ -70,10 +74,18 @@ type PeerServer struct {
|
||||||
routineGroup sync.WaitGroup
|
routineGroup sync.WaitGroup
|
||||||
timeoutThresholdChan chan interface{}
|
timeoutThresholdChan chan interface{}
|
||||||
|
|
||||||
|
logBackoffs map[string]*logBackoff
|
||||||
|
|
||||||
metrics *metrics.Bucket
|
metrics *metrics.Bucket
|
||||||
sync.Mutex
|
sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type logBackoff struct {
|
||||||
|
next time.Time
|
||||||
|
backoff time.Duration
|
||||||
|
count int
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: find a good policy to do snapshot
|
// TODO: find a good policy to do snapshot
|
||||||
type snapshotConf struct {
|
type snapshotConf struct {
|
||||||
// Etcd will check if snapshot is need every checkingInterval
|
// Etcd will check if snapshot is need every checkingInterval
|
||||||
|
@ -97,6 +109,7 @@ func NewPeerServer(psConfig PeerServerConfig, client *Client, registry *Registry
|
||||||
serverStats: serverStats,
|
serverStats: serverStats,
|
||||||
|
|
||||||
timeoutThresholdChan: make(chan interface{}, 1),
|
timeoutThresholdChan: make(chan interface{}, 1),
|
||||||
|
logBackoffs: make(map[string]*logBackoff),
|
||||||
|
|
||||||
metrics: mb,
|
metrics: mb,
|
||||||
}
|
}
|
||||||
|
@ -687,11 +700,12 @@ func (s *PeerServer) raftEventLogger(event raft.Event) {
|
||||||
case raft.RemovePeerEventType:
|
case raft.RemovePeerEventType:
|
||||||
log.Infof("%s: peer removed: '%v'", s.Config.Name, value)
|
log.Infof("%s: peer removed: '%v'", s.Config.Name, value)
|
||||||
case raft.HeartbeatIntervalEventType:
|
case raft.HeartbeatIntervalEventType:
|
||||||
var name = "<unknown>"
|
peer, ok := value.(*raft.Peer);
|
||||||
if peer, ok := value.(*raft.Peer); ok {
|
if !ok {
|
||||||
name = peer.Name
|
log.Warnf("%s: heatbeat timeout from unknown peer", s.Config.Name)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
log.Infof("%s: warning: heartbeat timed out: '%v'", s.Config.Name, name)
|
s.logHeartbeatTimeout(peer)
|
||||||
case raft.ElectionTimeoutThresholdEventType:
|
case raft.ElectionTimeoutThresholdEventType:
|
||||||
select {
|
select {
|
||||||
case s.timeoutThresholdChan <- value:
|
case s.timeoutThresholdChan <- value:
|
||||||
|
@ -701,6 +715,35 @@ func (s *PeerServer) raftEventLogger(event raft.Event) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// logHeartbeatTimeout logs about the edge triggered heartbeat timeout event
|
||||||
|
// only if we haven't warned within a reasonable interval.
|
||||||
|
func (s *PeerServer) logHeartbeatTimeout(peer *raft.Peer) {
|
||||||
|
b, ok := s.logBackoffs[peer.Name]
|
||||||
|
if !ok {
|
||||||
|
b = &logBackoff{time.Time{}, time.Second, 1}
|
||||||
|
s.logBackoffs[peer.Name] = b
|
||||||
|
}
|
||||||
|
|
||||||
|
if peer.LastActivity().After(b.next) {
|
||||||
|
b.next = time.Time{}
|
||||||
|
b.backoff = time.Second
|
||||||
|
b.count = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if b.next.After(time.Now()) {
|
||||||
|
b.count++
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
b.backoff = 2 * b.backoff
|
||||||
|
if b.backoff > MaxHeartbeatTimeoutBackoff {
|
||||||
|
b.backoff = MaxHeartbeatTimeoutBackoff
|
||||||
|
}
|
||||||
|
b.next = time.Now().Add(b.backoff)
|
||||||
|
|
||||||
|
log.Infof("%s: warning: heartbeat time out peer=%q missed=%d backoff=%q", s.Config.Name, peer.Name, b.count, b.backoff)
|
||||||
|
}
|
||||||
|
|
||||||
func (s *PeerServer) recordMetricEvent(event raft.Event) {
|
func (s *PeerServer) recordMetricEvent(event raft.Event) {
|
||||||
name := fmt.Sprintf("raft.event.%s", event.Type())
|
name := fmt.Sprintf("raft.event.%s", event.Type())
|
||||||
value := event.Value().(time.Duration)
|
value := event.Value().(time.Duration)
|
||||||
|
|
Loading…
Reference in New Issue