etcd/mvcc/watcher_group.go

// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"fmt"
	"math"

	"go.etcd.io/etcd/mvcc/mvccpb"
	"go.etcd.io/etcd/pkg/adt"
)

var (
	// watchBatchMaxRevs is the maximum number of distinct revisions that
	// may be sent to an unsynced watcher at a time. Declared as
	// var instead of const for testing purposes.
	watchBatchMaxRevs = 1000
)

type eventBatch struct {
	// evs is a batch of revision-ordered events
	evs []mvccpb.Event
	// revs is the minimum number of unique revisions observed for this batch
	revs int
	// moreRev is the first revision with more events following this batch
	moreRev int64
}

func (eb *eventBatch) add(ev mvccpb.Event) {
	if eb.revs > watchBatchMaxRevs {
		// maxed out batch size
		return
	}

	if len(eb.evs) == 0 {
		// base case
		eb.revs = 1
		eb.evs = append(eb.evs, ev)
		return
	}

	// revision accounting
	ebRev := eb.evs[len(eb.evs)-1].Kv.ModRevision
	evRev := ev.Kv.ModRevision
	if evRev > ebRev {
		eb.revs++
		if eb.revs > watchBatchMaxRevs {
			eb.moreRev = evRev
			return
		}
	}

	eb.evs = append(eb.evs, ev)
}
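
// exampleEventBatchAdd is a minimal illustrative sketch (not used by the
// package itself) of how eventBatch caps the number of distinct revisions per
// batch. It temporarily lowers watchBatchMaxRevs, which is declared as a var
// above specifically so tests can do this; the key and revisions below are
// hypothetical.
func exampleEventBatchAdd() {
	old := watchBatchMaxRevs
	watchBatchMaxRevs = 2
	defer func() { watchBatchMaxRevs = old }()

	eb := &eventBatch{}
	for rev := int64(1); rev <= 4; rev++ {
		eb.add(mvccpb.Event{
			Type: mvccpb.PUT,
			Kv:   &mvccpb.KeyValue{Key: []byte("foo"), ModRevision: rev},
		})
	}
	// Only revisions 1 and 2 are kept; revision 3 sets moreRev to signal that
	// another batch is needed starting at that revision.
	fmt.Printf("kept=%d moreRev=%d\n", len(eb.evs), eb.moreRev) // kept=2 moreRev=3
}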

type watcherBatch map[*watcher]*eventBatch

func (wb watcherBatch) add(w *watcher, ev mvccpb.Event) {
	eb := wb[w]
	if eb == nil {
		eb = &eventBatch{}
		wb[w] = eb
	}
	eb.add(ev)
}

// newWatcherBatch maps watchers to their matched events. It enables quick
// event lookup by watcher.
func newWatcherBatch(wg *watcherGroup, evs []mvccpb.Event) watcherBatch {
	if len(wg.watchers) == 0 {
		return nil
	}

	wb := make(watcherBatch)
	for _, ev := range evs {
		for w := range wg.watcherSetByKey(string(ev.Kv.Key)) {
			if ev.Kv.ModRevision >= w.minRev {
				// don't double notify
				wb.add(w, ev)
			}
		}
	}
	return wb
}
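
// exampleNewWatcherBatch is an illustrative sketch (assuming the watcher
// struct from watchable_store.go, whose key/end/minRev fields are used above)
// of how a slice of events fans out to the watchers that match them: events
// below a watcher's minRev or on other keys are skipped.
func exampleNewWatcherBatch() {
	wg := newWatcherGroup()
	// Single-key watcher on "foo", interested in revisions >= 2.
	w := &watcher{key: []byte("foo"), minRev: 2}
	wg.add(w)

	evs := []mvccpb.Event{
		{Kv: &mvccpb.KeyValue{Key: []byte("foo"), ModRevision: 1}}, // below minRev, skipped
		{Kv: &mvccpb.KeyValue{Key: []byte("foo"), ModRevision: 3}}, // delivered
		{Kv: &mvccpb.KeyValue{Key: []byte("bar"), ModRevision: 3}}, // different key, skipped
	}

	wb := newWatcherBatch(&wg, evs)
	fmt.Printf("watchers=%d events=%d\n", len(wb), len(wb[w].evs)) // watchers=1 events=1
}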

type watcherSet map[*watcher]struct{}

func (w watcherSet) add(wa *watcher) {
	if _, ok := w[wa]; ok {
		panic("add watcher twice!")
	}
	w[wa] = struct{}{}
}

func (w watcherSet) union(ws watcherSet) {
	for wa := range ws {
		w.add(wa)
	}
}

func (w watcherSet) delete(wa *watcher) {
	if _, ok := w[wa]; !ok {
		panic("removing missing watcher!")
	}
	delete(w, wa)
}

type watcherSetByKey map[string]watcherSet

func (w watcherSetByKey) add(wa *watcher) {
	set := w[string(wa.key)]
	if set == nil {
		set = make(watcherSet)
		w[string(wa.key)] = set
	}
	set.add(wa)
}

func (w watcherSetByKey) delete(wa *watcher) bool {
	k := string(wa.key)
	if v, ok := w[k]; ok {
		if _, ok := v[wa]; ok {
			delete(v, wa)
			if len(v) == 0 {
				// remove the set; nothing left
				delete(w, k)
			}
			return true
		}
	}
	return false
}

// watcherGroup is a collection of watchers organized by their ranges
type watcherGroup struct {
	// keyWatchers has the watchers that watch on a single key
	keyWatchers watcherSetByKey
	// ranges has the watchers that watch a range; it is sorted by interval
	ranges adt.IntervalTree
	// watchers is the set of all watchers
	watchers watcherSet
}

func newWatcherGroup() watcherGroup {
	return watcherGroup{
		keyWatchers: make(watcherSetByKey),
		ranges:      adt.NewIntervalTree(),
		watchers:    make(watcherSet),
	}
}

// add puts a watcher in the group.
func (wg *watcherGroup) add(wa *watcher) {
	wg.watchers.add(wa)
	if wa.end == nil {
		wg.keyWatchers.add(wa)
		return
	}

	// interval already registered?
	ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
	if iv := wg.ranges.Find(ivl); iv != nil {
		iv.Val.(watcherSet).add(wa)
		return
	}

	// not registered, put in interval tree
	ws := make(watcherSet)
	ws.add(wa)
	wg.ranges.Insert(ivl, ws)
}

// contains is whether the given key has a watcher in the group.
func (wg *watcherGroup) contains(key string) bool {
	_, ok := wg.keyWatchers[key]
	return ok || wg.ranges.Intersects(adt.NewStringAffinePoint(key))
}
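
// exampleWatcherGroupAdd is an illustrative sketch (assuming the watcher
// struct from watchable_store.go) of the two registration paths in add:
// watchers with a nil end go into keyWatchers, watchers with an end go into
// the interval tree. The keys below are hypothetical.
func exampleWatcherGroupAdd() {
	wg := newWatcherGroup()

	// end == nil registers a single-key watcher.
	wg.add(&watcher{key: []byte("foo"), minRev: 1})
	// A non-nil end registers a range watcher over [a, c).
	wg.add(&watcher{key: []byte("a"), end: []byte("c"), minRev: 1})

	fmt.Println(wg.contains("foo")) // true: exact key match
	fmt.Println(wg.contains("b"))   // true: inside the [a, c) range
	fmt.Println(wg.contains("zzz")) // false: no watcher covers it
	fmt.Println(wg.size())          // 2
}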

// size gives the number of unique watchers in the group.
func (wg *watcherGroup) size() int { return len(wg.watchers) }

// delete removes a watcher from the group.
func (wg *watcherGroup) delete(wa *watcher) bool {
	if _, ok := wg.watchers[wa]; !ok {
		return false
	}
	wg.watchers.delete(wa)
	if wa.end == nil {
		wg.keyWatchers.delete(wa)
		return true
	}

	ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
	iv := wg.ranges.Find(ivl)
	if iv == nil {
		return false
	}

	ws := iv.Val.(watcherSet)
	delete(ws, wa)
	if len(ws) == 0 {
		// remove interval missing watchers
		if ok := wg.ranges.Delete(ivl); !ok {
			panic("could not remove watcher from interval tree")
		}
	}
	return true
}

// choose selects watchers from the watcher group to update
func (wg *watcherGroup) choose(maxWatchers int, curRev, compactRev int64) (*watcherGroup, int64) {
	if len(wg.watchers) < maxWatchers {
		return wg, wg.chooseAll(curRev, compactRev)
	}
	ret := newWatcherGroup()
	for w := range wg.watchers {
		if maxWatchers <= 0 {
			break
		}
		maxWatchers--
		ret.add(w)
	}
	return &ret, ret.chooseAll(curRev, compactRev)
}

func (wg *watcherGroup) chooseAll(curRev, compactRev int64) int64 {
	minRev := int64(math.MaxInt64)
	for w := range wg.watchers {
		if w.minRev > curRev {
			// After a network partition, a future revision watcher may be chosen here if it
			// was moved from the "synced" watcher group by a restore operation (e.g. the gRPC
			// proxy's leader-loss watcher on key "proxy-namespace__lostleader" with revision
			// "math.MaxInt64 - 2"). Only panic when the watcher did not come from a restore.
			if !w.restore {
				panic(fmt.Errorf("watcher minimum revision %d should not exceed current revision %d", w.minRev, curRev))
			}

			// mark 'restore' done, since it's chosen
			w.restore = false
		}
		if w.minRev < compactRev {
			select {
			case w.ch <- WatchResponse{WatchID: w.id, CompactRevision: compactRev}:
				w.compacted = true
				wg.delete(w)
			default:
				// retry next time
			}
			continue
		}
		if minRev > w.minRev {
			minRev = w.minRev
		}
	}
	return minRev
}
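
// exampleChooseAll is an illustrative sketch (assuming the watcher struct from
// watchable_store.go, with ch a buffered WatchResponse channel) of the two
// things chooseAll does: it notifies and drops watchers whose minRev falls
// below the compact revision, and it returns the smallest revision the
// remaining watchers still need.
func exampleChooseAll() {
	wg := newWatcherGroup()

	compactedCh := make(chan WatchResponse, 1)
	// Watcher behind the compact revision: it receives a compaction notification.
	wg.add(&watcher{key: []byte("old"), minRev: 3, ch: compactedCh})
	// Watcher ahead of the compact revision: it only contributes to minRev.
	wg.add(&watcher{key: []byte("new"), minRev: 7, ch: make(chan WatchResponse, 1)})

	curRev, compactRev := int64(10), int64(5)
	minRev := wg.chooseAll(curRev, compactRev)

	resp := <-compactedCh
	fmt.Printf("minRev=%d compactRevision=%d remaining=%d\n", minRev, resp.CompactRevision, wg.size())
	// expected: minRev=7 compactRevision=5 remaining=1
}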

// watcherSetByKey gets the set of watchers that receive events on the given key.
func (wg *watcherGroup) watcherSetByKey(key string) watcherSet {
	wkeys := wg.keyWatchers[key]
	wranges := wg.ranges.Stab(adt.NewStringAffinePoint(key))

	// zero-copy cases
	switch {
	case len(wranges) == 0:
		// no need to merge ranges or copy; reuse single-key set
		return wkeys
	case len(wranges) == 0 && len(wkeys) == 0:
		return nil
	case len(wranges) == 1 && len(wkeys) == 0:
		return wranges[0].Val.(watcherSet)
	}

	// copy case
	ret := make(watcherSet)
	ret.union(wg.keyWatchers[key])
	for _, item := range wranges {
		ret.union(item.Val.(watcherSet))
	}
	return ret
}
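
// exampleWatcherSetByKey is an illustrative sketch (assuming the watcher
// struct from watchable_store.go) of the merge performed by watcherSetByKey:
// when a key is covered by both a single-key watcher and a range watcher, the
// two sets are unioned into a fresh watcherSet; otherwise one of the zero-copy
// cases applies. The keys below are hypothetical.
func exampleWatcherSetByKey() {
	wg := newWatcherGroup()
	wg.add(&watcher{key: []byte("b"), minRev: 1})                   // exact key "b"
	wg.add(&watcher{key: []byte("a"), end: []byte("c"), minRev: 1}) // range [a, c)

	fmt.Println(len(wg.watcherSetByKey("b"))) // 2: key watcher + range watcher
	fmt.Println(len(wg.watcherSetByKey("a"))) // 1: only the range watcher
	fmt.Println(len(wg.watcherSetByKey("z"))) // 0: nothing covers "z"
}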