// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"log"
	"math"
	"strings"
	"sync"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/storage/backend"
	"github.com/coreos/etcd/storage/storagepb"
)

const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)

var (
	// watchBatchMaxRevs is the maximum number of distinct revisions that
	// may be sent to an unsynced watcher at a time. Declared as
	// var instead of const for testing purposes.
	watchBatchMaxRevs = 1000
)

type eventBatch struct {
	// evs is a batch of revision-ordered events
	evs []storagepb.Event
	// revs is the number of unique revisions observed for this batch
	revs int
	// moreRev is the first revision with more events following this batch
	moreRev int64
}

type (
	watcherSetByKey map[string]watcherSet
	watcherSet      map[*watcher]struct{}
	watcherBatch    map[*watcher]*eventBatch
)

// add appends ev to the batch while tracking how many distinct revisions
// the batch spans. Once the batch exceeds watchBatchMaxRevs revisions,
// further events are dropped and moreRev records where the next batch
// must resume.
func (eb *eventBatch) add(ev storagepb.Event) {
	if eb.revs > watchBatchMaxRevs {
		// maxed out batch size
		return
	}

	if len(eb.evs) == 0 {
		// base case
		eb.revs = 1
		eb.evs = append(eb.evs, ev)
		return
	}

	// revision accounting
	ebRev := eb.evs[len(eb.evs)-1].Kv.ModRevision
	evRev := ev.Kv.ModRevision
	if evRev > ebRev {
		eb.revs++
		if eb.revs > watchBatchMaxRevs {
			eb.moreRev = evRev
			return
		}
	}

	eb.evs = append(eb.evs, ev)
}

func (wb watcherBatch) add(w *watcher, ev storagepb.Event) {
	eb := wb[w]
	if eb == nil {
		eb = &eventBatch{}
		wb[w] = eb
	}
	eb.add(ev)
}

func (w watcherSet) add(wa *watcher) {
	if _, ok := w[wa]; ok {
		panic("add watcher twice!")
	}
	w[wa] = struct{}{}
}

func (w watcherSetByKey) add(wa *watcher) {
	set := w[string(wa.key)]
	if set == nil {
		set = make(watcherSet)
		w[string(wa.key)] = set
	}
	set.add(wa)
}

func (w watcherSetByKey) getSetByKey(key string) (watcherSet, bool) {
	set, ok := w[key]
	return set, ok
}

func (w watcherSetByKey) delete(wa *watcher) bool {
	k := string(wa.key)
	if v, ok := w[k]; ok {
		if _, ok := v[wa]; ok {
			delete(v, wa)
			// if there is nothing in the set,
			// remove the set
			if len(v) == 0 {
				delete(w, k)
			}
			return true
		}
	}
	return false
}

type watchable interface {
	watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
	rev() int64
}

type watchableStore struct {
	mu sync.Mutex

	*store

	// unsynced contains all unsynced watchers that need to catch up with
	// events that have already happened.
	unsynced watcherSetByKey

	// synced contains all synced watchers that are in sync with the
	// progress of the store. The key of the map is the key that the
	// watcher watches on.
	synced watcherSetByKey

	stopc chan struct{}
	wg    sync.WaitGroup
}
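// The sketch below is commentary only (not part of the package); it
// illustrates the batching behavior of eventBatch.add. With
// watchBatchMaxRevs lowered to 2 (it is a var for exactly this kind of
// testing), events at revisions 3, 3, 4, 5 would batch as follows:
//
//	watchBatchMaxRevs = 2
//	eb := &eventBatch{}
//	eb.add(storagepb.Event{Kv: &storagepb.KeyValue{ModRevision: 3}})
//	// evs=[3], revs=1
//	eb.add(storagepb.Event{Kv: &storagepb.KeyValue{ModRevision: 3}})
//	// evs=[3,3], revs=1 (same revision, no new rev counted)
//	eb.add(storagepb.Event{Kv: &storagepb.KeyValue{ModRevision: 4}})
//	// evs=[3,3,4], revs=2 (at the cap, still accepted)
//	eb.add(storagepb.Event{Kv: &storagepb.KeyValue{ModRevision: 5}})
//	// dropped: revs would exceed the cap, so moreRev=5 records where
//	// the next batch for this watcher must resume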
// cancelFunc updates the unsynced and synced maps when running
// cancel operations.
type cancelFunc func()

func newWatchableStore(b backend.Backend, le lease.Lessor) *watchableStore {
	s := &watchableStore{
		store:    NewStore(b, le),
		unsynced: make(watcherSetByKey),
		synced:   make(watcherSetByKey),
		stopc:    make(chan struct{}),
	}
	if s.le != nil {
		// use this store as the deleter so revokes trigger watch events
		s.le.SetRangeDeleter(s)
	}
	s.wg.Add(1)
	go s.syncWatchersLoop()
	return s
}

func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rev = s.store.Put(key, value, lease)
	changes := s.store.getChanges()
	if len(changes) != 1 {
		log.Panicf("unexpected len(changes) != 1 after put")
	}

	ev := storagepb.Event{
		Type: storagepb.PUT,
		Kv:   &changes[0],
	}
	s.notify(rev, []storagepb.Event{ev})
	return rev
}

func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	n, rev = s.store.DeleteRange(key, end)
	changes := s.store.getChanges()
	if len(changes) != int(n) {
		log.Panicf("unexpected len(changes) != n after deleteRange")
	}

	if n == 0 {
		return n, rev
	}

	evs := make([]storagepb.Event, n)
	for i := range changes {
		// take the address of the slice element, not of the loop
		// variable, so each event keeps its own KeyValue
		evs[i] = storagepb.Event{
			Type: storagepb.DELETE,
			Kv:   &changes[i]}
		evs[i].Kv.ModRevision = rev
	}
	s.notify(rev, evs)
	return n, rev
}

func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	return s.store.TxnBegin()
}

func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		return err
	}

	changes := s.getChanges()
	if len(changes) == 0 {
		s.mu.Unlock()
		return nil
	}

	rev := s.store.Rev()
	evs := make([]storagepb.Event, len(changes))
	for i, change := range changes {
		switch change.Value {
		case nil:
			evs[i] = storagepb.Event{
				Type: storagepb.DELETE,
				Kv:   &changes[i]}
			evs[i].Kv.ModRevision = rev
		default:
			evs[i] = storagepb.Event{
				Type: storagepb.PUT,
				Kv:   &changes[i]}
		}
	}

	s.notify(rev, evs)
	s.mu.Unlock()

	return nil
}

func (s *watchableStore) Close() error {
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}

func (s *watchableStore) NewWatchStream() WatchStream {
	watchStreamGauge.Inc()
	return &watchStream{
		watchable: s,
		ch:        make(chan WatchResponse, chanBufLen),
		cancels:   make(map[WatchID]cancelFunc),
	}
}

func (s *watchableStore) watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watcher{
		key:    key,
		prefix: prefix,
		cur:    startRev,
		id:     id,
		ch:     ch,
	}

	s.store.mu.Lock()
	synced := startRev > s.store.currentRev.main || startRev == 0
	if synced {
		wa.cur = s.store.currentRev.main + 1
	}
	s.store.mu.Unlock()

	if synced {
		if startRev > wa.cur {
			panic("can't watch past sync revision")
		}
		s.synced.add(wa)
	} else {
		slowWatcherGauge.Inc()
		s.unsynced.add(wa)
	}
	watcherGauge.Inc()

	cancel := cancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()

		// remove references of the watcher
		if s.unsynced.delete(wa) {
			slowWatcherGauge.Dec()
			watcherGauge.Dec()
			return
		}
		if s.synced.delete(wa) {
			watcherGauge.Dec()
		}
		// If the watcher is in neither map, it has already finished watching.
	})

	return wa, cancel
}

// syncWatchersLoop syncs the watchers in the unsynced map every 100ms.
func (s *watchableStore) syncWatchersLoop() {
	defer s.wg.Done()

	for {
		s.mu.Lock()
		s.syncWatchers()
		s.mu.Unlock()

		select {
		case <-time.After(100 * time.Millisecond):
		case <-s.stopc:
			return
		}
	}
}
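// A commented consumption sketch (hypothetical caller code; the Watch and
// Chan method shapes are assumed from the watch stream implemented
// elsewhere in this package, not defined in this file). Each stream shares
// one buffered channel of length chanBufLen among all of its watchers, so
// a slow reader here is what eventually pushes a synced watcher back into
// the unsynced set via notify:
//
//	ws := s.NewWatchStream()
//	id := ws.Watch([]byte("foo"), true, 0) // prefix watch from current rev
//	for resp := range ws.Chan() {
//		_ = resp.Events // revision-ordered events for watcher id
//	}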
// syncWatchers syncs unsynced watchers by:
//  1. iterating over all unsynced watchers to find the minimum revision
//     within their ranges, skipping any watcher whose current revision is
//     behind the compact revision of the store;
//  2. using this minimum revision to fetch all matching key-value pairs
//     from the backend;
//  3. sending those events to the watchers.
func (s *watchableStore) syncWatchers() {
	s.store.mu.Lock()
	defer s.store.mu.Unlock()

	if len(s.unsynced) == 0 {
		return
	}

	// in order to find key-value pairs from unsynced watchers, we need to
	// find the min revision index, and these revisions can be used to
	// query the backend store of key-value pairs
	prefixes, minRev := s.scanUnsync()
	curRev := s.store.currentRev.main

	minBytes, maxBytes := newRevBytes(), newRevBytes()
	revToBytes(revision{main: minRev}, minBytes)
	revToBytes(revision{main: curRev + 1}, maxBytes)

	// UnsafeRange returns keys and values. In boltdb, keys are revisions
	// and values are the actual key-value pairs in the backend.
	tx := s.store.b.BatchTx()
	tx.Lock()
	revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
	evs := kvsToEvents(revs, vs, s.unsynced, prefixes)
	tx.Unlock()

	for w, eb := range newWatcherBatch(s.unsynced, evs) {
		select {
		// s.store.Rev also uses Lock, so just return directly
		case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: s.store.currentRev.main}:
			pendingEventsGauge.Add(float64(len(eb.evs)))
		default:
			// TODO: handle the full unsynced watchers.
			// continue to process other watchers for now, the full ones
			// will be processed next time and hopefully it will not be full.
			continue
		}
		if eb.moreRev != 0 {
			w.cur = eb.moreRev
			continue
		}
		w.cur = curRev
		s.synced.add(w)
		s.unsynced.delete(w)
	}

	slowWatcherGauge.Set(float64(len(s.unsynced)))
}

// scanUnsync returns the prefixes watched by unsynced prefix watchers and
// the minimum revision needed to catch up every unsynced watcher. Watchers
// whose revision has been compacted away are notified and removed.
func (s *watchableStore) scanUnsync() (prefixes map[string]struct{}, minRev int64) {
	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev

	prefixes = make(map[string]struct{})
	minRev = int64(math.MaxInt64)
	for _, set := range s.unsynced {
		for w := range set {
			k := string(w.key)

			if w.cur > curRev {
				panic("watcher current revision should not exceed current revision")
			}

			if w.cur < compactionRev {
				select {
				case w.ch <- WatchResponse{WatchID: w.id, CompactRevision: compactionRev}:
					s.unsynced.delete(w)
				default:
					// retry next time
				}
				continue
			}

			if minRev > w.cur {
				minRev = w.cur
			}

			if w.prefix {
				prefixes[k] = struct{}{}
			}
		}
	}

	return prefixes, minRev
}

// kvsToEvents gets all events for the watchers from all key-value pairs
func kvsToEvents(revs, vals [][]byte, wsk watcherSetByKey, pfxs map[string]struct{}) (evs []storagepb.Event) {
	for i, v := range vals {
		var kv storagepb.KeyValue
		if err := kv.Unmarshal(v); err != nil {
			log.Panicf("storage: cannot unmarshal event: %v", err)
		}

		k := string(kv.Key)
		if _, ok := wsk.getSetByKey(k); !ok && !matchPrefix(k, pfxs) {
			continue
		}

		ty := storagepb.PUT
		if isTombstone(revs[i]) {
			ty = storagepb.DELETE
			// patch in mod revision so watchers won't skip
			kv.ModRevision = bytesToRev(revs[i]).main
		}
		evs = append(evs, storagepb.Event{Kv: &kv, Type: ty})
	}
	return evs
}
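// Illustrative note on the backend scan in syncWatchers (commentary only):
// since boltdb keys are encoded revisions, the half-open revision window
// [minRev, curRev+1) becomes a byte-range query. A sketch, assuming
// minRev=5 and curRev=7:
//
//	minBytes, maxBytes := newRevBytes(), newRevBytes()
//	revToBytes(revision{main: 5}, minBytes)
//	revToBytes(revision{main: 8}, maxBytes) // curRev+1: exclusive upper bound
//	revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
//	// revs[i] decodes back via bytesToRev(revs[i]); each vs[i] unmarshals
//	// into a storagepb.KeyValue, with deletes detected by isTombstone on
//	// the revision bytes.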
// notify notifies the watchers watching on the keys of the given events
// that those events just happened at the given rev.
func (s *watchableStore) notify(rev int64, evs []storagepb.Event) {
	we := newWatcherBatch(s.synced, evs)
	for _, wm := range s.synced {
		for w := range wm {
			eb, ok := we[w]
			if !ok {
				continue
			}
			if eb.revs != 1 {
				panic("unexpected multiple revisions in notification")
			}
			select {
			case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: s.Rev()}:
				pendingEventsGauge.Add(float64(len(eb.evs)))
			default:
				// move slow watcher to unsynced
				w.cur = rev
				s.unsynced.add(w)
				delete(wm, w)
				slowWatcherGauge.Inc()
			}
		}
	}
}

func (s *watchableStore) rev() int64 { return s.store.Rev() }

type watcher struct {
	// the watcher key
	key []byte

	// prefix indicates if watcher is on a key or a prefix.
	// If prefix is true, the watcher is on a prefix.
	prefix bool

	// cur is the current watcher revision.
	// If cur is behind the current revision of the KV,
	// the watcher is unsynced and needs to catch up.
	cur int64
	id  WatchID

	// a chan to send out the watch response.
	// The chan might be shared with other watchers.
	ch chan<- WatchResponse
}

// newWatcherBatch maps watchers to their matched events. It enables quick
// event lookup by watcher.
func newWatcherBatch(sm watcherSetByKey, evs []storagepb.Event) watcherBatch {
	wb := make(watcherBatch)
	for _, ev := range evs {
		key := string(ev.Kv.Key)

		// check all prefixes of the key to notify all corresponding watchers
		for i := 0; i <= len(key); i++ {
			for w := range sm[key[:i]] {
				// don't double notify
				if ev.Kv.ModRevision < w.cur {
					continue
				}

				// the watcher needs to be notified when either it watches a
				// prefix or the key matches exactly.
				if !w.prefix && i != len(ev.Kv.Key) {
					continue
				}
				wb.add(w, ev)
			}
		}
	}
	return wb
}

// matchPrefix returns true if key has any matching prefix
// from prefixes map.
func matchPrefix(key string, prefixes map[string]struct{}) bool {
	for p := range prefixes {
		if strings.HasPrefix(key, p) {
			return true
		}
	}
	return false
}
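// Commentary on the prefix enumeration above: newWatcherBatch probes every
// prefix of an event's key, from the empty string up to the full key, so one
// map lookup per prefix finds all candidate watchers. For an event on key
// "foo/bar" it probes sm[""], sm["f"], ..., sm["foo/bar"]; exact watchers
// only match at i == len(key), prefix watchers match at any i. A
// hypothetical sketch using the types in this file:
//
//	sm := make(watcherSetByKey)
//	sm.add(&watcher{key: []byte("foo/"), prefix: true}) // matches at i=4
//	sm.add(&watcher{key: []byte("foo/bar")})            // matches at i=7 only
//	wb := newWatcherBatch(sm, []storagepb.Event{
//		{Type: storagepb.PUT, Kv: &storagepb.KeyValue{Key: []byte("foo/bar"), ModRevision: 9}},
//	})
//	// wb now holds one eventBatch per watcher, each containing the event.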