etcd/storage/kvstore.go

308 lines
6.4 KiB
Go
Raw Normal View History

2015-05-22 18:11:43 +03:00
package storage
import (
2015-05-27 20:35:51 +03:00
"errors"
2015-05-22 18:11:43 +03:00
"log"
2015-05-27 20:35:51 +03:00
"math/rand"
2015-05-22 18:11:43 +03:00
"sync"
"time"
"github.com/coreos/etcd/storage/backend"
"github.com/coreos/etcd/storage/storagepb"
)
// Parameters handed to backend.New when a store is created.
var (
	batchLimit    = 10000
	batchInterval = 100 * time.Millisecond
)

// Names used inside the backend.
var (
	// keyBucketName is the bucket created by newStore; all reads and writes
	// in this file go through it.
	keyBucketName = []byte("key")

	// scheduledCompactKeyName is the key under which Compact records the
	// requested compaction reversion.
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	// finishedCompactKeyName is not referenced in this part of the file;
	// presumably written once a scheduled compaction completes.
	finishedCompactKeyName = []byte("finishedCompactRev")
)

// Errors returned by the store.
var (
	// ErrTnxIDMismatch is returned by the Tnx* methods when the supplied id
	// is not the id of the currently open transaction.
	ErrTnxIDMismatch = errors.New("storage: tnx id mismatch")
	// ErrCompacted is returned when the requested reversion is at or below
	// the last compacted main reversion.
	ErrCompacted = errors.New("storage: required reversion has been compacted")
)
type store struct {
2015-05-22 23:35:43 +03:00
mu sync.RWMutex
2015-05-22 18:11:43 +03:00
b backend.Backend
kvindex index
2015-05-31 08:56:33 +03:00
currentRev reversion
2015-05-31 18:59:31 +03:00
// the main reversion of the last compaction
compactMainRev int64
2015-05-22 23:35:43 +03:00
2015-05-27 20:35:51 +03:00
tmu sync.Mutex // protect the tnxID field
tnxID int64 // tracks the current tnxID to verify tnx operations
2015-05-22 18:11:43 +03:00
}
2015-05-28 00:24:23 +03:00
func newStore(path string) KV {
2015-05-22 18:11:43 +03:00
s := &store{
2015-05-31 18:59:31 +03:00
b: backend.New(path, batchInterval, batchLimit),
kvindex: newTreeIndex(),
currentRev: reversion{},
compactMainRev: -1,
2015-05-22 18:11:43 +03:00
}
tx := s.b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket(keyBucketName)
tx.Unlock()
s.b.ForceCommit()
return s
}
func (s *store) Put(key, value []byte) int64 {
2015-05-27 20:35:51 +03:00
id := s.TnxBegin()
2015-05-31 08:56:33 +03:00
s.put(key, value, s.currentRev.main+1)
2015-05-27 20:35:51 +03:00
s.TnxEnd(id)
2015-05-22 18:11:43 +03:00
2015-05-31 08:56:33 +03:00
return int64(s.currentRev.main)
2015-05-22 18:11:43 +03:00
}
2015-05-31 18:59:31 +03:00
func (s *store) Range(key, end []byte, limit, rangeRev int64) (kvs []storagepb.KeyValue, rev int64, err error) {
2015-05-27 20:35:51 +03:00
id := s.TnxBegin()
2015-05-31 18:59:31 +03:00
kvs, rev, err = s.rangeKeys(key, end, limit, rangeRev)
2015-05-27 20:35:51 +03:00
s.TnxEnd(id)
2015-05-22 23:35:43 +03:00
2015-05-31 18:59:31 +03:00
return kvs, rev, err
2015-05-22 23:35:43 +03:00
}
2015-05-31 08:56:33 +03:00
func (s *store) DeleteRange(key, end []byte) (n, rev int64) {
2015-05-27 20:35:51 +03:00
id := s.TnxBegin()
2015-05-31 08:56:33 +03:00
n = s.deleteRange(key, end, s.currentRev.main+1)
2015-05-27 20:35:51 +03:00
s.TnxEnd(id)
2015-05-22 23:35:43 +03:00
2015-05-31 08:56:33 +03:00
return n, int64(s.currentRev.main)
2015-05-22 23:35:43 +03:00
}
2015-05-27 20:35:51 +03:00
func (s *store) TnxBegin() int64 {
2015-05-22 23:35:43 +03:00
s.mu.Lock()
2015-05-31 08:56:33 +03:00
s.currentRev.sub = 0
2015-05-27 20:35:51 +03:00
s.tmu.Lock()
defer s.tmu.Unlock()
s.tnxID = rand.Int63()
return s.tnxID
2015-05-22 23:35:43 +03:00
}
2015-05-22 18:11:43 +03:00
2015-05-27 20:35:51 +03:00
func (s *store) TnxEnd(tnxID int64) error {
s.tmu.Lock()
defer s.tmu.Unlock()
if tnxID != s.tnxID {
return ErrTnxIDMismatch
}
2015-05-31 08:56:33 +03:00
if s.currentRev.sub != 0 {
s.currentRev.main += 1
2015-05-22 23:35:43 +03:00
}
2015-05-31 08:56:33 +03:00
s.currentRev.sub = 0
2015-05-22 23:35:43 +03:00
s.mu.Unlock()
2015-05-27 20:35:51 +03:00
return nil
2015-05-22 23:35:43 +03:00
}
2015-05-31 08:56:33 +03:00
func (s *store) TnxRange(tnxID int64, key, end []byte, limit, rangeRev int64) (kvs []storagepb.KeyValue, rev int64, err error) {
2015-05-27 20:35:51 +03:00
s.tmu.Lock()
defer s.tmu.Unlock()
if tnxID != s.tnxID {
return nil, 0, ErrTnxIDMismatch
}
2015-05-31 18:59:31 +03:00
return s.rangeKeys(key, end, limit, rangeRev)
2015-05-27 19:58:21 +03:00
}
2015-05-31 08:56:33 +03:00
func (s *store) TnxPut(tnxID int64, key, value []byte) (rev int64, err error) {
2015-05-27 20:35:51 +03:00
s.tmu.Lock()
defer s.tmu.Unlock()
if tnxID != s.tnxID {
return 0, ErrTnxIDMismatch
}
2015-05-31 08:56:33 +03:00
s.put(key, value, s.currentRev.main+1)
return int64(s.currentRev.main + 1), nil
2015-05-27 19:58:21 +03:00
}
2015-05-31 08:56:33 +03:00
func (s *store) TnxDeleteRange(tnxID int64, key, end []byte) (n, rev int64, err error) {
2015-05-27 20:35:51 +03:00
s.tmu.Lock()
defer s.tmu.Unlock()
if tnxID != s.tnxID {
return 0, 0, ErrTnxIDMismatch
}
2015-05-31 08:56:33 +03:00
n = s.deleteRange(key, end, s.currentRev.main+1)
if n != 0 || s.currentRev.sub != 0 {
rev = int64(s.currentRev.main + 1)
2015-05-27 19:58:21 +03:00
}
2015-05-31 08:56:33 +03:00
return n, rev, nil
2015-05-27 19:58:21 +03:00
}
2015-05-31 18:59:31 +03:00
// Compact schedules a compaction at main reversion rev. It returns
// ErrCompacted if rev is not greater than the last compacted reversion.
// Otherwise it records rev as the new compaction point, writes the scheduled
// reversion to the backend, compacts the index, and starts the backend
// compaction in a separate goroutine.
func (s *store) Compact(rev int64) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.compactMainRev >= rev {
		return ErrCompacted
	}
	s.compactMainRev = rev

	encoded := make([]byte, 8+1+8)
	revToBytes(reversion{main: rev}, encoded)

	batch := s.b.BatchTx()
	batch.Lock()
	batch.UnsafePut(keyBucketName, scheduledCompactKeyName, encoded)
	batch.Unlock()

	// The value returned by the index compaction is handed to the
	// background job, which runs without holding s.mu.
	keep := s.kvindex.Compact(rev)
	go s.scheduleCompaction(rev, keep)

	return nil
}
2015-05-27 19:58:21 +03:00
// range is a keyword in Go, add Keys suffix.
2015-05-31 18:59:31 +03:00
func (s *store) rangeKeys(key, end []byte, limit, rangeRev int64) (kvs []storagepb.KeyValue, rev int64, err error) {
2015-05-31 08:56:33 +03:00
if rangeRev <= 0 {
rev = int64(s.currentRev.main)
if s.currentRev.sub > 0 {
rev += 1
2015-05-22 23:35:43 +03:00
}
2015-05-22 18:11:43 +03:00
} else {
2015-05-31 08:56:33 +03:00
rev = rangeRev
2015-05-22 18:11:43 +03:00
}
2015-05-31 18:59:31 +03:00
if rev <= s.compactMainRev {
return nil, 0, ErrCompacted
}
2015-05-22 18:11:43 +03:00
2015-05-31 18:59:31 +03:00
_, revpairs := s.kvindex.Range(key, end, int64(rev))
if len(revpairs) == 0 {
return nil, rev, nil
2015-05-22 18:11:43 +03:00
}
2015-05-31 18:59:31 +03:00
if limit > 0 && len(revpairs) > int(limit) {
revpairs = revpairs[:limit]
2015-05-22 18:11:43 +03:00
}
tx := s.b.BatchTx()
tx.Lock()
defer tx.Unlock()
2015-05-31 18:59:31 +03:00
for _, revpair := range revpairs {
2015-05-31 08:56:33 +03:00
revbytes := make([]byte, 8+1+8)
2015-05-31 18:59:31 +03:00
revToBytes(revpair, revbytes)
2015-05-22 18:11:43 +03:00
2015-05-31 18:59:31 +03:00
_, vs := tx.UnsafeRange(keyBucketName, revbytes, nil, 0)
2015-05-31 08:56:33 +03:00
if len(vs) != 1 {
2015-05-31 18:59:31 +03:00
log.Fatalf("storage: range cannot find rev (%d,%d)", revpair.main, revpair.sub)
2015-05-22 18:11:43 +03:00
}
2015-05-31 08:56:33 +03:00
e := &storagepb.Event{}
if err := e.Unmarshal(vs[0]); err != nil {
log.Fatalf("storage: cannot unmarshal event: %v", err)
2015-05-22 18:11:43 +03:00
}
2015-05-31 08:56:33 +03:00
if e.Type == storagepb.PUT {
kvs = append(kvs, e.Kv)
2015-05-22 23:35:43 +03:00
}
2015-05-22 18:11:43 +03:00
}
2015-05-31 18:59:31 +03:00
return kvs, rev, nil
2015-05-22 18:11:43 +03:00
}
2015-05-31 08:56:33 +03:00
func (s *store) put(key, value []byte, rev int64) {
ibytes := make([]byte, 8+1+8)
2015-05-31 18:59:31 +03:00
revToBytes(reversion{main: rev, sub: s.currentRev.sub}, ibytes)
2015-05-22 18:11:43 +03:00
event := storagepb.Event{
Type: storagepb.PUT,
Kv: storagepb.KeyValue{
Key: key,
Value: value,
},
}
d, err := event.Marshal()
if err != nil {
log.Fatalf("storage: cannot marshal event: %v", err)
}
tx := s.b.BatchTx()
tx.Lock()
defer tx.Unlock()
tx.UnsafePut(keyBucketName, ibytes, d)
2015-05-31 08:56:33 +03:00
s.kvindex.Put(key, reversion{main: rev, sub: s.currentRev.sub})
s.currentRev.sub += 1
2015-05-22 23:35:43 +03:00
}
2015-05-31 08:56:33 +03:00
func (s *store) deleteRange(key, end []byte, rev int64) int64 {
2015-05-22 23:35:43 +03:00
var n int64
2015-05-31 08:56:33 +03:00
rrev := rev
if s.currentRev.sub > 0 {
rrev += 1
2015-05-22 23:35:43 +03:00
}
2015-05-31 08:56:33 +03:00
keys, _ := s.kvindex.Range(key, end, rrev)
2015-05-22 23:35:43 +03:00
2015-05-31 08:56:33 +03:00
if len(keys) == 0 {
2015-05-22 23:35:43 +03:00
return 0
}
2015-05-31 08:56:33 +03:00
for _, key := range keys {
ok := s.delete(key, rev)
2015-05-22 23:35:43 +03:00
if ok {
n++
}
}
return n
2015-05-22 18:11:43 +03:00
}
2015-05-31 08:56:33 +03:00
func (s *store) delete(key []byte, mainrev int64) bool {
grev := mainrev
if s.currentRev.sub > 0 {
grev += 1
2015-05-22 23:35:43 +03:00
}
2015-05-31 08:56:33 +03:00
rev, err := s.kvindex.Get(key, grev)
2015-05-22 18:11:43 +03:00
if err != nil {
// key not exist
return false
}
2015-05-31 08:56:33 +03:00
tx := s.b.BatchTx()
tx.Lock()
defer tx.Unlock()
revbytes := make([]byte, 8+1+8)
2015-05-31 18:59:31 +03:00
revToBytes(rev, revbytes)
2015-05-31 08:56:33 +03:00
2015-05-31 18:59:31 +03:00
_, vs := tx.UnsafeRange(keyBucketName, revbytes, nil, 0)
2015-05-31 08:56:33 +03:00
if len(vs) != 1 {
log.Fatalf("storage: delete cannot find rev (%d,%d)", rev.main, rev.sub)
}
e := &storagepb.Event{}
if err := e.Unmarshal(vs[0]); err != nil {
log.Fatalf("storage: cannot unmarshal event: %v", err)
}
if e.Type == storagepb.DELETE {
return false
}
ibytes := make([]byte, 8+1+8)
2015-05-31 18:59:31 +03:00
revToBytes(reversion{main: mainrev, sub: s.currentRev.sub}, ibytes)
2015-05-22 18:11:43 +03:00
event := storagepb.Event{
Type: storagepb.DELETE,
Kv: storagepb.KeyValue{
Key: key,
},
}
d, err := event.Marshal()
if err != nil {
log.Fatalf("storage: cannot marshal event: %v", err)
}
tx.UnsafePut(keyBucketName, ibytes, d)
2015-05-31 08:56:33 +03:00
err = s.kvindex.Tombstone(key, reversion{main: mainrev, sub: s.currentRev.sub})
2015-05-22 18:11:43 +03:00
if err != nil {
log.Fatalf("storage: cannot tombstone an existing key (%s): %v", string(key), err)
}
2015-05-31 08:56:33 +03:00
s.currentRev.sub += 1
2015-05-22 18:11:43 +03:00
return true
}