etcd/raft/storage.go

274 lines
8.4 KiB
Go
Raw Normal View History

2016-05-13 06:49:15 +03:00
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
"sync"
pb "go.etcd.io/etcd/v3/raft/raftpb"
)
// ErrCompacted is returned by Storage.Entries/Compact when a requested
// index is unavailable because it predates the last snapshot.
var ErrCompacted = errors.New("requested index is unavailable due to compaction")
2016-01-29 10:18:47 +03:00
// ErrSnapOutOfDate is returned by Storage.CreateSnapshot when a requested
// index is older than the existing snapshot.
var ErrSnapOutOfDate = errors.New("requested index is older than the existing snapshot")
2016-04-08 08:14:56 +03:00
// ErrUnavailable is returned by Storage interface when the requested log entries
// are unavailable.
var ErrUnavailable = errors.New("requested entry at index is unavailable")
2016-04-08 08:14:56 +03:00
// ErrSnapshotTemporarilyUnavailable is returned by the Storage interface when the required
// snapshot is temporarily unavailable.
var ErrSnapshotTemporarilyUnavailable = errors.New("snapshot is temporarily unavailable")
// Storage is an interface that may be implemented by the application
// to retrieve log entries from storage.
//
// If any Storage method returns an error, the raft instance will
// become inoperable and refuse to participate in elections; the
// application is responsible for cleanup and recovery in this case.
type Storage interface {
raft/rafttest: introduce datadriven testing It has often been tedious to test the interactions between multi-member Raft groups, especially when many steps were required to reach a certain scenario. Often, this boilerplate was as boring as it is hard to write and hard to maintain, making it attractive to resort to shortcuts whenever possible, which in turn tended to undercut how meaningful and maintainable the tests ended up being - that is, if the tests were even written, which sometimes they weren't. This change introduces a datadriven framework specifically for testing deterministically the interaction between multiple members of a raft group with the goal of reducing the friction for writing these tests to near zero. In the near term, this will be used to add thorough testing for joint consensus (which is already available today, but wildly undertested), but just converting an existing test into this framework has shown that the concise representation and built-in inspection of log messages highlights unexpected behavior much more readily than the previous unit tests did (the test in question is `snapshot_succeed_via_app_resp`; the reader is invited to compare the old and new version of it). The main building block is `InteractionEnv`, which holds on to the state of the whole system and exposes various relevant methods for manipulating it, including but not limited to adding nodes, delivering and dropping messages, and proposing configuration changes. All of this is extensible so that in the future I hope to use it to explore the phenomena discussed in https://github.com/etcd-io/etcd/issues/7625#issuecomment-488798263 which requires injecting appropriate "crash points" in the Ready handling loop. Discussions of the "what if X happened in state Y" can quickly be made concrete by "scripting up an interaction test". Additionally, this framework is intentionally not kept internal to the raft package.. Though this is in its infancy, a goal is that it should be possible for a suite of interaction tests to allow applications to validate that their Storage implementation behaves accordingly, simply by running a raft-provided interaction suite against their Storage.
2019-08-12 12:13:51 +03:00
// TODO(tbg): split this into two interfaces, LogStorage and StateStorage.
// InitialState returns the saved HardState and ConfState information.
InitialState() (pb.HardState, pb.ConfState, error)
// Entries returns a slice of log entries in the range [lo,hi).
// MaxSize limits the total size of the log entries returned, but
// Entries returns at least one entry if any.
Entries(lo, hi, maxSize uint64) ([]pb.Entry, error)
// Term returns the term of entry i, which must be in the range
// [FirstIndex()-1, LastIndex()]. The term of the entry before
// FirstIndex is retained for matching purposes even though the
// rest of that entry may not be available.
Term(i uint64) (uint64, error)
// LastIndex returns the index of the last entry in the log.
LastIndex() (uint64, error)
// FirstIndex returns the index of the first log entry that is
// possibly available via Entries (older entries have been incorporated
// into the latest Snapshot; if storage only contains the dummy entry the
// first log entry is not available).
FirstIndex() (uint64, error)
// Snapshot returns the most recent snapshot.
2016-02-01 08:42:39 +03:00
// If snapshot is temporarily unavailable, it should return ErrSnapshotTemporarilyUnavailable,
// so raft state machine could know that Storage needs some time to prepare
// snapshot and call Snapshot later.
Snapshot() (pb.Snapshot, error)
}
// MemoryStorage implements the Storage interface backed by an
// in-memory array.
type MemoryStorage struct {
// Protects access to all fields. Most methods of MemoryStorage are
// run on the raft goroutine, but Append() is run on an application
// goroutine.
sync.Mutex
hardState pb.HardState
snapshot pb.Snapshot
// ents[i] has raft log position i+snapshot.Metadata.Index
ents []pb.Entry
}
// NewMemoryStorage creates an empty MemoryStorage.
func NewMemoryStorage() *MemoryStorage {
return &MemoryStorage{
// When starting from scratch populate the list with a dummy entry at term zero.
ents: make([]pb.Entry, 1),
}
}
// InitialState implements the Storage interface.
func (ms *MemoryStorage) InitialState() (pb.HardState, pb.ConfState, error) {
return ms.hardState, ms.snapshot.Metadata.ConfState, nil
}
// SetHardState saves the current HardState.
func (ms *MemoryStorage) SetHardState(st pb.HardState) error {
ms.Lock()
defer ms.Unlock()
ms.hardState = st
return nil
}
// Entries implements the Storage interface.
func (ms *MemoryStorage) Entries(lo, hi, maxSize uint64) ([]pb.Entry, error) {
ms.Lock()
defer ms.Unlock()
offset := ms.ents[0].Index
if lo <= offset {
return nil, ErrCompacted
}
2015-01-28 21:31:11 +03:00
if hi > ms.lastIndex()+1 {
2016-01-06 11:17:02 +03:00
raftLogger.Panicf("entries' hi(%d) is out of bound lastindex(%d)", hi, ms.lastIndex())
2015-01-28 21:31:11 +03:00
}
// only contains dummy entries.
if len(ms.ents) == 1 {
return nil, ErrUnavailable
}
ents := ms.ents[lo-offset : hi-offset]
return limitSize(ents, maxSize), nil
}
// Term implements the Storage interface.
func (ms *MemoryStorage) Term(i uint64) (uint64, error) {
ms.Lock()
defer ms.Unlock()
offset := ms.ents[0].Index
if i < offset {
return 0, ErrCompacted
}
if int(i-offset) >= len(ms.ents) {
return 0, ErrUnavailable
}
return ms.ents[i-offset].Term, nil
}
// LastIndex implements the Storage interface.
func (ms *MemoryStorage) LastIndex() (uint64, error) {
ms.Lock()
defer ms.Unlock()
2015-01-28 21:31:11 +03:00
return ms.lastIndex(), nil
}
func (ms *MemoryStorage) lastIndex() uint64 {
return ms.ents[0].Index + uint64(len(ms.ents)) - 1
}
// FirstIndex implements the Storage interface.
func (ms *MemoryStorage) FirstIndex() (uint64, error) {
ms.Lock()
defer ms.Unlock()
2015-01-28 21:31:11 +03:00
return ms.firstIndex(), nil
}
func (ms *MemoryStorage) firstIndex() uint64 {
return ms.ents[0].Index + 1
}
// Snapshot implements the Storage interface.
func (ms *MemoryStorage) Snapshot() (pb.Snapshot, error) {
ms.Lock()
defer ms.Unlock()
return ms.snapshot, nil
}
// ApplySnapshot overwrites the contents of this Storage object with
// those of the given snapshot.
func (ms *MemoryStorage) ApplySnapshot(snap pb.Snapshot) error {
ms.Lock()
defer ms.Unlock()
//handle check for old snapshot being applied
msIndex := ms.snapshot.Metadata.Index
snapIndex := snap.Metadata.Index
if msIndex >= snapIndex {
return ErrSnapOutOfDate
}
ms.snapshot = snap
ms.ents = []pb.Entry{{Term: snap.Metadata.Term, Index: snap.Metadata.Index}}
return nil
}
2016-02-21 16:05:03 +03:00
// CreateSnapshot makes a snapshot which can be retrieved with Snapshot() and
// can be used to reconstruct the state at that point.
// If any configuration changes have been made since the last compaction,
// the result of the last ApplyConfChange must be passed in.
func (ms *MemoryStorage) CreateSnapshot(i uint64, cs *pb.ConfState, data []byte) (pb.Snapshot, error) {
ms.Lock()
defer ms.Unlock()
if i <= ms.snapshot.Metadata.Index {
return pb.Snapshot{}, ErrSnapOutOfDate
}
offset := ms.ents[0].Index
2015-01-28 21:31:11 +03:00
if i > ms.lastIndex() {
2015-03-08 08:00:13 +03:00
raftLogger.Panicf("snapshot %d is out of bound lastindex(%d)", i, ms.lastIndex())
}
ms.snapshot.Metadata.Index = i
ms.snapshot.Metadata.Term = ms.ents[i-offset].Term
if cs != nil {
ms.snapshot.Metadata.ConfState = *cs
}
ms.snapshot.Data = data
return ms.snapshot, nil
}
// Compact discards all log entries prior to compactIndex.
// It is the application's responsibility to not attempt to compact an index
// greater than raftLog.applied.
func (ms *MemoryStorage) Compact(compactIndex uint64) error {
ms.Lock()
defer ms.Unlock()
offset := ms.ents[0].Index
if compactIndex <= offset {
return ErrCompacted
}
2015-01-28 21:31:11 +03:00
if compactIndex > ms.lastIndex() {
2015-03-08 08:00:13 +03:00
raftLogger.Panicf("compact %d is out of bound lastindex(%d)", compactIndex, ms.lastIndex())
}
i := compactIndex - offset
ents := make([]pb.Entry, 1, 1+uint64(len(ms.ents))-i)
ents[0].Index = ms.ents[i].Index
ents[0].Term = ms.ents[i].Term
ents = append(ents, ms.ents[i+1:]...)
ms.ents = ents
return nil
}
// Append the new entries to storage.
// TODO (xiangli): ensure the entries are continuous and
// entries[0].Index > ms.entries[0].Index
func (ms *MemoryStorage) Append(entries []pb.Entry) error {
2014-11-25 03:36:59 +03:00
if len(entries) == 0 {
return nil
2014-11-25 03:36:59 +03:00
}
ms.Lock()
defer ms.Unlock()
first := ms.firstIndex()
last := entries[0].Index + uint64(len(entries)) - 1
// shortcut if there is no new entry.
if last < first {
return nil
}
2015-01-28 21:31:11 +03:00
// truncate compacted entries
if first > entries[0].Index {
entries = entries[first-entries[0].Index:]
}
offset := entries[0].Index - ms.ents[0].Index
switch {
case uint64(len(ms.ents)) > offset:
ms.ents = append([]pb.Entry{}, ms.ents[:offset]...)
ms.ents = append(ms.ents, entries...)
case uint64(len(ms.ents)) == offset:
ms.ents = append(ms.ents, entries...)
default:
2015-03-08 08:00:13 +03:00
raftLogger.Panicf("missing log entry [last: %d, append at: %d]",
2015-01-28 21:31:11 +03:00
ms.lastIndex(), entries[0].Index)
2014-11-25 03:36:59 +03:00
}
return nil
}