Merge pull request #5553 from swingbach/master
raft: implemented read-only query when quorum check is onrelease-3.1
commit
5f1c763993
22
raft/node.go
22
raft/node.go
|
@ -60,6 +60,12 @@ type Ready struct {
|
|||
// HardState will be equal to empty state if there is no update.
|
||||
pb.HardState
|
||||
|
||||
// ReadState can be used for node to serve linearizable read requests locally
|
||||
// when its applied index is greater than the index in ReadState.
|
||||
// Note that the readState will be returned when raft receives msgReadIndex.
|
||||
// The returned is only valid for the request that requested to read.
|
||||
ReadState
|
||||
|
||||
// Entries specifies entries to be saved to stable storage BEFORE
|
||||
// Messages are sent.
|
||||
Entries []pb.Entry
|
||||
|
@ -96,7 +102,7 @@ func IsEmptySnap(sp pb.Snapshot) bool {
|
|||
func (rd Ready) containsUpdates() bool {
|
||||
return rd.SoftState != nil || !IsEmptyHardState(rd.HardState) ||
|
||||
!IsEmptySnap(rd.Snapshot) || len(rd.Entries) > 0 ||
|
||||
len(rd.CommittedEntries) > 0 || len(rd.Messages) > 0
|
||||
len(rd.CommittedEntries) > 0 || len(rd.Messages) > 0 || rd.Index != None
|
||||
}
|
||||
|
||||
// Node represents a node in a raft cluster.
|
||||
|
@ -361,7 +367,10 @@ func (n *node) run(r *raft) {
|
|||
if !IsEmptySnap(rd.Snapshot) {
|
||||
prevSnapi = rd.Snapshot.Metadata.Index
|
||||
}
|
||||
|
||||
r.msgs = nil
|
||||
r.readState.Index = None
|
||||
r.readState.RequestCtx = nil
|
||||
advancec = n.advancec
|
||||
case <-advancec:
|
||||
if prevHardSt.Commit != 0 {
|
||||
|
@ -478,6 +487,10 @@ func (n *node) ReportSnapshot(id uint64, status SnapshotStatus) {
|
|||
}
|
||||
}
|
||||
|
||||
func (n *node) ReadIndex(ctx context.Context, id uint64, rctx []byte) error {
|
||||
return n.step(ctx, pb.Message{Type: pb.MsgReadIndex, From: id, Entries: []pb.Entry{{Data: rctx}}})
|
||||
}
|
||||
|
||||
func newReady(r *raft, prevSoftSt *SoftState, prevHardSt pb.HardState) Ready {
|
||||
rd := Ready{
|
||||
Entries: r.raftLog.unstableEntries(),
|
||||
|
@ -493,5 +506,12 @@ func newReady(r *raft, prevSoftSt *SoftState, prevHardSt pb.HardState) Ready {
|
|||
if r.raftLog.unstable.snapshot != nil {
|
||||
rd.Snapshot = *r.raftLog.unstable.snapshot
|
||||
}
|
||||
if r.readState.Index != None {
|
||||
c := make([]byte, len(r.readState.RequestCtx))
|
||||
copy(c, r.readState.RequestCtx)
|
||||
|
||||
rd.Index = r.readState.Index
|
||||
rd.RequestCtx = c
|
||||
}
|
||||
return rd
|
||||
}
|
||||
|
|
|
@ -142,6 +142,58 @@ func TestNodePropose(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// TestNodeReadIndex ensures that node.ReadIndex sends the MsgReadIndex message to the underlying raft.
|
||||
// It also ensures that ReadState can be read out through ready chan.
|
||||
func TestNodeReadIndex(t *testing.T) {
|
||||
msgs := []raftpb.Message{}
|
||||
appendStep := func(r *raft, m raftpb.Message) {
|
||||
msgs = append(msgs, m)
|
||||
}
|
||||
wreadIndex := uint64(1)
|
||||
wrequestCtx := []byte("somedata")
|
||||
|
||||
n := newNode()
|
||||
s := NewMemoryStorage()
|
||||
r := newTestRaft(1, []uint64{1}, 10, 1, s)
|
||||
r.readState.Index = wreadIndex
|
||||
r.readState.RequestCtx = wrequestCtx
|
||||
go n.run(r)
|
||||
n.Campaign(context.TODO())
|
||||
for {
|
||||
rd := <-n.Ready()
|
||||
if rd.Index != wreadIndex {
|
||||
t.Errorf("ReadIndex = %d, want %d", rd.Index, wreadIndex)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(rd.RequestCtx, wrequestCtx) {
|
||||
t.Errorf("RequestCtx = %v, want %v", rd.RequestCtx, wrequestCtx)
|
||||
}
|
||||
|
||||
s.Append(rd.Entries)
|
||||
|
||||
if rd.SoftState.Lead == r.id {
|
||||
n.Advance()
|
||||
break
|
||||
}
|
||||
n.Advance()
|
||||
}
|
||||
|
||||
r.step = appendStep
|
||||
wrequestCtx = []byte("somedata2")
|
||||
n.ReadIndex(context.TODO(), r.id, wrequestCtx)
|
||||
n.Stop()
|
||||
|
||||
if len(msgs) != 1 {
|
||||
t.Fatalf("len(msgs) = %d, want %d", len(msgs), 1)
|
||||
}
|
||||
if msgs[0].Type != raftpb.MsgReadIndex {
|
||||
t.Errorf("msg type = %d, want %d", msgs[0].Type, raftpb.MsgReadIndex)
|
||||
}
|
||||
if !reflect.DeepEqual(msgs[0].Entries[0].Data, wrequestCtx) {
|
||||
t.Errorf("data = %v, want %v", msgs[0].Entries[0].Data, wrequestCtx)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNodeProposeConfig ensures that node.ProposeConfChange sends the given configuration proposal
|
||||
// to the underlying raft.
|
||||
func TestNodeProposeConfig(t *testing.T) {
|
||||
|
|
36
raft/raft.go
36
raft/raft.go
|
@ -133,12 +133,24 @@ func (c *Config) validate() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// ReadState provides state for read only query.
|
||||
// It's caller's responsibility to send MsgReadIndex first before getting
|
||||
// this state from ready, It's also caller's duty to differentiate if this
|
||||
// state is what it requests through RequestCtx, eg. given a unique id as
|
||||
// RequestCtx
|
||||
type ReadState struct {
|
||||
Index uint64
|
||||
RequestCtx []byte
|
||||
}
|
||||
|
||||
type raft struct {
|
||||
id uint64
|
||||
|
||||
Term uint64
|
||||
Vote uint64
|
||||
|
||||
readState ReadState
|
||||
|
||||
// the log
|
||||
raftLog *raftLog
|
||||
|
||||
|
@ -208,6 +220,7 @@ func newRaft(c *Config) *raft {
|
|||
r := &raft{
|
||||
id: c.ID,
|
||||
lead: None,
|
||||
readState: ReadState{Index: None, RequestCtx: nil},
|
||||
raftLog: raftlog,
|
||||
maxMsgSize: c.MaxSizePerMsg,
|
||||
maxInflight: c.MaxInflightMsgs,
|
||||
|
@ -642,6 +655,14 @@ func stepLeader(r *raft, m pb.Message) {
|
|||
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.Vote, m.From, m.LogTerm, m.Index, r.Term)
|
||||
r.send(pb.Message{To: m.From, Type: pb.MsgVoteResp, Reject: true})
|
||||
return
|
||||
case pb.MsgReadIndex:
|
||||
ri := None
|
||||
if r.checkQuorum {
|
||||
ri = r.raftLog.committed
|
||||
}
|
||||
|
||||
r.send(pb.Message{To: m.From, Type: pb.MsgReadIndexResp, Index: ri, Entries: m.Entries})
|
||||
return
|
||||
}
|
||||
|
||||
// All other message types require a progress for m.From (pr).
|
||||
|
@ -822,6 +843,21 @@ func stepFollower(r *raft, m pb.Message) {
|
|||
case pb.MsgTimeoutNow:
|
||||
r.logger.Infof("%x [term %d] received MsgTimeoutNow from %x and starts an election to get leadership.", r.id, r.Term, m.From)
|
||||
r.campaign()
|
||||
case pb.MsgReadIndex:
|
||||
if r.lead == None {
|
||||
r.logger.Infof("%x no leader at term %d; dropping index reading msg", r.id, r.Term)
|
||||
return
|
||||
}
|
||||
m.To = r.lead
|
||||
r.send(m)
|
||||
case pb.MsgReadIndexResp:
|
||||
if len(m.Entries) != 1 {
|
||||
r.logger.Errorf("%x invalid format of MsgReadIndexResp from %x, entries count: %d", r.id, m.From, len(m.Entries))
|
||||
return
|
||||
}
|
||||
|
||||
r.readState.Index = m.Index
|
||||
r.readState.RequestCtx = m.Entries[0].Data
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1405,6 +1405,75 @@ func TestNonPromotableVoterWithCheckQuorum(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestReadIndexWithCheckQuorum(t *testing.T) {
|
||||
a := newTestRaft(1, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
|
||||
b := newTestRaft(2, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
|
||||
c := newTestRaft(3, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
|
||||
|
||||
a.checkQuorum = true
|
||||
b.checkQuorum = true
|
||||
c.checkQuorum = true
|
||||
|
||||
nt := newNetwork(a, b, c)
|
||||
for i := 0; i < b.electionTimeout; i++ {
|
||||
b.tick()
|
||||
}
|
||||
nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgHup})
|
||||
|
||||
if a.state != StateLeader {
|
||||
t.Fatalf("state = %s, want %s", a.state, StateLeader)
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
sm *raft
|
||||
proposals int
|
||||
wri uint64
|
||||
wctx []byte
|
||||
}{
|
||||
{b, 10, 11, []byte("ctx1")},
|
||||
{c, 10, 21, []byte("ctx2")},
|
||||
{b, 10, 31, []byte("ctx3")},
|
||||
{c, 10, 41, []byte("ctx4")},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
for j := 0; j < tt.proposals; j++ {
|
||||
nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgProp, Entries: []pb.Entry{{}}})
|
||||
}
|
||||
|
||||
nt.send(pb.Message{From: tt.sm.id, To: tt.sm.id, Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: tt.wctx}}})
|
||||
|
||||
r := tt.sm
|
||||
if r.readState.Index != tt.wri {
|
||||
t.Errorf("readIndex = %d, want %d", r.readState.Index, tt.wri)
|
||||
}
|
||||
|
||||
if !bytes.Equal(r.readState.RequestCtx, tt.wctx) {
|
||||
t.Errorf("requestCtx = %v, want %v", r.readState.RequestCtx, tt.wctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadIndexWithoutCheckQuorum(t *testing.T) {
|
||||
a := newTestRaft(1, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
|
||||
b := newTestRaft(2, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
|
||||
c := newTestRaft(3, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
|
||||
|
||||
nt := newNetwork(a, b, c)
|
||||
nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgHup})
|
||||
|
||||
ctx := []byte("ctx1")
|
||||
nt.send(pb.Message{From: 2, To: 2, Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: ctx}}})
|
||||
|
||||
if b.readState.Index != None {
|
||||
t.Errorf("readIndex = %d, want %d", b.readState.Index, None)
|
||||
}
|
||||
|
||||
if !bytes.Equal(b.readState.RequestCtx, ctx) {
|
||||
t.Errorf("requestCtx = %v, want %v", b.readState.RequestCtx, ctx)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLeaderAppResp(t *testing.T) {
|
||||
// initial progress: match = 0; next = 3
|
||||
tests := []struct {
|
||||
|
|
|
@ -90,6 +90,8 @@ const (
|
|||
MsgCheckQuorum MessageType = 12
|
||||
MsgTransferLeader MessageType = 13
|
||||
MsgTimeoutNow MessageType = 14
|
||||
MsgReadIndex MessageType = 15
|
||||
MsgReadIndexResp MessageType = 16
|
||||
)
|
||||
|
||||
var MessageType_name = map[int32]string{
|
||||
|
@ -108,6 +110,8 @@ var MessageType_name = map[int32]string{
|
|||
12: "MsgCheckQuorum",
|
||||
13: "MsgTransferLeader",
|
||||
14: "MsgTimeoutNow",
|
||||
15: "MsgReadIndex",
|
||||
16: "MsgReadIndexResp",
|
||||
}
|
||||
var MessageType_value = map[string]int32{
|
||||
"MsgHup": 0,
|
||||
|
@ -125,6 +129,8 @@ var MessageType_value = map[string]int32{
|
|||
"MsgCheckQuorum": 12,
|
||||
"MsgTransferLeader": 13,
|
||||
"MsgTimeoutNow": 14,
|
||||
"MsgReadIndex": 15,
|
||||
"MsgReadIndexResp": 16,
|
||||
}
|
||||
|
||||
func (x MessageType) Enum() *MessageType {
|
||||
|
|
|
@ -48,6 +48,8 @@ enum MessageType {
|
|||
MsgCheckQuorum = 12;
|
||||
MsgTransferLeader = 13;
|
||||
MsgTimeoutNow = 14;
|
||||
MsgReadIndex = 15;
|
||||
MsgReadIndexResp = 16;
|
||||
}
|
||||
|
||||
message Message {
|
||||
|
|
Loading…
Reference in New Issue