diff --git a/raft/raft.go b/raft/raft.go index 8f570a75a..90eedfe44 100644 --- a/raft/raft.go +++ b/raft/raft.go @@ -554,35 +554,34 @@ func (r *raft) bcastHeartbeatWithCtx(ctx []byte) { } func (r *raft) advance(rd Ready) { + r.reduceUncommittedSize(rd.CommittedEntries) + // If entries were applied (or a snapshot), update our cursor for // the next Ready. Note that if the current HardState contains a // new Commit index, this does not mean that we're also applying // all of the new entries due to commit pagination by size. - if index := rd.appliedCursor(); index > 0 { - r.raftLog.appliedTo(index) - if r.prs.Config.AutoLeave && index >= r.pendingConfIndex && r.state == StateLeader { + if newApplied := rd.appliedCursor(); newApplied > 0 { + oldApplied := r.raftLog.applied + r.raftLog.appliedTo(newApplied) + + if r.prs.Config.AutoLeave && oldApplied < r.pendingConfIndex && newApplied >= r.pendingConfIndex && r.state == StateLeader { // If the current (and most recent, at least for this leader's term) - // configuration should be auto-left, initiate that now. - ccdata, err := (&pb.ConfChangeV2{}).Marshal() - if err != nil { - panic(err) - } + // configuration should be auto-left, initiate that now. We use a + // nil Data which unmarshals into an empty ConfChangeV2 and has the + // benefit that appendEntry can never refuse it based on its size + // (which registers as zero). ent := pb.Entry{ Type: pb.EntryConfChangeV2, - Data: ccdata, + Data: nil, } + // There's no way in which this proposal should be able to be rejected. if !r.appendEntry(ent) { - // If we could not append the entry, bump the pending conf index - // so that we'll try again later. - // - // TODO(tbg): test this case. - r.pendingConfIndex = r.raftLog.lastIndex() - } else { - r.logger.Infof("initiating automatic transition out of joint configuration %s", r.prs.Config) + panic("refused un-refusable auto-leaving ConfChangeV2") } + r.pendingConfIndex = r.raftLog.lastIndex() + r.logger.Infof("initiating automatic transition out of joint configuration %s", r.prs.Config) } } - r.reduceUncommittedSize(rd.CommittedEntries) if len(rd.Entries) > 0 { e := rd.Entries[len(rd.Entries)-1] @@ -1607,16 +1606,21 @@ func (r *raft) abortLeaderTransfer() { // If the new entries would exceed the limit, the method returns false. If not, // the increase in uncommitted entry size is recorded and the method returns // true. +// Configuration changes are never refused. func (r *raft) increaseUncommittedSize(ents []pb.Entry) bool { var s uint64 for _, e := range ents { s += uint64(PayloadSize(e)) } - if r.uncommittedSize > 0 && r.uncommittedSize+s > r.maxUncommittedSize { + if r.uncommittedSize > 0 && s > 0 && r.uncommittedSize+s > r.maxUncommittedSize { // If the uncommitted tail of the Raft log is empty, allow any size // proposal. Otherwise, limit the size of the uncommitted tail of the // log and drop any proposal that would push the size over the limit. + // Note the added requirement s>0 which is used to make sure that + // appending single empty entries to the log always succeeds, used both + // for replicating a new leader's initial empty entry, and for + // auto-leaving joint configurations. return false } r.uncommittedSize += s diff --git a/raft/raft_test.go b/raft/raft_test.go index 87ceb99ca..9432bd0fc 100644 --- a/raft/raft_test.go +++ b/raft/raft_test.go @@ -186,6 +186,10 @@ func TestUncommittedEntryLimit(t *testing.T) { testEntry := pb.Entry{Data: []byte("testdata")} maxEntrySize := maxEntries * PayloadSize(testEntry) + if n := PayloadSize(pb.Entry{Data: nil}); n != 0 { + t.Fatal("entry with no Data must have zero payload size") + } + cfg := newTestConfig(1, []uint64{1, 2, 3}, 5, 1, NewMemoryStorage()) cfg.MaxUncommittedEntriesSize = uint64(maxEntrySize) cfg.MaxInflightMsgs = 2 * 1024 // avoid interference @@ -244,10 +248,19 @@ func TestUncommittedEntryLimit(t *testing.T) { t.Fatalf("proposal not dropped: %v", err) } + // But we can always append an entry with no Data. This is used both for the + // leader's first empty entry and for auto-transitioning out of joint config + // states. + if err := r.Step( + pb.Message{From: 1, To: 1, Type: pb.MsgProp, Entries: []pb.Entry{{}}}, + ); err != nil { + t.Fatal(err) + } + // Read messages and reduce the uncommitted size as if we had committed // these entries. ms = r.readMessages() - if e := 1 * numFollowers; len(ms) != e { + if e := 2 * numFollowers; len(ms) != e { t.Fatalf("expected %d messages, got %d", e, len(ms)) } r.reduceUncommittedSize(propEnts) diff --git a/raft/raftpb/confchange.go b/raft/raftpb/confchange.go index 46a7a7021..47fae65df 100644 --- a/raft/raftpb/confchange.go +++ b/raft/raftpb/confchange.go @@ -73,7 +73,7 @@ func (c ConfChangeV2) AsV1() (ConfChange, bool) { return ConfChange{}, false } // than one change or if the use of Joint Consensus was requested explicitly. // The first bool can only be true if second one is, and indicates whether the // Joint State will be left automatically. -func (c *ConfChangeV2) EnterJoint() (autoLeave bool, ok bool) { +func (c ConfChangeV2) EnterJoint() (autoLeave bool, ok bool) { // NB: in theory, more config changes could qualify for the "simple" // protocol but it depends on the config on top of which the changes apply. // For example, adding two learners is not OK if both nodes are part of the @@ -100,10 +100,10 @@ func (c *ConfChangeV2) EnterJoint() (autoLeave bool, ok bool) { // LeaveJoint is true if the configuration change leaves a joint configuration. // This is the case if the ConfChangeV2 is zero, with the possible exception of // the Context field. -func (c *ConfChangeV2) LeaveJoint() bool { - cpy := *c - cpy.Context = nil - return proto.Equal(&cpy, &ConfChangeV2{}) +func (c ConfChangeV2) LeaveJoint() bool { + // NB: c is already a copy. + c.Context = nil + return proto.Equal(&c, &ConfChangeV2{}) } // ConfChangesFromString parses a Space-delimited sequence of operations into a diff --git a/raft/rawnode_test.go b/raft/rawnode_test.go index 2651aff2a..15e3090f4 100644 --- a/raft/rawnode_test.go +++ b/raft/rawnode_test.go @@ -200,11 +200,12 @@ func TestRawNodeProposeAndConfChange(t *testing.T) { }, // Ditto implicit. { - pb.ConfChangeV2{Changes: []pb.ConfChangeSingle{ - {NodeID: 2, Type: pb.ConfChangeAddNode}, - {NodeID: 1, Type: pb.ConfChangeAddLearnerNode}, - {NodeID: 3, Type: pb.ConfChangeAddLearnerNode}, - }, + pb.ConfChangeV2{ + Changes: []pb.ConfChangeSingle{ + {NodeID: 2, Type: pb.ConfChangeAddNode}, + {NodeID: 1, Type: pb.ConfChangeAddLearnerNode}, + {NodeID: 3, Type: pb.ConfChangeAddLearnerNode}, + }, Transition: pb.ConfChangeTransitionJointImplicit, }, pb.ConfState{ @@ -282,7 +283,9 @@ func TestRawNodeProposeAndConfChange(t *testing.T) { } // Check that the last index is exactly the conf change we put in, - // down to the bits. + // down to the bits. Note that this comes from the Storage, which + // will not reflect any unstable entries that we'll only be presented + // with in the next Ready. lastIndex, err = s.LastIndex() if err != nil { t.Fatal(err) @@ -313,7 +316,17 @@ func TestRawNodeProposeAndConfChange(t *testing.T) { t.Fatalf("exp:\n%+v\nact:\n%+v", exp, cs) } - if exp, act := lastIndex, rawNode.raft.pendingConfIndex; exp != act { + var maybePlusOne uint64 + if autoLeave, ok := tc.cc.AsV2().EnterJoint(); ok && autoLeave { + // If this is an auto-leaving joint conf change, it will have + // appended the entry that auto-leaves, so add one to the last + // index that forms the basis of our expectations on + // pendingConfIndex. (Recall that lastIndex was taken from stable + // storage, but this auto-leaving entry isn't on stable storage + // yet). + maybePlusOne = 1 + } + if exp, act := lastIndex+maybePlusOne, rawNode.raft.pendingConfIndex; exp != act { t.Fatalf("pendingConfIndex: expected %d, got %d", exp, act) } diff --git a/raft/testdata/confchange_v2_add_double_auto.txt b/raft/testdata/confchange_v2_add_double_auto.txt index f335d38b3..9583102ed 100644 --- a/raft/testdata/confchange_v2_add_double_auto.txt +++ b/raft/testdata/confchange_v2_add_double_auto.txt @@ -195,3 +195,213 @@ stabilize 1 3 stabilize ---- ok + +# Now remove two nodes. What's new here is that the leader will actually have +# to go to a quorum to commit the transition into the joint config. + +propose-conf-change 1 +r2 r3 +---- +ok + +# n1 sends out MsgApps. +stabilize 1 +---- +> 1 handling Ready + Ready MustSync=true: + Entries: + 1/6 EntryConfChangeV2 r2 r3 + Messages: + 1->2 MsgApp Term:1 Log:1/5 Commit:5 Entries:[1/6 EntryConfChangeV2 r2 r3] + 1->3 MsgApp Term:1 Log:1/5 Commit:5 Entries:[1/6 EntryConfChangeV2 r2 r3] + +# n2, n3 ack them. +stabilize 2 3 +---- +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/5 Commit:5 Entries:[1/6 EntryConfChangeV2 r2 r3] +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/5 Commit:5 Entries:[1/6 EntryConfChangeV2 r2 r3] +> 2 handling Ready + Ready MustSync=true: + Entries: + 1/6 EntryConfChangeV2 r2 r3 + Messages: + 2->1 MsgAppResp Term:1 Log:0/6 +> 3 handling Ready + Ready MustSync=true: + Entries: + 1/6 EntryConfChangeV2 r2 r3 + Messages: + 3->1 MsgAppResp Term:1 Log:0/6 + +# n1 gets some more proposals. This is part of a regression test: There used to +# be a bug in which these proposals would prompt the leader to transition out of +# the same joint state multiple times, which would cause a panic. +propose 1 foo +---- +ok + +propose 1 bar +---- +ok + +# n1 switches to the joint config, then initiates a transition into the final +# config. +stabilize 1 +---- +> 1 handling Ready + Ready MustSync=true: + Entries: + 1/7 EntryNormal "foo" + 1/8 EntryNormal "bar" + Messages: + 1->2 MsgApp Term:1 Log:1/6 Commit:5 Entries:[1/7 EntryNormal "foo"] + 1->3 MsgApp Term:1 Log:1/6 Commit:5 Entries:[1/7 EntryNormal "foo"] + 1->2 MsgApp Term:1 Log:1/7 Commit:5 Entries:[1/8 EntryNormal "bar"] + 1->3 MsgApp Term:1 Log:1/7 Commit:5 Entries:[1/8 EntryNormal "bar"] +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/6 + 3->1 MsgAppResp Term:1 Log:0/6 +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:6 + CommittedEntries: + 1/6 EntryConfChangeV2 r2 r3 + Messages: + 1->2 MsgApp Term:1 Log:1/8 Commit:6 + 1->3 MsgApp Term:1 Log:1/8 Commit:6 + INFO 1 switched to configuration voters=(1)&&(1 2 3) autoleave + INFO initiating automatic transition out of joint configuration voters=(1)&&(1 2 3) autoleave +> 1 handling Ready + Ready MustSync=true: + Entries: + 1/9 EntryConfChangeV2 + +# n2 and n3 also switch to the joint config, and ack the transition out of it. +stabilize 2 3 +---- +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/6 Commit:5 Entries:[1/7 EntryNormal "foo"] + 1->2 MsgApp Term:1 Log:1/7 Commit:5 Entries:[1/8 EntryNormal "bar"] + 1->2 MsgApp Term:1 Log:1/8 Commit:6 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/6 Commit:5 Entries:[1/7 EntryNormal "foo"] + 1->3 MsgApp Term:1 Log:1/7 Commit:5 Entries:[1/8 EntryNormal "bar"] + 1->3 MsgApp Term:1 Log:1/8 Commit:6 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:6 + Entries: + 1/7 EntryNormal "foo" + 1/8 EntryNormal "bar" + CommittedEntries: + 1/6 EntryConfChangeV2 r2 r3 + Messages: + 2->1 MsgAppResp Term:1 Log:0/7 + 2->1 MsgAppResp Term:1 Log:0/8 + 2->1 MsgAppResp Term:1 Log:0/8 + INFO 2 switched to configuration voters=(1)&&(1 2 3) autoleave +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:6 + Entries: + 1/7 EntryNormal "foo" + 1/8 EntryNormal "bar" + CommittedEntries: + 1/6 EntryConfChangeV2 r2 r3 + Messages: + 3->1 MsgAppResp Term:1 Log:0/7 + 3->1 MsgAppResp Term:1 Log:0/8 + 3->1 MsgAppResp Term:1 Log:0/8 + INFO 3 switched to configuration voters=(1)&&(1 2 3) autoleave + +# n2 and n3 also leave the joint config and the dust settles. We see at the very +# end that n1 receives some messages from them that it refuses because it does +# not have them in its config any more. +stabilize +---- +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/7 + 2->1 MsgAppResp Term:1 Log:0/8 + 2->1 MsgAppResp Term:1 Log:0/8 + 3->1 MsgAppResp Term:1 Log:0/7 + 3->1 MsgAppResp Term:1 Log:0/8 + 3->1 MsgAppResp Term:1 Log:0/8 +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:8 + CommittedEntries: + 1/7 EntryNormal "foo" + 1/8 EntryNormal "bar" + Messages: + 1->2 MsgApp Term:1 Log:1/8 Commit:7 Entries:[1/9 EntryConfChangeV2] + 1->3 MsgApp Term:1 Log:1/8 Commit:7 Entries:[1/9 EntryConfChangeV2] + 1->2 MsgApp Term:1 Log:1/9 Commit:8 + 1->3 MsgApp Term:1 Log:1/9 Commit:8 +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/8 Commit:7 Entries:[1/9 EntryConfChangeV2] + 1->2 MsgApp Term:1 Log:1/9 Commit:8 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/8 Commit:7 Entries:[1/9 EntryConfChangeV2] + 1->3 MsgApp Term:1 Log:1/9 Commit:8 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:8 + Entries: + 1/9 EntryConfChangeV2 + CommittedEntries: + 1/7 EntryNormal "foo" + 1/8 EntryNormal "bar" + Messages: + 2->1 MsgAppResp Term:1 Log:0/9 + 2->1 MsgAppResp Term:1 Log:0/9 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:8 + Entries: + 1/9 EntryConfChangeV2 + CommittedEntries: + 1/7 EntryNormal "foo" + 1/8 EntryNormal "bar" + Messages: + 3->1 MsgAppResp Term:1 Log:0/9 + 3->1 MsgAppResp Term:1 Log:0/9 +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/9 + 2->1 MsgAppResp Term:1 Log:0/9 + 3->1 MsgAppResp Term:1 Log:0/9 + 3->1 MsgAppResp Term:1 Log:0/9 +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:9 + CommittedEntries: + 1/9 EntryConfChangeV2 + Messages: + 1->2 MsgApp Term:1 Log:1/9 Commit:9 + 1->3 MsgApp Term:1 Log:1/9 Commit:9 + INFO 1 switched to configuration voters=(1) +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/9 Commit:9 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/9 Commit:9 +> 2 handling Ready + Ready MustSync=false: + HardState Term:1 Commit:9 + CommittedEntries: + 1/9 EntryConfChangeV2 + Messages: + 2->1 MsgAppResp Term:1 Log:0/9 + INFO 2 switched to configuration voters=(1) +> 3 handling Ready + Ready MustSync=false: + HardState Term:1 Commit:9 + CommittedEntries: + 1/9 EntryConfChangeV2 + Messages: + 3->1 MsgAppResp Term:1 Log:0/9 + INFO 3 switched to configuration voters=(1) +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/9 + raft: cannot step as peer not found3->1 MsgAppResp Term:1 Log:0/9 + raft: cannot step as peer not found