From 42230e5e15d4a6bc859e0460fd956dda6a8686a0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 7 Jun 2024 16:23:55 +0200 Subject: [PATCH] raft: only read after committing an entry from the current term Otherwise, a newly elected leader that's behind on commit/apply may serve stale reads. --- src/raft/node.rs | 22 ++++++++++---- .../request_leader_change_linearizability | 30 ++++++++++++------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/raft/node.rs b/src/raft/node.rs index 5fbae7f9..5b70df1f 100644 --- a/src/raft/node.rs +++ b/src/raft/node.rs @@ -865,10 +865,6 @@ impl RawNode { } // A follower received our heartbeat and confirms our leadership. - // - // TODO: this needs to commit and apply entries following a leader - // change too, otherwise we can serve stale reads if the new leader - // hadn't fully committed and applied entries yet. Message::HeartbeatResponse { match_index, read_seq } => { let (last_index, _) = self.log.get_last_index(); assert!(match_index <= last_index, "future match index"); @@ -1072,7 +1068,7 @@ impl RawNode { // If the commit index doesn't advance, do nothing. We don't assert on // this, since the quorum value may regress e.g. following a restart or // leader change where followers are initialized with log index 0. - let mut commit_index = self.log.get_commit_index().0; + let (mut commit_index, old_commit_term) = self.log.get_commit_index(); if quorum_index <= commit_index { return Ok(commit_index); } @@ -1105,6 +1101,12 @@ impl RawNode { Ok(()) })?; + // If the commit term changed, there may be pending reads waiting for us + // to commit an entry from our own term. Execute them. + if old_commit_term != self.term { + self.maybe_read()?; + } + Ok(commit_index) } @@ -1115,6 +1117,16 @@ impl RawNode { return Ok(()); } + // It's only safe to read if we've committed and applied an entry from + // our own term (the leader appends an entry when elected). Otherwise we + // may be behind on application and serve stale reads, violating + // linearizability. + let (commit_index, commit_term) = self.log.get_commit_index(); + let applied_index = self.state.get_applied_index(); + if commit_term < self.term || applied_index < commit_index { + return Ok(()); + } + // Determine the maximum read sequence confirmed by quorum. let read_seq = self.quorum_value( self.role diff --git a/src/raft/testscripts/node/request_leader_change_linearizability b/src/raft/testscripts/node/request_leader_change_linearizability index fff1fd20..75a48aa7 100644 --- a/src/raft/testscripts/node/request_leader_change_linearizability +++ b/src/raft/testscripts/node/request_leader_change_linearizability @@ -24,7 +24,7 @@ n1@1 leader last=3@1 commit=3@1 apply=3 progress={2:3→4 3:3→4} n2@1 follower(n1) last=3@1 commit=2@1 apply=2 n3@1 follower(n1) last=3@1 commit=2@1 apply=2 -# n2 campaigns and wins. +# n2 now campaigns and wins, while being behind on commit/apply. campaign 2 deliver --- @@ -36,8 +36,8 @@ n1@2 → n2 CampaignResponse vote=true n3@1 follower(n1) ⇨ n3@2 follower() n3@2 → n2 CampaignResponse vote=true -# However, the initial append/heartbeat messages don't make it to the followers, -# so its commit index trails the previous leader. +# The initial append doesn't make it to the followers, so its commit index +# trails the previous leader. partition 2 deliver 2 --- @@ -57,12 +57,11 @@ n1@2 follower() last=3@1 commit=3@1 apply=3 n2@2 leader last=4@2 commit=2@1 apply=2 progress={1:0→5 3:0→5} n3@2 follower() last=3@1 commit=2@1 apply=2 -# Reading from n2 should not result in a stale read. -# -# TODO: this currently results in a stale read, since n2 does not wait to commit -# and apply an entry from its own term before serving reads. +# Reading from n2 should not result in a stale read even if followers +# confirm the read sequence. get 2 b -stabilize +deliver +deliver --- c2@2 → n2 ClientRequest id=0x03 read 0x000162 n2@2 → n1 Heartbeat last_index=4 commit_index=2 read_seq=1 @@ -71,14 +70,23 @@ n1@2 follower() ⇨ n1@2 follower(n2) n1@2 → n2 HeartbeatResponse match_index=0 read_seq=1 n3@2 follower() ⇨ n3@2 follower(n2) n3@2 → n2 HeartbeatResponse match_index=0 read_seq=1 -n2@2 → c2 ClientResponse id=0x03 read 0x0000 -c2@2 get b ⇒ None n2@2 → n1 Append base=3@1 [] n2@2 → n3 Append base=3@1 [] + +# However, the heartbeat resulted in probes for the dropped appends. +# The leader resends them. +deliver +deliver +--- n1@2 → n2 AppendResponse match_index=3 n3@2 → n2 AppendResponse match_index=3 n2@2 → n1 Append base=3@1 [4@2] n2@2 → n3 Append base=3@1 [4@2] + +# Once the followers acknowledge the appends the leader commits them. The read +# can now be served, resulting in an up-to-date b=2. +stabilize +--- n1@2 append 4@2 None n1@2 → n2 AppendResponse match_index=4 n3@2 append 4@2 None @@ -86,3 +94,5 @@ n3@2 → n2 AppendResponse match_index=4 n2@2 commit 4@2 n2@2 apply 3@1 put b=2 n2@2 apply 4@2 None +n2@2 → c2 ClientResponse id=0x03 read 0x00010132 +c2@2 get b ⇒ 2