From e2cd0a788827e78770140aaa4fe2edf374bda863 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 19 Mar 2026 16:39:54 -0400 Subject: [PATCH] fix flake in test for restarted followers not causing an election While working on #666, I encountered a test flake caused by a race condition in `TestRaft_FollowerRemovalNoElection`, which is also obscuring the test intent. After we restart the follower, typically the `c.ConnectFully` call will complete quickly enough that the follower never enters a candidate state, so we're not exercising the intended behavior. But if the `c.ConnectFully` call takes too long, the follower can miss heartbeats and start an election. This is expected behavior and its request for an election will be rejected. But in that case, the follower will not have settled on the leader by the time we check it and the test will fail when it shouldn't. Add an extra sleep before we reconnect the cluster to force the restarted follower to become a candidate so that we're exercising the intent of the test, but then wait until the follower shows an event that it's got a leader before continuing to the remaining assertions. --- raft_test.go | 29 ++++++++++++++++++++++++++--- testing.go | 3 +++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/raft_test.go b/raft_test.go index 9cfbf6c2..f605b14d 100644 --- a/raft_test.go +++ b/raft_test.go @@ -3106,14 +3106,37 @@ func TestRaft_FollowerRemovalNoElection(t *testing.T) { if err != nil { t.Fatalf("error restarting follower: %v", err) } + n.RegisterObserver(NewObserver(c.observationCh, false, nil)) + c.rafts[i] = n c.trans[i] = n.trans.(*InmemTransport) c.fsms[i] = n.fsm.(*MockFSM) + time.Sleep(500 * time.Millisecond) c.FullyConnect() - // There should be no re-election during this sleep - time.Sleep(250 * time.Millisecond) - // Let things settle and make sure we recovered. + // Wait for the restarted follower to settle. + ch := c.WaitEventChan(t.Context(), func(o *Observation) bool { + if o == nil { + return false + } + if o.Raft.localID == follower.localID { + if obs, ok := o.Data.(LeaderObservation); ok { + if obs.Leader != "" { + t.Logf("[INFO] restarted follower has rejoined cluster") + return true + } + } + + } + return false + }) + select { + case <-time.After(time.Second): + t.Fatal("restarted follower never got leader") + case <-ch: + } + + // There should have been no re-election c.EnsureLeader(t, leader.localAddr) c.EnsureSame(t) c.EnsureSamePeers(t) diff --git a/testing.go b/testing.go index 66bad0bd..582d10ff 100644 --- a/testing.go +++ b/testing.go @@ -587,6 +587,7 @@ func (c *cluster) IndexOf(r *Raft) int { // EnsureLeader checks that ALL the nodes think the leader is the given expected // leader. func (c *cluster) EnsureLeader(t *testing.T, expect ServerAddress) { + t.Helper() // We assume c.Leader() has been called already; now check all the rafts // think the leader is correct fail := false @@ -612,6 +613,7 @@ func (c *cluster) EnsureLeader(t *testing.T, expect ServerAddress) { // EnsureSame makes sure all the FSMs have the same contents. func (c *cluster) EnsureSame(t *testing.T) { + t.Helper() limit := time.Now().Add(c.longstopTimeout) first := getMockFSM(c.fsms[0]) @@ -690,6 +692,7 @@ func (c *cluster) getConfiguration(r *Raft) Configuration { // EnsureSamePeers makes sure all the rafts have the same set of peers. func (c *cluster) EnsureSamePeers(t *testing.T) { + t.Helper() limit := time.Now().Add(c.longstopTimeout) peerSet := c.getConfiguration(c.rafts[0])