From e2cd0a788827e78770140aaa4fe2edf374bda863 Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Thu, 19 Mar 2026 16:39:54 -0400
Subject: [PATCH] fix flake in test for restarted followers not causing an
 election

While working on #666, I encountered a test flake caused by a race condition in
`TestRaft_FollowerRemovalNoElection`, which is also obscuring the test
intent. After we restart the follower, typically the `c.ConnectFully` call will
complete quickly enough that the follower never enters a candidate state, so
we're not exercising the intended behavior. But if the `c.ConnectFully` call
takes too long, the follower can miss heartbeats and start an election. This is
expected behavior and its request for an election will be rejected. But in that
case, the follower will not have settled on the leader by the time we check it
and the test will fail when it shouldn't.

Add an extra sleep before we reconnect the cluster to force the restarted
follower to become a candidate so that we're exercising the intent of the test,
but then wait until the follower shows an event that it's got a leader before
continuing to the remaining assertions.
---
 raft_test.go | 29 ++++++++++++++++++++++++++---
 testing.go   |  3 +++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/raft_test.go b/raft_test.go
index 9cfbf6c2..f605b14d 100644
--- a/raft_test.go
+++ b/raft_test.go
@@ -3106,14 +3106,37 @@ func TestRaft_FollowerRemovalNoElection(t *testing.T) {
 	if err != nil {
 		t.Fatalf("error restarting follower: %v", err)
 	}
+	n.RegisterObserver(NewObserver(c.observationCh, false, nil))
+
 	c.rafts[i] = n
 	c.trans[i] = n.trans.(*InmemTransport)
 	c.fsms[i] = n.fsm.(*MockFSM)
+	time.Sleep(500 * time.Millisecond)
 	c.FullyConnect()
-	// There should be no re-election during this sleep
-	time.Sleep(250 * time.Millisecond)
 
-	// Let things settle and make sure we recovered.
+	// Wait for the restarted follower to settle.
+	ch := c.WaitEventChan(t.Context(), func(o *Observation) bool {
+		if o == nil {
+			return false
+		}
+		if o.Raft.localID == follower.localID {
+			if obs, ok := o.Data.(LeaderObservation); ok {
+				if obs.Leader != "" {
+					t.Logf("[INFO] restarted follower has rejoined cluster")
+					return true
+				}
+			}
+
+		}
+		return false
+	})
+	select {
+	case <-time.After(time.Second):
+		t.Fatal("restarted follower never got leader")
+	case <-ch:
+	}
+
+	// There should have been no re-election
 	c.EnsureLeader(t, leader.localAddr)
 	c.EnsureSame(t)
 	c.EnsureSamePeers(t)
diff --git a/testing.go b/testing.go
index 66bad0bd..582d10ff 100644
--- a/testing.go
+++ b/testing.go
@@ -587,6 +587,7 @@ func (c *cluster) IndexOf(r *Raft) int {
 // EnsureLeader checks that ALL the nodes think the leader is the given expected
 // leader.
 func (c *cluster) EnsureLeader(t *testing.T, expect ServerAddress) {
+	t.Helper()
 	// We assume c.Leader() has been called already; now check all the rafts
 	// think the leader is correct
 	fail := false
@@ -612,6 +613,7 @@ func (c *cluster) EnsureLeader(t *testing.T, expect ServerAddress) {
 
 // EnsureSame makes sure all the FSMs have the same contents.
 func (c *cluster) EnsureSame(t *testing.T) {
+	t.Helper()
 	limit := time.Now().Add(c.longstopTimeout)
 	first := getMockFSM(c.fsms[0])
 
@@ -690,6 +692,7 @@ func (c *cluster) getConfiguration(r *Raft) Configuration {
 
 // EnsureSamePeers makes sure all the rafts have the same set of peers.
 func (c *cluster) EnsureSamePeers(t *testing.T) {
+	t.Helper()
 	limit := time.Now().Add(c.longstopTimeout)
 	peerSet := c.getConfiguration(c.rafts[0])