diff --git a/gossip/discovery/discovery_impl.go b/gossip/discovery/discovery_impl.go index 9707f259c20..69503eb707a 100644 --- a/gossip/discovery/discovery_impl.go +++ b/gossip/discovery/discovery_impl.go @@ -505,7 +505,11 @@ func (d *gossipDiscoveryImpl) handleAliveMessage(m *protoext.SignedGossipMessage d.lock.RLock() _, known := d.id2Member[string(pkiID)] +<<<<<<< fix-gossip-discovery-toctou-race + lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)] +======= _, isAlive := d.aliveLastTS[string(pkiID)] +>>>>>>> main lastDeadTS, isDead := d.deadLastTS[string(pkiID)] d.lock.RUnlock() @@ -535,10 +539,6 @@ func (d *gossipDiscoveryImpl) handleAliveMessage(m *protoext.SignedGossipMessage return } - d.lock.RLock() - lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)] - d.lock.RUnlock() - if isAlive { if before(lastAliveTS, ts) { d.learnExistingMembers([]*protoext.SignedGossipMessage{m}) @@ -843,6 +843,10 @@ func (d *gossipDiscoveryImpl) learnExistingMembers(aliveArr []*protoext.SignedGo // update member's data member := d.id2Member[string(am.Membership.PkiId)] + if member == nil { + d.logger.Debugf("Member with PkiId %x was purged during alive message processing, skipping update", am.Membership.PkiId) + continue + } member.Endpoint = am.Membership.Endpoint member.Metadata = am.Membership.Metadata member.InternalEndpoint = internalEndpoint diff --git a/gossip/discovery/discovery_test.go b/gossip/discovery/discovery_test.go index 01b80fc4fcf..8e679ac8d93 100644 --- a/gossip/discovery/discovery_test.go +++ b/gossip/discovery/discovery_test.go @@ -1928,6 +1928,31 @@ func portOfEndpoint(endpoint string) int { return int(port) } + +func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) { + inst := createDiscoveryInstanceWithNoGossip(33900, "d0", nil) + defer inst.Stop() + + d := inst.discoveryImpl() + + memberPKIid := common.PKIidType("purged-member-pkiid") + memberEndpoint := "localhost:9999" + + // Simulate the TOCTOU race fixed in PR #5397: + // handleAliveMessage sees isAlive=true under one lock, then a concurrent + // purge() removes the member from all maps before learnExistingMembers + // acquires the write lock. The result is a member present in aliveLastTS + // at decision time but absent from id2Member when learnExistingMembers runs. + d.lock.Lock() + d.aliveLastTS[string(memberPKIid)] = ×tamp{ + incTime: time.Now(), + seqNum: 1, + lastSeen: time.Now(), + } + // Intentionally NOT adding to id2Member to reproduce the nil dereference: + // before the fix, learnExistingMembers accessed member.Endpoint without + // a nil guard, causing a panic. + d.lock.Unlock() func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) { inst := createDiscoveryInstanceWithNoGossip(33700, "d0", []string{}) defer inst.Stop() @@ -1954,6 +1979,20 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) { }, Timestamp: &proto.PeerTime{ IncNum: uint64(time.Now().UnixNano()), + SeqNum: 2, + }, + }, + }, + } + signedMsg, err := protoext.NoopSign(aliveMsg) + require.NoError(t, err) + + // Before the fix: panics with nil pointer dereference on member.Endpoint + // because id2Member[memberPKIid] is nil (member was concurrently purged). + // After the fix: nil member detected, update skipped, no panic. + require.NotPanics(t, func() { + d.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg}) + }) SeqNum: 1, }, },