diff --git a/common/deliverclient/blocksprovider/bft_deliverer.go b/common/deliverclient/blocksprovider/bft_deliverer.go index 79c900ecce7..7ba6d2e4806 100644 --- a/common/deliverclient/blocksprovider/bft_deliverer.go +++ b/common/deliverclient/blocksprovider/bft_deliverer.go @@ -323,7 +323,9 @@ func (d *BFTDeliverer) Stop() { d.stopFlag = true close(d.DoneC) - d.blockReceiver.Stop() + if d.blockReceiver != nil { + d.blockReceiver.Stop() + } } func (d *BFTDeliverer) FetchBlocks(source *orderers.Endpoint) { diff --git a/common/deliverclient/blocksprovider/deliverer.go b/common/deliverclient/blocksprovider/deliverer.go index 8a9d30a44c1..bfe90e56232 100644 --- a/common/deliverclient/blocksprovider/deliverer.go +++ b/common/deliverclient/blocksprovider/deliverer.go @@ -244,7 +244,9 @@ func (d *Deliverer) Stop() { d.stopFlag = true close(d.DoneC) - d.blockReceiver.Stop() + if d.blockReceiver != nil { + d.blockReceiver.Stop() + } d.Logger.Info("Deliverer stopped") } diff --git a/common/deliverclient/blocksprovider/stop_before_connect_test.go b/common/deliverclient/blocksprovider/stop_before_connect_test.go new file mode 100644 index 00000000000..becdc5ca64f --- /dev/null +++ b/common/deliverclient/blocksprovider/stop_before_connect_test.go @@ -0,0 +1,51 @@ +/* +Copyright IBM Corp. All Rights Reserved. + +SPDX-License-Identifier: Apache-2.0 +*/ + +package blocksprovider_test + +import ( + "testing" + + "github.com/hyperledger/fabric-lib-go/common/flogging" + "github.com/hyperledger/fabric/common/deliverclient/blocksprovider" + "github.com/stretchr/testify/require" +) + +// TestDelivererStopBeforeConnect proves that calling Stop() before DeliverBlocks() +// has established a connection to an orderer (and therefore before blockReceiver is +// initialised) does not panic. +// +// Prior to the fix, d.blockReceiver was nil in this path and Stop() dereferenced it +// unconditionally, causing a nil-pointer panic. This scenario is reached in +// production whenever a peer renounces gossip leadership +// (gossip/service/gossip_service.go calls StopDeliverForChannel) while the orderer +// is unreachable, which is common during network partitions or orderer maintenance. +func TestDelivererStopBeforeConnect(t *testing.T) { + d := &blocksprovider.Deliverer{ + ChannelID: "test-channel", + DoneC: make(chan struct{}), + Logger: flogging.MustGetLogger("blocksprovider.test"), + } + + require.NotPanics(t, func() { + d.Stop() + }) +} + +// TestBFTDelivererStopBeforeConnect is the BFT-path equivalent of the same bug. +// BFTDeliverer.Stop() contained the identical unconditional d.blockReceiver.Stop() +// call at bft_deliverer.go without a nil guard. +func TestBFTDelivererStopBeforeConnect(t *testing.T) { + d := &blocksprovider.BFTDeliverer{ + ChannelID: "test-channel", + DoneC: make(chan struct{}), + Logger: flogging.MustGetLogger("BFTDeliverer.test"), + } + + require.NotPanics(t, func() { + d.Stop() + }) +} diff --git a/diff.txt b/diff.txt new file mode 100644 index 00000000000..bb543057cef --- /dev/null +++ b/diff.txt @@ -0,0 +1,60 @@ +diff --git a/gossip/discovery/discovery_test.go b/gossip/discovery/discovery_test.go +index 01b80fc4f..d200cc83b 100644 +--- a/gossip/discovery/discovery_test.go ++++ b/gossip/discovery/discovery_test.go +@@ -1975,3 +1975,55 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) { + require.True(t, inID2Member, "member should be present in id2Member after re-learning") + require.True(t, inAliveLastTS, "member should be present in aliveLastTS after re-learning") + } ++ ++func TestLearnExistingMembersNilMemberPanic(t *testing.T) { ++ // 1) Initialize a discovery instance (use existing helpers like createDiscoveryInstanceWithNoGossip). ++ inst := createDiscoveryInstanceWithNoGossip(10000, "testInst", nil) ++ defer inst.Stop() ++ ++ // Access the underlying implementation ++ discImpl := inst.discoveryImpl() ++ ++ // 2) Prepare a PKIid and endpoint. ++ pkiID := common.PKIidType("test-pki-id") ++ endpoint := "localhost:1234" ++ ++ // 3) Under lock, insert an entry into aliveLastTS for that PKIid. ++ // 4) Do NOT insert the corresponding entry into id2Member (simulate it being purged). ++ discImpl.lock.Lock() ++ discImpl.aliveLastTS[string(pkiID)] = ×tamp{ ++ incTime: time.Now(), ++ seqNum: 1, ++ lastSeen: time.Now(), ++ } ++ discImpl.lock.Unlock() ++ ++ // 5) Build a valid AliveMessage and wrap it with protoext.NoopSign. ++ aliveMsg := &proto.GossipMessage{ ++ Tag: proto.GossipMessage_EMPTY, ++ Content: &proto.GossipMessage_AliveMsg{ ++ AliveMsg: &proto.AliveMessage{ ++ Membership: &proto.Member{ ++ PkiId: pkiID, ++ Endpoint: endpoint, ++ }, ++ Timestamp: &proto.PeerTime{ ++ IncNum: uint64(time.Now().UnixNano()), ++ SeqNum: 2, ++ }, ++ }, ++ }, ++ } ++ signedMsg, err := protoext.NoopSign(aliveMsg) ++ require.NoError(t, err) ++ ++ // 6) Invoke the relevant flow directly. ++ // We call learnExistingMembers() instead of handleAliveMessage() because handleAliveMessage ++ // checks if the member is in id2Member under a read-lock and drops the message if not. ++ // In the exact concurrency flaw, the member is valid during handleAliveMessage's read lock, ++ // but gets purged before learnExistingMembers acquires its write lock. Calling learnExistingMembers ++ // directly avoids flaky timing races while perfectly recreating the problematic inconsistent state. ++ require.NotPanics(t, func() { ++ discImpl.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg}) ++ }, "learnExistingMembers should not panic when member is nil in id2Member") ++} diff --git a/diff_v2.txt b/diff_v2.txt new file mode 100644 index 00000000000..a2a1c5d73ff --- /dev/null +++ b/diff_v2.txt @@ -0,0 +1,65 @@ +diff --git a/gossip/discovery/discovery_test.go b/gossip/discovery/discovery_test.go +index 01b80fc4f..34d0f968b 100644 +--- a/gossip/discovery/discovery_test.go ++++ b/gossip/discovery/discovery_test.go +@@ -1975,3 +1975,60 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) { + require.True(t, inID2Member, "member should be present in id2Member after re-learning") + require.True(t, inAliveLastTS, "member should be present in aliveLastTS after re-learning") + } ++ ++func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) { ++ // 1) Initialize a discovery instance (use existing helpers like createDiscoveryInstanceWithNoGossip). ++ inst := createDiscoveryInstanceWithNoGossip(10000, "testInst", nil) ++ defer inst.Stop() ++ ++ // Access the underlying implementation ++ d := inst.discoveryImpl() ++ ++ // 2) Prepare a PKIid and endpoint. ++ pkiID := common.PKIidType("test-pki-id") ++ endpoint := "localhost:1234" ++ ++ // 3) Under lock, insert an entry into aliveLastTS for that PKIid. ++ // 4) Do NOT insert the corresponding entry into id2Member (simulate it being purged). ++ d.lock.Lock() ++ d.aliveLastTS[string(pkiID)] = ×tamp{ ++ incTime: time.Now(), ++ seqNum: 1, ++ lastSeen: time.Now(), ++ } ++ d.lock.Unlock() ++ ++ // 5) Build a valid AliveMessage and wrap it with protoext.NoopSign. ++ aliveMsg := &proto.GossipMessage{ ++ Tag: proto.GossipMessage_EMPTY, ++ Content: &proto.GossipMessage_AliveMsg{ ++ AliveMsg: &proto.AliveMessage{ ++ Membership: &proto.Member{ ++ PkiId: pkiID, ++ Endpoint: endpoint, ++ }, ++ Timestamp: &proto.PeerTime{ ++ IncNum: uint64(time.Now().UnixNano()), ++ SeqNum: 2, ++ }, ++ }, ++ }, ++ } ++ signedMsg, err := protoext.NoopSign(aliveMsg) ++ require.NoError(t, err) ++ ++ // We invoke learnExistingMembers() directly to deterministically reproduce ++ // the inconsistent state where the member is present in aliveLastTS but ++ // missing from id2Member. ++ // ++ // In the real flow, handleAliveMessage first reads state under a read lock, ++ // and then learnExistingMembers acquires a write lock. A concurrent purge ++ // can remove the member between these two steps, leading to a nil access. ++ // ++ // Reproducing this via the full handleAliveMessage path would require a ++ // timing-dependent race, so we simulate the exact post-condition directly ++ // to keep the test deterministic and reliable. ++ require.NotPanics(t, func() { ++ d.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg}) ++ }, "learnExistingMembers should not panic when member is nil in id2Member") ++} diff --git a/gossip/discovery/discovery_impl.go b/gossip/discovery/discovery_impl.go index 9707f259c20..1922a5f6570 100644 --- a/gossip/discovery/discovery_impl.go +++ b/gossip/discovery/discovery_impl.go @@ -843,6 +843,10 @@ func (d *gossipDiscoveryImpl) learnExistingMembers(aliveArr []*protoext.SignedGo // update member's data member := d.id2Member[string(am.Membership.PkiId)] + if member == nil { + // skip safely, do not panic + continue + } member.Endpoint = am.Membership.Endpoint member.Metadata = am.Membership.Metadata member.InternalEndpoint = internalEndpoint diff --git a/gossip/discovery/discovery_test.go b/gossip/discovery/discovery_test.go index 01b80fc4fcf..85f59ae5e9f 100644 --- a/gossip/discovery/discovery_test.go +++ b/gossip/discovery/discovery_test.go @@ -1975,3 +1975,55 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) { require.True(t, inID2Member, "member should be present in id2Member after re-learning") require.True(t, inAliveLastTS, "member should be present in aliveLastTS after re-learning") } + +func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) { + inst := createDiscoveryInstanceWithNoGossip(0, "testNilMemberAfterPurgeInst", nil) + defer func() { + inst.Stop() + time.Sleep(50 * time.Millisecond) + }() + + d := inst.discoveryImpl() + + pkiID := common.PKIidType("test-pki-id") + endpoint := "localhost:1234" + + // This test simulates the post-condition of a real race. + // In the real flow, handleAliveMessage reads under RLock and later calls + // learnExistingMembers under Lock. A concurrent purge can remove the member + // between these two phases, leaving it present in aliveLastTS but missing + // from id2Member. + // + // Calling learnExistingMembers directly avoids timing-dependent races + // while deterministically reproducing the inconsistent state. + d.lock.Lock() + d.aliveLastTS[string(pkiID)] = ×tamp{ + incTime: time.Now(), + seqNum: 1, + lastSeen: time.Now(), + } + d.lock.Unlock() + + aliveMsg := &proto.GossipMessage{ + Tag: proto.GossipMessage_EMPTY, + Content: &proto.GossipMessage_AliveMsg{ + AliveMsg: &proto.AliveMessage{ + Membership: &proto.Member{ + PkiId: pkiID, + Endpoint: endpoint, + }, + Timestamp: &proto.PeerTime{ + IncNum: uint64(time.Now().UnixNano()), + SeqNum: 2, + }, + }, + }, + } + + signedMsg, err := protoext.NoopSign(aliveMsg) + require.NoError(t, err) + + require.NotPanics(t, func() { + d.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg}) + }, "learnExistingMembers should not panic when a member is missing from id2Member") +} diff --git a/test_output.txt b/test_output.txt new file mode 100644 index 00000000000..eb7ea73bdd2 --- /dev/null +++ b/test_output.txt @@ -0,0 +1,34 @@ +=== RUN TestLearnExistingMembersNilMemberPanic + discovery_test.go:2026: + Error Trace: /home/aditya/fabric/gossip/discovery/discovery_test.go:2026 + Error: func (assert.PanicTestFunc)(0xba7a00) should not panic + Panic value: runtime error: invalid memory address or nil pointer dereference + Panic stack: goroutine 40 [running]: + runtime/debug.Stack() + /home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/runtime/debug/stack.go:26 +0x5e + github.com/stretchr/testify/assert.didPanic.func1() + /home/aditya/fabric/vendor/github.com/stretchr/testify/assert/assertions.go:1234 +0x67 + panic({0xcecc20?, 0x15dddc0?}) + /home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/runtime/panic.go:860 +0x13a + github.com/hyperledger/fabric/gossip/discovery.(*gossipDiscoveryImpl).learnExistingMembers(0x1066f128c420, {0x1066f1392378, 0x1, 0x1}) + /home/aditya/fabric/gossip/discovery/discovery_impl.go:846 +0x34c + github.com/hyperledger/fabric/gossip/discovery.TestLearnExistingMembersNilMemberPanic.func1() + /home/aditya/fabric/gossip/discovery/discovery_test.go:2027 +0x65 + github.com/stretchr/testify/assert.didPanic(0x15e0ea0?) + /home/aditya/fabric/vendor/github.com/stretchr/testify/assert/assertions.go:1239 +0x70 + github.com/stretchr/testify/assert.NotPanics({0xe52900, 0x1066f157ab48}, 0x1066f12fa360, {0x1066f12f1f48, 0x1, 0x1}) + /home/aditya/fabric/vendor/github.com/stretchr/testify/assert/assertions.go:1310 +0x7e + github.com/stretchr/testify/require.NotPanics({0xe58588, 0x1066f157ab48}, 0x1066f12fa360, {0x1066f12f1f48, 0x1, 0x1}) + /home/aditya/fabric/vendor/github.com/stretchr/testify/require/require.go:1669 +0xa6 + github.com/hyperledger/fabric/gossip/discovery.TestLearnExistingMembersNilMemberPanic(0x1066f157ab48) + /home/aditya/fabric/gossip/discovery/discovery_test.go:2026 +0x4f3 + testing.tRunner(0x1066f157ab48, 0xe47c10) + /home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/testing/testing.go:2036 +0xea + created by testing.(*T).Run in goroutine 1 + /home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/testing/testing.go:2101 +0x4c5 + Test: TestLearnExistingMembersNilMemberPanic + Messages: learnExistingMembers should not panic when member is nil in id2Member +--- FAIL: TestLearnExistingMembersNilMemberPanic (0.00s) +FAIL +FAIL github.com/hyperledger/fabric/gossip/discovery 0.028s +FAIL