From 417a70e37b219662709f1e40989893641d4ad3ae Mon Sep 17 00:00:00 2001
From: Esteve Soler Arderiu
Date: Thu, 16 Apr 2026 19:48:45 +0200
Subject: [PATCH 1/3] fix(p2p): Mark unresponsive peers as disposable to prevent snapsync stalls

Fixes snapsync failures where peer count stays constant and sync
eventually fails with "Failed to receive block headers" after hours of
operation.

Root cause: After PR #6458 introduced Kademlia k-buckets, peers that
became unresponsive during sync weren't marked as disposable, so they
remained in the routing table indefinitely. New peers went into
replacement lists but were never promoted because dead peers weren't
pruned.

Changes:
- Enhanced prune() to remove disposable contacts from both main and
  replacement lists, with automatic promotion of replacements
- Mark peers as disposable when they time out (or return empty/invalid
  block headers) during RLPx operations (block headers, block bodies,
  sync head requests)
- Added periodic pruning in the snap_sync main loop to ensure dead
  peers are regularly removed and replaced

Evidence from CI artifacts showed peer count stuck at 6 throughout a
3h35m sync before failure. This fix enables peer rotation so healthy
peers from replacement lists can take over when active peers become
unresponsive.
--- crates/networking/p2p/peer_handler.rs | 7 ++++- crates/networking/p2p/peer_table.rs | 36 ++++++++++++++++++++----- crates/networking/p2p/sync/snap_sync.rs | 3 +++ 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/crates/networking/p2p/peer_handler.rs b/crates/networking/p2p/peer_handler.rs index a921bc975a4..b2da1b4cc62 100644 --- a/crates/networking/p2p/peer_handler.rs +++ b/crates/networking/p2p/peer_handler.rs @@ -90,6 +90,8 @@ async fn ask_peer_head_number( } Ok(_other_msgs) => Err(PeerHandlerError::UnexpectedResponseFromPeer(peer_id)), Err(PeerConnectionError::Timeout) => { + // Mark this peer as disposable so it gets pruned and replaced + let _ = peer_table.set_disposable(peer_id); Err(PeerHandlerError::ReceiveMessageFromPeerTimeout(peer_id)) } Err(_other_err) => Err(PeerHandlerError::ReceiveMessageFromPeer(peer_id)), @@ -448,13 +450,15 @@ impl PeerHandler { warn!( "[SYNCING] Received empty/invalid headers from peer, penalizing peer {peer_id}" ); + let _ = self.peer_table.set_disposable(peer_id); return Ok(None); } } - // Timeouted + // Timeout or invalid response - mark peer as disposable warn!( "[SYNCING] Didn't receive block headers from peer, penalizing peer {peer_id}..." ); + let _ = self.peer_table.set_disposable(peer_id); Ok(None) } } @@ -536,6 +540,7 @@ impl PeerHandler { "[SYNCING] Didn't receive block bodies from peer, penalizing peer {peer_id}..." ); self.peer_table.record_failure(peer_id)?; + let _ = self.peer_table.set_disposable(peer_id); Ok(None) } } diff --git a/crates/networking/p2p/peer_table.rs b/crates/networking/p2p/peer_table.rs index f699b60295a..9912ac4e211 100644 --- a/crates/networking/p2p/peer_table.rs +++ b/crates/networking/p2p/peer_table.rs @@ -1011,15 +1011,37 @@ impl PeerTableServer { // --- Contact operations --- + /// Prune disposable contacts from both main and replacement lists. + /// When a main contact is removed, a replacement is automatically promoted. 
fn prune(&mut self) { - let disposable_contacts: Vec = self - .iter_contacts() - .filter_map(|(id, c)| c.disposable.then_some(*id)) - .collect(); + for bucket in &mut self.buckets { + // Collect disposable contacts from main list + let main_disposable: Vec = bucket + .contacts + .iter() + .filter(|(_, c)| c.disposable) + .map(|(id, _)| *id) + .collect(); + + // Remove from main list and promote replacements + for node_id in main_disposable { + bucket.remove_and_promote(&node_id); + self.discarded_contacts.insert(node_id); + } - for node_id in disposable_contacts { - if let Some(idx) = self.bucket_for(&node_id) { - self.buckets[idx].remove_and_promote(&node_id); + // Remove disposable contacts from replacement list + // (these don't get promoted, just removed) + let replacement_disposable: Vec = bucket + .replacements + .iter() + .filter(|(_, c)| c.disposable) + .map(|(id, _)| *id) + .collect(); + + bucket + .replacements + .retain(|(id, _)| !replacement_disposable.contains(id)); + for node_id in replacement_disposable { self.discarded_contacts.insert(node_id); } } diff --git a/crates/networking/p2p/sync/snap_sync.rs b/crates/networking/p2p/sync/snap_sync.rs index 519f1c73389..f2cfef9c9d1 100644 --- a/crates/networking/p2p/sync/snap_sync.rs +++ b/crates/networking/p2p/sync/snap_sync.rs @@ -130,6 +130,9 @@ pub async fn sync_cycle_snap( let mut attempts = 0; loop { + // Prune dead/unresponsive peers periodically to allow replacements to be promoted + let _ = peers.peer_table.prune_table(); + debug!("Requesting Block Headers from {current_head}"); let Some(mut block_headers) = peers From 3430ab4fb5597ff3527ff7e195888056fff25e43 Mon Sep 17 00:00:00 2001 From: Esteve Soler Arderiu Date: Fri, 17 Apr 2026 14:28:42 +0200 Subject: [PATCH 2/3] fix(l1): include replacement contacts in peer discovery and iteration The Kademlia k-bucket implementation only iterated over main bucket contacts, ignoring replacement entries. 
This caused peer starvation because dead contacts in the main list were never replaced by fresher peers from the replacement list. Fix iter_contacts() and do_get_contact_to_initiate() to also check replacement contacts, allowing the node to discover and connect to peers that were previously invisible to the peer selection logic. --- crates/networking/p2p/peer_table.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/networking/p2p/peer_table.rs b/crates/networking/p2p/peer_table.rs index 9912ac4e211..24cf51f8d2c 100644 --- a/crates/networking/p2p/peer_table.rs +++ b/crates/networking/p2p/peer_table.rs @@ -971,11 +971,15 @@ impl PeerTableServer { None } - /// Iterate over all contacts across all buckets. + /// Iterate over all contacts across all buckets (main and replacement lists). fn iter_contacts(&self) -> impl Iterator { - self.buckets - .iter() - .flat_map(|bucket| bucket.contacts.iter().map(|(id, c)| (id, c))) + self.buckets.iter().flat_map(|bucket| { + bucket + .contacts + .iter() + .chain(bucket.replacements.iter()) + .map(|(id, c)| (id, c)) + }) } // --- Peer selection --- @@ -1048,8 +1052,10 @@ impl PeerTableServer { } fn do_get_contact_to_initiate(&mut self) -> Option { + // Check both main contacts and replacements in each bucket. + // Replacements may contain fresher peers that haven't been tried yet. 
for bucket in &self.buckets { - for (node_id, contact) in &bucket.contacts { + for (node_id, contact) in bucket.contacts.iter().chain(bucket.replacements.iter()) { if !self.peers.contains_key(node_id) && !self.already_tried_peers.contains(node_id) && contact.knows_us From 4bdc22c1938815a544a1b2088a792f4e08c53fb4 Mon Sep 17 00:00:00 2001 From: Esteve Soler Arderiu Date: Fri, 17 Apr 2026 14:36:32 +0200 Subject: [PATCH 3/3] fix(l1): make contact lookup and mutation cover replacement lists KBucket::get_mut and get_contact only searched the main contact list, so any state mutation (set_disposable, ping tracking, find_node count, mark_knows_us) silently failed for contacts in the replacement list. Since iter_contacts and do_get_contact_to_initiate now return replacement contacts, this caused phantom contacts that were visible to selection but invisible to updates. Update get_contact to use get_any (main + replacements) and get_mut to search both lists, ensuring all contact state mutations work regardless of which list holds the contact. --- crates/networking/p2p/peer_table.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/networking/p2p/peer_table.rs b/crates/networking/p2p/peer_table.rs index 24cf51f8d2c..78a91f05bc9 100644 --- a/crates/networking/p2p/peer_table.rs +++ b/crates/networking/p2p/peer_table.rs @@ -86,9 +86,12 @@ impl KBucket { }) } - /// Find a mutable reference to a contact by node ID within this bucket. + /// Find a mutable reference to a contact by node ID (main or replacement list). fn get_mut(&mut self, node_id: &H256) -> Option<&mut Contact> { - self.contacts + if let Some((_, c)) = self.contacts.iter_mut().find(|(id, _)| id == node_id) { + return Some(c); + } + self.replacements .iter_mut() .find(|(id, _)| id == node_id) .map(|(_, c)| c) @@ -909,10 +912,10 @@ impl PeerTableServer { bucket_index(&self.local_node_id, node_id) } - /// Look up a contact by node ID (O(K) within the bucket). 
+ /// Look up a contact by node ID in main or replacement list (O(K) within the bucket). fn get_contact(&self, node_id: &H256) -> Option<&Contact> { let idx = self.bucket_for(node_id)?; - self.buckets[idx].get(node_id) + self.buckets[idx].get_any(node_id) } /// Look up a mutable reference to a contact by node ID.