From dd8b301a658ce12688e743d996c33ec53544694a Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 27 Apr 2026 20:36:58 +0000
Subject: [PATCH 01/44] Scale native allreduce/allgather algos for MNNVL/MNNVLS

Bump MAX_NRANKS_PER_NODE from 8 to 72 to cover Multi-Node NVLink (MNNVL)
domains up to GB200 NVL72, and bump NUM_SEMAPHORES from 64 to 512 to
accommodate semaphore indexing that grows as O(nRanksPerNode).

Convert allreduce_rsag_zero_copy from a compile-time-templated kernel
({4,8} ranks) to a runtime nRanksPerNode kernel; fuse load+reduce inside
the peer loop to avoid O(NPeers) register pressure that would otherwise
spill at NVL72 scale.

Bump AllreduceAllpairPacket::maxBlockNum_ from 28 to 72 so the adapter
can launch >= nPeers blocks at MNNVL scale.

Fix a latent shared-memory channel-cache bug across five kernels:
nvls_zero_copy, nvls_warp_pipeline, packet, allreduce_fullmesh, and
allgather_fullmesh. The original 'if (lid < nPeers) channels[lid] = ...'
load populates at most the first WARP_SIZE entries. That was enough
while MAX_NRANKS_PER_NODE was 8, but once the number of cached channels
exceeds WARP_SIZE the remaining entries are never written, while
threads from multiple warps later read channels[threadIdx.x] up to
nPeers-1. Replace with a per-warp strided loop so every warp loads all
entries before __syncwarp(); the same-value cross-warp writes are
benign.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../allgather/allgather_fullmesh.cu           |  7 ++-
 .../allreduce/allreduce_fullmesh.cu           |  9 ++-
 .../allreduce/allreduce_nvls_warp_pipeline.cu |  7 ++-
 .../allreduce/allreduce_nvls_zero_copy.cu     |  7 ++-
 .../collectives/allreduce/allreduce_packet.cu |  7 ++-
 .../allreduce/allreduce_rsag_zero_copy.cu     | 55 ++++++++-----------
 .../allreduce/allreduce_allpair_packet.hpp    |  4 +-
 .../collectives/include/collective_utils.hpp  | 13 ++++-
 8 files changed, 61 insertions(+), 48 deletions(-)
diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index fb51a3425..17054869e 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -30,8 +30,11 @@ __global__ void __launch_bounds__(1024, 1)
 
   __shared__ DeviceHandle<MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nPeer) {
-    channels[lid] = memoryChans[lid];
+  // Each warp redundantly loads all entries (same value, benign race) so that
+  // every warp has the data its threads will read after __syncwarp(). Required
+  // when nPeer > WARP_SIZE (MNNVL/NVL72 scale).
+  for (int i = lid; i < nPeer; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
   }
   __syncwarp();
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index 24d2a31c2..9d144c621 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -52,9 +52,12 @@ __global__ void __launch_bounds__(512, 1)
   __shared__ DeviceHandle<MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
   __shared__ DeviceHandle<MemoryChannel> outChannels[MAX_NRANKS_PER_NODE - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nPeer) {
-    channels[lid] = memoryChans[lid];
-    outChannels[lid] = memoryOutChans[lid];
+  // Each warp redundantly loads all entries (same value, benign race) so that
+  // every warp has the data its threads will read after __syncwarp(). Required
+  // when nPeer > WARP_SIZE (MNNVL/NVL72 scale).
+  for (int i = lid; i < nPeer; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
+    outChannels[i] = memoryOutChans[i];
   }
   __syncwarp();
 
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 3bb054dae..9be621f08 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -57,8 +57,11 @@ __global__ void __launch_bounds__(1024, 1)
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_NRANKS_PER_NODE - 1) * 2];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nPeers * 2) {
-    channels[lid] = memoryChans[lid];
+  // Each warp redundantly loads all entries (same value, benign race) so that
+  // every warp has the data its threads will read after __syncwarp(). Required
+  // when nPeers*2 > WARP_SIZE (MNNVL scale).
+  for (int i = lid; i < nPeers * 2; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
   }
   __syncwarp();
   for (int it = 0; it < nIter; it++) {
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index e7f2028fa..735deb0a1 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -44,8 +44,11 @@ __global__ void __launch_bounds__(1024, 1)
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  if (lid < nRanksPerNode - 1) {
-    channels[lid] = memoryChans[lid];
+  // Each warp redundantly loads all entries (same value, benign race) so that
+  // every warp has the data its threads will read after __syncwarp(). Required
+  // when nPeers > WARP_SIZE (MNNVL/NVL72 → 71 peers).
+  for (int i = lid; i < nRanksPerNode - 1; i += WARP_SIZE) {
+    channels[i] = memoryChans[i];
   }
   __syncwarp();
   if (threadIdx.x < nPeers) {
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index e2d8ef732..d39da408e 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -83,8 +83,11 @@ __global__ void __launch_bounds__(1024, 1)
   // Put channels into shared memory, read channel info from global memory is unexpectable slow.
   __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
   const int lid = tid % WARP_SIZE;
-  if (lid < nPeers) {
-    channels[lid] = memoryChannels[lid];
+  // Each warp redundantly loads all entries (same value, benign race) so that
+  // every warp has the data its threads will read after __syncwarp(). Required
+  // when nPeers > WARP_SIZE (MNNVL/NVL72 scale).
+  for (int i = lid; i < nPeers; i += WARP_SIZE) {
+    channels[i] = memoryChannels[i];
   }
   __syncwarp();
   // step 1: write to scratch buffer
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index f95ba7e33..42d86fc89 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -35,25 +35,26 @@ __device__ mscclpp::DeviceSyncer globalSyncer;
 //
 // This approach requires registering both input and output buffers as remote
 // memories (2 * nPeers handles), but avoids scratch buffer allocation and
-// the extra copy steps of the standard RSAG. The NRanksPerNode template
-// parameter enables compile-time unrolling of peer loops (supports 4 or 8).
+// the extra copy steps of the standard RSAG. nRanksPerNode is accepted at
+// runtime, which allows the same kernel to handle any NVLink-domain size
+// (including Multi-Node NVLink fabrics up to NVL72).
 
-template <int NRanksPerNode, ReduceOp OpType, typename T, typename AccumT = T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
-                          DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int worldSize,
-                          size_t nelems) {
+                          DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank,
+                          int nRanksPerNode, int worldSize, size_t nelems) {
   int blockId = blockIdx.x;
 
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
-  constexpr int NPeers = NRanksPerNode - 1;
+  const int NPeers = nRanksPerNode - 1;
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
-  const uint32_t outputRemoteBufferOffset = NRanksPerNode - 1;
-  uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * NRanksPerNode;
-  uint32_t nelemsPerRank = alignedNelems / NRanksPerNode;
+  const uint32_t outputRemoteBufferOffset = NPeers;
+  uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
+                           nelemsPerInt4 * nRanksPerNode;
+  uint32_t nelemsPerRank = alignedNelems / nRanksPerNode;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4;
 
@@ -69,12 +70,11 @@ __global__ void __launch_bounds__(1024, 1)
   }
   if (nInt4PerBlock == 0) return;
 
-  if (threadIdx.x < NPeers) {
+  if ((int)threadIdx.x < NPeers) {
     memoryChannelsLocal[threadIdx.x].relaxedSignal();
     memoryChannelsLocal[threadIdx.x].relaxedWait();
   }
   __syncthreads();
-  int4 data[NPeers];
   // AccumInt4: when AccumT != T, use a wider accumulator type.
   // For AccumT == T, this is just int4 (no-op conversion).
   constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T);
@@ -84,20 +84,17 @@ __global__ void __launch_bounds__(1024, 1)
     uint32_t offset = idx + offset4 + rank * nInt4PerRank;
     if (offset >= nInt4Total) continue;
     int4 tmp_raw = buff4[offset];
-#pragma unroll
-    for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % NRanksPerNode;
-      int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
-      data[i] = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
-    }
+    int4 data;
     AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(tmp_raw);
     for (int i = 0; i < NPeers; i++) {
-      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data[i]);
+      int rankIdx = (rank + i + 1) % nRanksPerNode;
+      int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
+      data = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
+      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data);
     }
     int4 tmp = mscclpp::downcastVector<T, AccumT, int4>(acc);
-#pragma unroll
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % NRanksPerNode;
+      int rankIdx = (rank + i + 1) % nRanksPerNode;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp);
     }
@@ -105,7 +102,7 @@ __global__ void __launch_bounds__(1024, 1)
   }
   // Use device barrier gives better performance here.
   globalSyncer.sync(gridDim.x);
-  if (blockIdx.x == 0 && threadIdx.x < NPeers) {
+  if (blockIdx.x == 0 && (int)threadIdx.x < NPeers) {
     memoryChannelsLocal[threadIdx.x].signal();
     memoryChannelsLocal[threadIdx.x].wait();
   }
@@ -126,17 +123,9 @@ struct AllreduceRsAgZeroCopyAdapter {
         nBlocks = 128;
       }
     }
-    if (nRanksPerNode == 4) {
-      allreduceRsAgZeroCopy<4, OpType, T, AccumT>
-          <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
-                                                     switchChannel, remoteMemories, rank, worldSize, nelems);
-    } else if (nRanksPerNode == 8) {
-      allreduceRsAgZeroCopy<8, OpType, T, AccumT>
-          <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
-                                                     switchChannel, remoteMemories, rank, worldSize, nelems);
-    } else {
-      THROW(ALGO, Error, ErrorCode::InvalidUsage, "Unsupported number of ranks per node: ", nRanksPerNode);
-    }
+    allreduceRsAgZeroCopy<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+        (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
+        nRanksPerNode, worldSize, nelems);
     return cudaGetLastError();
   }
 };
diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
index 362308b2e..fe96f7622 100644
--- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
@@ -29,7 +29,9 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   const int nSegmentsForScratchBuffer_ = 2;
-  const int maxBlockNum_ = 28;
+  // Must be at least MAX_NRANKS_PER_NODE-1 so the adapter can launch one
+  // block per peer at MNNVL scale.
+  const int maxBlockNum_ = 72;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
   std::vector<RegisteredMemory> registeredMemories_;
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index f705a9d1d..638214dd5 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -26,9 +26,16 @@ namespace mscclpp {
 
 namespace collective {
 constexpr int NUM_NVLS_CONNECTION = 8;
-constexpr int NUM_SEMAPHORES = 64;
-
-constexpr int MAX_NRANKS_PER_NODE = 8;
+// Sized to cover MAX_NRANKS_PER_NODE-scale allreduce algos whose device-side
+// semaphore indices grow as O(nRanksPerNode) (e.g. nvls_block_pipeline uses
+// up to ~5 * nRanksPerNode entries).
+constexpr int NUM_SEMAPHORES = 512;
+
+// Upper bound on the number of NVLink-reachable ranks that participate in a
+// single collective. Sized to cover Multi-Node NVLink (MNNVL) domains up to
+// GB200 NVL72 (72 GPUs sharing one NVLink fabric). Drives compile-time sizing
+// of shared-memory channel arrays in the allreduce/allgather kernels.
+constexpr int MAX_NRANKS_PER_NODE = 72;
 
 constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
 

From 893a08e69c036311bc3e8e74bf4e5973d9e0c317 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 28 Apr 2026 05:38:59 +0000
Subject: [PATCH 02/44] Enable MNNVL allreduce tuning

Add an MNNVL rank-domain override (MSCCLPP_MNNVL_NRANKS_PER_NODE) so
MSCCL++ collectives can treat multi-host NVLink fabrics as a single
CUDA IPC/NVLS peer group. Update packet, RSAG, and NVLS allreduce paths
to use the collective domain size and teach the torch integration
tuning example to select MNNVL-capable allreduce algorithms.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 176 +++++++++++++++---
 include/mscclpp/env.hpp                       |   6 +
 src/core/bootstrap/bootstrap.cc               |   5 +
 src/core/env.cpp                              |   4 +-
 .../allreduce/allreduce_allpair_packet.cu     |   7 +-
 .../allreduce/allreduce_nvls_packet.cu        |   4 +-
 .../allreduce/allreduce_nvls_zero_copy.cu     |   2 +-
 .../collectives/allreduce/allreduce_packet.cu |   7 +-
 .../collectives/allreduce/allreduce_rsag.cu   |   2 +-
 .../allreduce/allreduce_rsag_zero_copy.cu     |   2 +-
 src/ext/collectives/collective_utils.cc       |  21 ++-
 .../collectives/include/collective_utils.hpp  |   4 +-
 12 files changed, 199 insertions(+), 41 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 060a0097d..035c1dbbb 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -2,11 +2,34 @@
 # Licensed under the MIT License.
 
 # torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
+# mpirun -np 2 --hostfile <hostfile> python3 examples/torch-integration/customized_comm_with_tuning.py
 
-import os
+import gc
+import fcntl
 import ipaddress
+import os
+import socket
+import struct
+import sys
+import traceback
+
+def _get_bootstrap_world_size():
+    for name in ("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS"):
+        value = os.environ.get(name)
+        if value is not None:
+            return int(value)
+    return None
+
+
+_bootstrap_world_size = _get_bootstrap_world_size()
+if (
+    _bootstrap_world_size
+    and _bootstrap_world_size > 1
+    and "MSCCLPP_MNNVL_NRANKS_PER_NODE" not in os.environ
+    and os.environ.get("MSCCLPP_ENABLE_MNNVL", "1") != "0"
+):
+    os.environ["MSCCLPP_MNNVL_NRANKS_PER_NODE"] = str(_bootstrap_world_size)
 
-import netifaces as ni
 import torch
 import mscclpp
 import mscclpp.ext
@@ -37,15 +60,44 @@ def _load_algorithms(scratch: torch.Tensor, rank: int):
 
 def _interfaces_for_ip(ip: str):
     target = ipaddress.ip_address(ip)
-    for iface in ni.interfaces():
-        addrs = ni.ifaddresses(iface)
-        if ni.AF_INET in addrs:
-            for link in addrs[ni.AF_INET]:
-                if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
-                    return iface
+    for iface in os.listdir("/sys/class/net"):
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
+                req = struct.pack("256s", iface.encode()[:15])
+                addr = socket.inet_ntoa(fcntl.ioctl(sock.fileno(), 0x8915, req)[20:24])
+        except OSError:
+            continue
+        if ipaddress.ip_address(addr) == target:
+            return iface
     return None
 
 
+def _resolve_interface(master_addr: str):
+    for env_name in ("MSCCLPP_INTERFACE", "MSCCLPP_SOCKET_IFNAME", "NCCL_SOCKET_IFNAME"):
+        value = os.environ.get(env_name)
+        if value:
+            iface = value.split(",")[0].strip()
+            if iface in os.listdir("/sys/class/net"):
+                return iface
+            raise ValueError(f"Interface {iface} from {env_name} does not exist")
+    return _interfaces_for_ip(master_addr)
+
+
+def _get_env_int(*names: str, default=None):
+    for name in names:
+        value = os.environ.get(name)
+        if value is not None:
+            return int(value)
+    return default
+
+
+def _running_under_mpi() -> bool:
+    return any(
+        name in os.environ
+        for name in ("OMPI_COMM_WORLD_RANK", "PMI_RANK", "PMIX_RANK", "MPI_LOCALRANKID", "SLURM_PROCID")
+    )
+
+
 def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
     if op == torch.distributed.ReduceOp.SUM:
         return mscclpp.ReduceOp.SUM
@@ -76,6 +128,7 @@ class CustomizedComm:
         "default_allreduce_nvls_packet": 16,
         "default_allreduce_packet": 56,
         "default_allreduce_allpair_packet": 56,
+        "default_allreduce_rsag": 64,
         "default_allreduce_fullmesh": 64,
         "default_allgather_fullmesh2": 32,
     }
@@ -84,6 +137,12 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.comm = comm
         self.rank = comm.my_rank
         self.world_size = comm.nranks
+        self.nranks_per_node = comm.nranks_per_node
+        self.mnnvl_domain = self.world_size > 1 and os.environ.get("MSCCLPP_MNNVL_NRANKS_PER_NODE") == str(
+            self.world_size
+        )
+        self.multi_node = self.world_size > self.nranks_per_node and not self.mnnvl_domain
+        self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > 1
         self.symmetric_memory = symmetric_memory
         self._nvls = mscclpp.is_nvls_supported()
 
@@ -106,6 +165,10 @@ def _default_ar_config(self):
         pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
         if self._nvls and pkt:
             return (pkt, 0, 0)
+        if self.multi_node or self.multi_host_mnnvl:
+            rsag = self._algo("allreduce", "default_allreduce_rsag")
+            if rsag:
+                return (rsag, 0, 0)
         return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
 
     # -- low-level execute --
@@ -166,23 +229,48 @@ def _ensure_tune_bufs(self):
 
     def _ar_candidates(self, size: int):
         out = []
-        if size <= 4 << 20:
+        if self.multi_host_mnnvl:
+            if size <= 4 << 20:
+                a = self._algo("allreduce", "default_allreduce_packet")
+                if a:
+                    out.append(a)
+                a = self._algo("allreduce", "default_allreduce_nvls_packet")
+                if self._nvls and a:
+                    out.append(a)
+            if size >= 512 << 10:
+                a = self._algo("allreduce", "default_allreduce_rsag")
+                if a:
+                    out.append(a)
+            return out
+        if self.multi_node:
             a = self._algo("allreduce", "default_allreduce_nvls_packet")
             if self._nvls and a:
                 out.append(a)
             a = self._algo("allreduce", "default_allreduce_packet")
+            if a:
+                out.append(a)
+            if size >= 512 << 10:
+                a = self._algo("allreduce", "default_allreduce_rsag")
+                if a:
+                    out.append(a)
+            return out
+        if size <= 4 << 20:
+            a = self._algo("allreduce", "default_allreduce_packet")
             if a:
                 out.append(a)
             a = self._algo("allreduce", "default_allreduce_allpair_packet")
             if a:
                 out.append(a)
-        if size >= 512 << 10:
-            a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
-            if self._nvls and self.symmetric_memory and a:
+            a = self._algo("allreduce", "default_allreduce_nvls_packet")
+            if self._nvls and a:
                 out.append(a)
+        if size >= 512 << 10:
             a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
             if a:
                 out.append(a)
+            a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
+            if self._nvls and self.symmetric_memory and a:
+                out.append(a)
         if torch.version.hip is not None:
             a = self._algo("allreduce", "default_allreduce_fullmesh")
             if a:
@@ -190,6 +278,8 @@ def _ar_candidates(self, size: int):
         return out
 
     def _ag_candidates(self):
+        if self.multi_node or self.multi_host_mnnvl:
+            return []
         a = self._algo("allgather", "default_allgather_fullmesh2")
         return [a] if a else []
 
@@ -314,6 +404,8 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, acc
         )
 
     def all_gather(self, output_tensor, input_tensor, stream=None):
+        if self.multi_node or self.multi_host_mnnvl:
+            raise RuntimeError("all_gather in this example currently supports only single-node runs")
         sz = _round_pow2(input_tensor.nbytes)
         if sz not in self._tune_cache["allgather"]:
             self._tune_size("allgather", sz)
@@ -332,7 +424,11 @@ def destroy(self):
 # -- Benchmarks (standalone) --------------------------------------------------
 
 
-def _bench_sizes(low=5 * 1024, high=80 << 20):
+def _bench_sizes(low=None, high=None):
+    if low is None:
+        low = _get_env_int("MSCCLPP_BENCH_LOW_SIZE", default=5 * 1024)
+    if high is None:
+        high = _get_env_int("MSCCLPP_BENCH_HIGH_SIZE", default=80 << 20)
     sizes, c = [], low
     while c <= high:
         sizes.append(c)
@@ -433,13 +529,21 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10,
 
 def init_dist() -> mscclpp.CommGroup:
     addr = os.environ.get("MSCCLPP_MASTER_ADDR")
-    if addr:
-        rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"])
-        port = os.environ["MSCCLPP_MASTER_PORT"]
-        iface = _interfaces_for_ip(addr)
+    rank = _get_env_int("RANK", "OMPI_COMM_WORLD_RANK", "PMI_RANK", "SLURM_PROCID")
+    world = _get_env_int("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS")
+    if addr and rank is not None and world is not None:
+        port = os.environ.get("MSCCLPP_MASTER_PORT", "29500")
+        iface = _resolve_interface(addr)
         if not iface:
             raise ValueError(f"No interface for {addr}")
         return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
+    if _running_under_mpi():
+        try:
+            from mpi4py import MPI
+        except ModuleNotFoundError as exc:
+            raise RuntimeError("mpi4py is required to launch this example with mpirun") from exc
+
+        return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD)
     import torch.distributed as dist
 
     dist.init_process_group(backend="gloo")
@@ -447,7 +551,7 @@ def init_dist() -> mscclpp.CommGroup:
 
 
 def main():
-    local = int(os.environ["LOCAL_RANK"])
+    local = _get_env_int("LOCAL_RANK", "OMPI_COMM_WORLD_LOCAL_RANK", "MPI_LOCALRANKID", "SLURM_LOCALID", default=0)
     torch.cuda.set_device(local)
 
     dtype_str = os.environ.get("DTYPE", "float16")
@@ -455,22 +559,48 @@ def main():
     accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
     accum_str = os.environ.get("ACCUM_DTYPE")
     accum_dtype = accum_map.get(accum_str) if accum_str else None
+    n_warmup = _get_env_int("MSCCLPP_BENCH_WARMUP", default=10)
+    n_graph_launches = _get_env_int("MSCCLPP_BENCH_GRAPH_LAUNCHES", default=10)
+    n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100)
 
     comm_group = init_dist()
     cc = CustomizedComm(comm_group)
 
     print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
-    benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
+    benchmark_allreduce(
+        cc,
+        dtype=dtype,
+        accum_dtype=accum_dtype,
+        n_warmup=n_warmup,
+        n_graph_launches=n_graph_launches,
+        n_iter=n_iter,
+    )
     cc.barrier()
     torch.cuda.synchronize()
 
-    benchmark_allgather(cc, dtype=dtype)
-    cc.barrier()
-    torch.cuda.synchronize()
+    if cc.multi_node or cc.multi_host_mnnvl:
+        if cc.rank == 0:
+            print("Skipping allgather benchmark on multi-node: this example's allgather path is single-node only.")
+    else:
+        benchmark_allgather(cc, dtype=dtype, n_warmup=n_warmup, n_graph_launches=n_graph_launches, n_iter=n_iter)
+        cc.barrier()
+        torch.cuda.synchronize()
 
     cc.destroy()
+    del cc
+    del comm_group
+    gc.collect()
     print(f"rank {local} completed successfully.")
 
 
 if __name__ == "__main__":
-    main()
+    exit_code = 0
+    try:
+        main()
+    except Exception:
+        exit_code = 1
+        traceback.print_exc()
+    finally:
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(exit_code)
diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index a6dd306b6..09d364c3b 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -119,6 +119,12 @@ class Env {
   /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
   const int ibGidIndex;
 
+  /// Env name: `MSCCLPP_MNNVL_NRANKS_PER_NODE`. Overrides the NVLink-domain size reported by the bootstrap.
+  /// This is intended for Multi-Node NVLink (MNNVL) deployments where a single CUDA IPC / NVLS domain spans
+  /// multiple hosts and should be treated as one collective peer group.
+  /// If unset or non-positive, the bootstrap falls back to physical-host-based detection.
+  const int mnnvlNranksPerNode;
+
  private:
   Env();
 
diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc
index b3032e502..c84ef4c0f 100644
--- a/src/core/bootstrap/bootstrap.cc
+++ b/src/core/bootstrap/bootstrap.cc
@@ -5,6 +5,7 @@
 
 #include <cstring>
 #include <mscclpp/core.hpp>
+#include <mscclpp/env.hpp>
 #include <mscclpp/errors.hpp>
 #include <sstream>
 #include <thread>
@@ -433,6 +434,10 @@ void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) {
 
 int TcpBootstrap::Impl::getNranksPerNode() {
   if (nRanksPerNode_ > 0) return nRanksPerNode_;
+  if (env()->mnnvlNranksPerNode > 0) {
+    nRanksPerNode_ = env()->mnnvlNranksPerNode;
+    return nRanksPerNode_;
+  }
   int nRanksPerNode = 0;
   bool useIpv4 = peerCommAddresses_[rank_].sa.sa_family == AF_INET;
   for (int i = 0; i < nRanks_; i++) {
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 7a42471bf..b46670d79 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -67,7 +67,8 @@ Env::Env()
       ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
       forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
       forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
-      ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)) {}
+      ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)),
+      mnnvlNranksPerNode(readEnv<int>("MSCCLPP_MNNVL_NRANKS_PER_NODE", 0)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -97,6 +98,7 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
     logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
     logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
+    logEnv("MSCCLPP_MNNVL_NRANKS_PER_NODE", globalEnv->mnnvlNranksPerNode);
   }
   return globalEnv;
 }
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 17bcfc338..9516ad786 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -17,9 +17,6 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand
                                   size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
                                   int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
                                   uint32_t flagSize) {
-  // This version of allreduce only works for single nodes
-  if (worldSize != nRanksPerNode) return;
-
   if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
   const int nPeers = nRanksPerNode - 1;
 
@@ -143,7 +140,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
@@ -189,4 +186,4 @@ std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
       });
 }
 }  // namespace collective
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index a616485e1..21f710283 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -94,7 +94,7 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm);
 
   // setup channels
   ctx->switchChannels = this->switchChannels_;
@@ -154,4 +154,4 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
       });
 }
 }  // namespace collective
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 735deb0a1..25077004b 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -183,7 +183,7 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm);
 
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index d39da408e..c195aefa3 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -23,9 +23,6 @@ __global__ void __launch_bounds__(1024, 1)
 #else
     ) {
 #endif
-  // This version of allreduce only works for single nodes
-  if (worldSize != nRanksPerNode) return;
-
 #if defined(ENABLE_NPKIT)
   extern __shared__ int4 NpkitSharedMem[];
   NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem);
@@ -267,7 +264,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
@@ -313,4 +310,4 @@ std::shared_ptr<Algorithm> AllreducePacket::build() {
 }
 
 }  // namespace collective
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index db471b932..f964b87e9 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -199,7 +199,7 @@ std::shared_ptr<void> AllreduceRsAg::initAllreduceContext(std::shared_ptr<Commun
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index 42d86fc89..c4dea321c 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -183,7 +183,7 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
 
   ctx->memorySemaphores = this->semaphores_;
 
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index 016c4a5cc..4d46c53bc 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -69,6 +69,25 @@ std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> setupMemoryS
   return memorySemaphores;
 }
 
+int getCollectiveDomainNranksPerNode(std::shared_ptr<mscclpp::Communicator> comm,
+                                     const std::vector<mscclpp::Connection>& connections) {
+  const int worldSize = comm->bootstrap()->getNranks();
+  const int nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  if (worldSize <= nRanksPerNode) {
+    return nRanksPerNode;
+  }
+  const bool allPeersUseCudaIpc =
+      std::all_of(connections.begin(), connections.end(),
+                  [](const auto& connection) { return connection.transport() == mscclpp::Transport::CudaIpc; });
+  return allPeersUseCudaIpc ? worldSize : nRanksPerNode;
+}
+
+int getCollectiveDomainNranksPerNode(std::shared_ptr<mscclpp::Communicator> comm) {
+  const int worldSize = comm->bootstrap()->getNranks();
+  const int nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  return worldSize > nRanksPerNode ? worldSize : nRanksPerNode;
+}
+
 std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<mscclpp::MemoryChannel>& memoryChannels) {
   std::vector<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> memoryChannelDeviceHandles;
@@ -153,4 +172,4 @@ std::shared_ptr<mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>> setupBaseMemo
 
 }  // namespace collective
 
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 638214dd5..38362a659 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -50,6 +50,8 @@ std::vector<MemoryChannel> setupMemoryChannels(
 std::vector<Connection> setupConnections(std::shared_ptr<Communicator> comm);
 std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> setupMemorySemaphores(
     std::shared_ptr<Communicator> comm, const std::vector<Connection>& connections, int nChannelsPerConnection);
+int getCollectiveDomainNranksPerNode(std::shared_ptr<Communicator> comm, const std::vector<Connection>& connections);
+int getCollectiveDomainNranksPerNode(std::shared_ptr<Communicator> comm);
 
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<MemoryChannel>& memoryChannels);
@@ -96,4 +98,4 @@ class AlgorithmCtx {
 
 }  // namespace collective
 }  // namespace mscclpp
-#endif  // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_
\ No newline at end of file
+#endif  // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_

From dded5e0e3933573080acfad30b30681e9a4b19b7 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 28 Apr 2026 06:41:17 +0000
Subject: [PATCH 03/44] Improve MNNVL allreduce tuning performance

Add the allpair packet algorithm to the MNNVL small-message candidate set and enable zero-copy NVLS/RSAG candidates for larger symmetric-memory allreduce benchmarks. Run the standalone tuning example with symmetric memory so RawGpuBuffer-backed tensors can use the zero-copy paths.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py                  | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 035c1dbbb..75a2c9608 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -126,9 +126,11 @@ class CustomizedComm:
     _CANDIDATE_NTHREADS = [512, 768, 1024]
     _NBLOCKS_LIMIT = {
         "default_allreduce_nvls_packet": 16,
+        "default_allreduce_nvls_zero_copy": 32,
         "default_allreduce_packet": 56,
         "default_allreduce_allpair_packet": 56,
         "default_allreduce_rsag": 64,
+        "default_allreduce_rsag_zero_copy": 64,
         "default_allreduce_fullmesh": 64,
         "default_allgather_fullmesh2": 32,
     }
@@ -232,12 +234,21 @@ def _ar_candidates(self, size: int):
         if self.multi_host_mnnvl:
             if size <= 4 << 20:
                 a = self._algo("allreduce", "default_allreduce_packet")
+                if a:
+                    out.append(a)
+                a = self._algo("allreduce", "default_allreduce_allpair_packet")
                 if a:
                     out.append(a)
                 a = self._algo("allreduce", "default_allreduce_nvls_packet")
                 if self._nvls and a:
                     out.append(a)
             if size >= 512 << 10:
+                a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
+                if self.symmetric_memory and a:
+                    out.append(a)
+                a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
+                if self._nvls and self.symmetric_memory and a:
+                    out.append(a)
                 a = self._algo("allreduce", "default_allreduce_rsag")
                 if a:
                     out.append(a)
@@ -564,7 +575,7 @@ def main():
     n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100)
 
     comm_group = init_dist()
-    cc = CustomizedComm(comm_group)
+    cc = CustomizedComm(comm_group, symmetric_memory=True)
 
     print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
     benchmark_allreduce(

From 865c2bc795d5cf3f4e45c5c480eec41b36d5b96e Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 28 Apr 2026 07:55:52 +0000
Subject: [PATCH 04/44] Optimize MNNVL allreduce without symmetric memory

Run the tuning example with symmetric memory disabled, make allreduce tuning use the same symmetric-memory mode as execution, and narrow the MNNVL small-message candidate set to avoid slower packet/NVLS choices. Increase packet and RSAG channel parallelism so non-symmetric CUDA-IPC paths can use 112-block packet and 128-block RSAG configs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 37 +++++++++++--------
 .../collectives/allreduce/allreduce_rsag.cu   |  8 +++-
 .../include/allreduce/allreduce_packet.hpp    |  4 +-
 3 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 75a2c9608..4190d562e 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -122,14 +122,14 @@ class CustomizedComm:
     _TUNE_N_WARMUP = 5
     _TUNE_N_GRAPH_LAUNCHES = 10
     _TUNE_N_OPS_PER_GRAPH = 100
-    _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128]
+    _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 112, 128]
     _CANDIDATE_NTHREADS = [512, 768, 1024]
     _NBLOCKS_LIMIT = {
         "default_allreduce_nvls_packet": 16,
         "default_allreduce_nvls_zero_copy": 32,
-        "default_allreduce_packet": 56,
+        "default_allreduce_packet": 112,
         "default_allreduce_allpair_packet": 56,
-        "default_allreduce_rsag": 64,
+        "default_allreduce_rsag": 128,
         "default_allreduce_rsag_zero_copy": 64,
         "default_allreduce_fullmesh": 64,
         "default_allgather_fullmesh2": 32,
@@ -162,6 +162,11 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
     def _algo(self, collective: str, name: str):
         return self._algos.get((collective, name))
 
+    def _nblocks_limit(self, algo_name: str, size: int) -> int:
+        if algo_name == "default_allreduce_packet" and size < (1 << 20):
+            return 56
+        return self._NBLOCKS_LIMIT.get(algo_name, 128)
+
     def _default_ar_config(self):
         """Fallback allreduce config for barrier / timing sync."""
         pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
@@ -218,7 +223,7 @@ def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
 
     def _barrier_internal(self):
         a, nb, nt = self._default_ar_config()
-        self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
+        self._exec_ar(self._barrier_tensor, a, nb, nt, sym=self.symmetric_memory)
 
     # -- lazy tuning --
 
@@ -233,15 +238,17 @@ def _ar_candidates(self, size: int):
         out = []
         if self.multi_host_mnnvl:
             if size <= 4 << 20:
-                a = self._algo("allreduce", "default_allreduce_packet")
-                if a:
-                    out.append(a)
                 a = self._algo("allreduce", "default_allreduce_allpair_packet")
                 if a:
                     out.append(a)
-                a = self._algo("allreduce", "default_allreduce_nvls_packet")
-                if self._nvls and a:
-                    out.append(a)
+                if size <= 64 << 10:
+                    a = self._algo("allreduce", "default_allreduce_nvls_packet")
+                    if self._nvls and a:
+                        out.append(a)
+                if size > 128 << 10:
+                    a = self._algo("allreduce", "default_allreduce_packet")
+                    if a:
+                        out.append(a)
             if size >= 512 << 10:
                 a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
                 if self.symmetric_memory and a:
@@ -308,7 +315,7 @@ def _run_tune(self, collective, algo, buf, size, nb, nt):
                 stream=torch.cuda.current_stream().cuda_stream,
                 nblocks=nb,
                 nthreads_per_block=nt,
-                symmetric_memory=True,
+                symmetric_memory=self.symmetric_memory,
             )
         else:
             total = size * self.world_size
@@ -337,7 +344,7 @@ def _tune_size(self, collective: str, target_size: int):
         run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
 
         for algo in cands:
-            nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
+            nb_limit = self._nblocks_limit(algo.name, target_size)
             for nb in self._CANDIDATE_NBLOCKS:
                 if nb > nb_limit:
                     continue
@@ -346,7 +353,7 @@ def _tune_size(self, collective: str, target_size: int):
                     ret = run(algo, nb, nt)
                     torch.cuda.synchronize()
                     self._time_buf[0] = float(ret)
-                    self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True)
+                    self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=self.symmetric_memory)
                     if self._time_buf[0].item() != 0:
                         continue
                     used.add(algo)
@@ -375,7 +382,7 @@ def _tune_size(self, collective: str, target_size: int):
                     # Cross-rank timing sync
                     self._time_buf.fill_(elapsed)
                     torch.cuda.current_stream().wait_stream(cs)
-                    self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True)
+                    self._exec_ar(self._time_buf, *self._default_ar_config(), sym=self.symmetric_memory)
                     avg = self._time_buf[self.rank].item() / self.world_size
 
                     if avg < best_time:
@@ -575,7 +582,7 @@ def main():
     n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100)
 
     comm_group = init_dist()
-    cc = CustomizedComm(comm_group, symmetric_memory=True)
+    cc = CustomizedComm(comm_group, symmetric_memory=False)
 
     print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
     benchmark_allreduce(
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index f964b87e9..7f9e6bfd6 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -133,7 +133,7 @@ struct AllreduceRsAgAdapter {
     size_t nelems = inputSize / sizeof(T);
     if (nBlocks == 0 || nThreadsPerBlock == 0) {
       nThreadsPerBlock = 1024;
-      nBlocks = 64;
+      nBlocks = 128;
     }
     allreduceRsAg<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
@@ -144,7 +144,7 @@ struct AllreduceRsAgAdapter {
 
 void AllreduceRsAg::initialize(std::shared_ptr<Communicator> comm) {
   this->conns_ = setupConnections(comm);
-  nChannelsPerConnection_ = 64;
+  nChannelsPerConnection_ = 128;
   comm_ = comm;
   // setup semaphores
   this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
@@ -179,6 +179,10 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr<void> ctx, c
     return CommResult::CommInvalidArgument;
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
+  if (numBlocksAndThreads.first > nChannelsPerConnection_) {
+    WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_);
+    return CommResult::CommInvalidArgument;
+  }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(),
                                 this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank,
                                 algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0,
diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp
index de7ca4719..771126c96 100644
--- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp
@@ -29,7 +29,7 @@ class AllreducePacket : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   const int nSegmentsForScratchBuffer_ = 2;
-  const int maxBlockNum_ = 56;
+  const int maxBlockNum_ = 112;
   std::vector<Connection> conns_;
   uintptr_t flagBuffer_;
   size_t flagBufferSize_;
@@ -37,4 +37,4 @@ class AllreducePacket : public AlgorithmBuilder {
   std::vector<RegisteredMemory> registeredMemories_;
 };
 }  // namespace collective
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp

From 3bc00cb7f0ab309b7a274db29de839730116098c Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 28 Apr 2026 08:24:49 +0000
Subject: [PATCH 05/44] Enable NVLS zero-copy without symmetric memory flag

Allow default_allreduce_nvls_zero_copy to run when the public symmetric_memory flag is false; the algorithm already binds the concrete input and output allocations in its context. Include that fast path in MNNVL tuning and bound allpair/NVLS packet candidates to small sizes so large-message no-symmetric tuning avoids slow or unsafe packet variants.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../torch-integration/customized_comm_with_tuning.py     | 9 +++++----
 .../collectives/allreduce/allreduce_nvls_zero_copy.cu    | 7 +------
 .../include/allreduce/allreduce_nvls_zero_copy.hpp       | 3 +--
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 4190d562e..0736cb68e 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -238,9 +238,10 @@ def _ar_candidates(self, size: int):
         out = []
         if self.multi_host_mnnvl:
             if size <= 4 << 20:
-                a = self._algo("allreduce", "default_allreduce_allpair_packet")
-                if a:
-                    out.append(a)
+                if size <= 128 << 10:
+                    a = self._algo("allreduce", "default_allreduce_allpair_packet")
+                    if a:
+                        out.append(a)
                 if size <= 64 << 10:
                     a = self._algo("allreduce", "default_allreduce_nvls_packet")
                     if self._nvls and a:
@@ -254,7 +255,7 @@ def _ar_candidates(self, size: int):
                 if self.symmetric_memory and a:
                     out.append(a)
                 a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
-                if self._nvls and self.symmetric_memory and a:
+                if self._nvls and a:
                     out.append(a)
                 a = self._algo("allreduce", "default_allreduce_rsag")
                 if a:
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 25077004b..8c360f962 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -122,10 +122,6 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
                                               cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
                                               [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras,
                                               mscclpp::DataType accumDtype) {
-  if (!symmetricMemory_) {
-    WARN("AllreduceNvls requires symmetric memory for now.");
-    return CommResult::CommInvalidArgument;
-  }
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
@@ -169,8 +165,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
 }
 
 mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t,
-                                                                    mscclpp::DataType, bool symmetricMemory) {
-  symmetricMemory_ = symmetricMemory;
+                                                                    mscclpp::DataType, bool) {
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
index d53ea180b..396152800 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
@@ -15,7 +15,6 @@ class AllreduceNvls : public AlgorithmBuilder {
   std::shared_ptr<Algorithm> build() override;
 
  private:
-  bool symmetricMemory_ = false;
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
@@ -41,4 +40,4 @@ class AllreduceNvls : public AlgorithmBuilder {
 }  // namespace collective
 }  // namespace mscclpp
 
-#endif  // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_
\ No newline at end of file
+#endif  // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_

From 533f329971e003e2ca67803c19959d13bf7140ea Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 28 Apr 2026 16:23:23 +0000
Subject: [PATCH 06/44] Tune no-sym MNNVL with RSAG zero-copy

Disable NVLS zero-copy when symmetric memory is not enabled, and allow the RSAG zero-copy path to participate in MNNVL tuning for non-symmetric memory. Cache RSAG zero-copy contexts by the concrete buffer pointers so CUDA graph capture does not create a new registration for every execute call, and reject requests whose block count exceeds the channel count.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../torch-integration/customized_comm_with_tuning.py     | 6 +++---
 .../collectives/allreduce/allreduce_nvls_zero_copy.cu    | 7 ++++++-
 .../collectives/allreduce/allreduce_rsag_zero_copy.cu    | 9 ++++++---
 .../include/allreduce/allreduce_nvls_zero_copy.hpp       | 1 +
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 0736cb68e..6f8f097d0 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -130,7 +130,7 @@ class CustomizedComm:
         "default_allreduce_packet": 112,
         "default_allreduce_allpair_packet": 56,
         "default_allreduce_rsag": 128,
-        "default_allreduce_rsag_zero_copy": 64,
+        "default_allreduce_rsag_zero_copy": 128,
         "default_allreduce_fullmesh": 64,
         "default_allgather_fullmesh2": 32,
     }
@@ -252,10 +252,10 @@ def _ar_candidates(self, size: int):
                         out.append(a)
             if size >= 512 << 10:
                 a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
-                if self.symmetric_memory and a:
+                if a:
                     out.append(a)
                 a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
-                if self._nvls and a:
+                if self._nvls and self.symmetric_memory and a:
                     out.append(a)
                 a = self._algo("allreduce", "default_allreduce_rsag")
                 if a:
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 8c360f962..25077004b 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -122,6 +122,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
                                               cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
                                               [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras,
                                               mscclpp::DataType accumDtype) {
+  if (!symmetricMemory_) {
+    WARN("AllreduceNvls requires symmetric memory for now.");
+    return CommResult::CommInvalidArgument;
+  }
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
@@ -165,7 +169,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
 }
 
 mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t,
-                                                                    mscclpp::DataType, bool) {
+                                                                    mscclpp::DataType, bool symmetricMemory) {
+  symmetricMemory_ = symmetricMemory;
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index c4dea321c..a11da0f89 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -153,6 +153,10 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void
     return CommResult::CommInvalidArgument;
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
+  if (numBlocksAndThreads.first > nChannelsPerConnection_) {
+    WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_);
+    return CommResult::CommInvalidArgument;
+  }
   cudaError_t error =
       allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(),
                 nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream,
@@ -165,9 +169,8 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void
 }
 
 AlgorithmCtxKey AllreduceRsAgZeroCopy::generateAllreduceContextKey(const void* inputBuffer, void* outputBuffer,
-                                                                   size_t size, DataType, bool symmetricMemory) {
+                                                                    size_t size, DataType, bool symmetricMemory) {
   // For non-symmetric algorithms, we use both input and output buffer pointers in the key.
-  static int tag = 0;
   if (symmetricMemory) {
     size_t inputBytes, outputBytes;
     CUdeviceptr inputBasePtr, outputBasePtr;
@@ -175,7 +178,7 @@ AlgorithmCtxKey AllreduceRsAgZeroCopy::generateAllreduceContextKey(const void* i
     MSCCLPP_CUTHROW(cuMemGetAddressRange(&outputBasePtr, &outputBytes, (CUdeviceptr)outputBuffer));
     return AlgorithmCtxKey{(void*)inputBasePtr, (void*)outputBasePtr, inputBytes, outputBytes, 0};
   }
-  return AlgorithmCtxKey{(void*)inputBuffer, outputBuffer, size, size, ++tag};
+  return AlgorithmCtxKey{(void*)inputBuffer, outputBuffer, size, size, 0};
 }
 
 std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_ptr<Communicator> comm, const void* input,
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
index 396152800..c40bd2cda 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
@@ -15,6 +15,7 @@ class AllreduceNvls : public AlgorithmBuilder {
   std::shared_ptr<Algorithm> build() override;
 
  private:
+  bool symmetricMemory_ = false;
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,

From 45a651b2c81ec61bc846d823db1691772c33d280 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 1 May 2026 18:27:17 +0000
Subject: [PATCH 07/44] Decouple IPC-domain hint from bootstrap nRanksPerNode

Replace MSCCLPP_MNNVL_NRANKS_PER_NODE (which overrode TcpBootstrap and
silently changed getNranksPerNode() for every consumer) with a single
algorithm-level helper getIpcDomainNranks(comm) backed by a new
MSCCLPP_IPC_DOMAIN_NRANKS environment variable. The vendor-neutral IPC name
covers both NVLink/MNNVL on NVIDIA and XGMI on AMD. Bootstrap is unchanged
and continues to report physical-host detection.

Collapse the two getCollectiveDomainNranksPerNode overloads into one
canonical helper and route all six allreduce algos (packet,
allpair_packet, nvls_packet, nvls_zero_copy, rsag, rsag_zero_copy)
through it. Update the standalone tuning example to use the new env
name; drop the undeclared MSCCLPP_ENABLE_MNNVL gate; fix
multi_host_mnnvl detection now that nranks_per_node is no longer
overridden by the bootstrap.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 17 +++++---------
 include/mscclpp/env.hpp                       | 10 ++++-----
 src/core/bootstrap/bootstrap.cc               |  5 -----
 src/core/env.cpp                              |  4 ++--
 .../allreduce/allreduce_allpair_packet.cu     |  2 +-
 .../allreduce/allreduce_nvls_packet.cu        |  2 +-
 .../allreduce/allreduce_nvls_zero_copy.cu     |  2 +-
 .../collectives/allreduce/allreduce_packet.cu |  2 +-
 .../collectives/allreduce/allreduce_rsag.cu   |  2 +-
 .../allreduce/allreduce_rsag_zero_copy.cu     |  4 ++--
 src/ext/collectives/collective_utils.cc       | 22 +++++--------------
 .../collectives/include/collective_utils.hpp  |  9 ++++++--
 12 files changed, 33 insertions(+), 48 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 6f8f097d0..1d54cfa77 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -13,6 +13,7 @@
 import sys
 import traceback
 
+
 def _get_bootstrap_world_size():
     for name in ("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS"):
         value = os.environ.get(name)
@@ -22,13 +23,8 @@ def _get_bootstrap_world_size():
 
 
 _bootstrap_world_size = _get_bootstrap_world_size()
-if (
-    _bootstrap_world_size
-    and _bootstrap_world_size > 1
-    and "MSCCLPP_MNNVL_NRANKS_PER_NODE" not in os.environ
-    and os.environ.get("MSCCLPP_ENABLE_MNNVL", "1") != "0"
-):
-    os.environ["MSCCLPP_MNNVL_NRANKS_PER_NODE"] = str(_bootstrap_world_size)
+if _bootstrap_world_size and _bootstrap_world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ:
+    os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_bootstrap_world_size)
 
 import torch
 import mscclpp
@@ -140,11 +136,10 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.rank = comm.my_rank
         self.world_size = comm.nranks
         self.nranks_per_node = comm.nranks_per_node
-        self.mnnvl_domain = self.world_size > 1 and os.environ.get("MSCCLPP_MNNVL_NRANKS_PER_NODE") == str(
-            self.world_size
-        )
+        nvlink_domain_nranks = int(os.environ.get("MSCCLPP_IPC_DOMAIN_NRANKS", "0"))
+        self.mnnvl_domain = self.world_size > 1 and nvlink_domain_nranks >= self.world_size
         self.multi_node = self.world_size > self.nranks_per_node and not self.mnnvl_domain
-        self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > 1
+        self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > self.nranks_per_node
         self.symmetric_memory = symmetric_memory
         self._nvls = mscclpp.is_nvls_supported()
 
diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index 09d364c3b..0dd63ed74 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -119,11 +119,11 @@ class Env {
   /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
   const int ibGidIndex;
 
-  /// Env name: `MSCCLPP_MNNVL_NRANKS_PER_NODE`. Overrides the NVLink-domain size reported by the bootstrap.
-  /// This is intended for Multi-Node NVLink (MNNVL) deployments where a single CUDA IPC / NVLS domain spans
-  /// multiple hosts and should be treated as one collective peer group.
-  /// If unset or non-positive, the bootstrap falls back to physical-host-based detection.
-  const int mnnvlNranksPerNode;
+  /// Env name: `MSCCLPP_IPC_DOMAIN_NRANKS`. Number of ranks that share a single GPU-IPC-reachable peer
+  /// group (e.g. a Multi-Node NVLink fabric such as GB200 NVL72, or an AMD XGMI domain). This hint is
+  /// consumed only by the collective algorithms; it does not affect `Bootstrap::getNranksPerNode()` or
+  /// any other layer. If unset or non-positive, algorithms fall back to `bootstrap->getNranksPerNode()`.
+  const int ipcDomainNranks;
 
  private:
   Env();
diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc
index c84ef4c0f..b3032e502 100644
--- a/src/core/bootstrap/bootstrap.cc
+++ b/src/core/bootstrap/bootstrap.cc
@@ -5,7 +5,6 @@
 
 #include <cstring>
 #include <mscclpp/core.hpp>
-#include <mscclpp/env.hpp>
 #include <mscclpp/errors.hpp>
 #include <sstream>
 #include <thread>
@@ -434,10 +433,6 @@ void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) {
 
 int TcpBootstrap::Impl::getNranksPerNode() {
   if (nRanksPerNode_ > 0) return nRanksPerNode_;
-  if (env()->mnnvlNranksPerNode > 0) {
-    nRanksPerNode_ = env()->mnnvlNranksPerNode;
-    return nRanksPerNode_;
-  }
   int nRanksPerNode = 0;
   bool useIpv4 = peerCommAddresses_[rank_].sa.sa_family == AF_INET;
   for (int i = 0; i < nRanks_; i++) {
diff --git a/src/core/env.cpp b/src/core/env.cpp
index b46670d79..18d548b02 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -68,7 +68,7 @@ Env::Env()
       forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
       forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
       ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)),
-      mnnvlNranksPerNode(readEnv<int>("MSCCLPP_MNNVL_NRANKS_PER_NODE", 0)) {}
+      ipcDomainNranks(readEnv<int>("MSCCLPP_IPC_DOMAIN_NRANKS", 0)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -98,7 +98,7 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
     logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
     logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
-    logEnv("MSCCLPP_MNNVL_NRANKS_PER_NODE", globalEnv->mnnvlNranksPerNode);
+    logEnv("MSCCLPP_IPC_DOMAIN_NRANKS", globalEnv->ipcDomainNranks);
   }
   return globalEnv;
 }
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 9516ad786..690d0eb4e 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -140,7 +140,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
+  ctx->nRanksPerNode = getIpcDomainNranks(comm);
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index 21f710283..d331cc672 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -94,7 +94,7 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm);
+  ctx->nRanksPerNode = getIpcDomainNranks(comm);
 
   // setup channels
   ctx->switchChannels = this->switchChannels_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 25077004b..36fcf860b 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -183,7 +183,7 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm);
+  ctx->nRanksPerNode = getIpcDomainNranks(comm);
 
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index c195aefa3..d631c35a8 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -264,7 +264,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
+  ctx->nRanksPerNode = getIpcDomainNranks(comm);
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index 7f9e6bfd6..4c46bf9b3 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -203,7 +203,7 @@ std::shared_ptr<void> AllreduceRsAg::initAllreduceContext(std::shared_ptr<Commun
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
+  ctx->nRanksPerNode = getIpcDomainNranks(comm);
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index a11da0f89..67eed6d31 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -169,7 +169,7 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void
 }
 
 AlgorithmCtxKey AllreduceRsAgZeroCopy::generateAllreduceContextKey(const void* inputBuffer, void* outputBuffer,
-                                                                    size_t size, DataType, bool symmetricMemory) {
+                                                                   size_t size, DataType, bool symmetricMemory) {
   // For non-symmetric algorithms, we use both input and output buffer pointers in the key.
   if (symmetricMemory) {
     size_t inputBytes, outputBytes;
@@ -186,7 +186,7 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_);
+  ctx->nRanksPerNode = getIpcDomainNranks(comm);
 
   ctx->memorySemaphores = this->semaphores_;
 
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index 4d46c53bc..de33009c6 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -6,6 +6,7 @@
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
 #include <mscclpp/core.hpp>
+#include <mscclpp/env.hpp>
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
 
@@ -69,23 +70,12 @@ std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> setupMemoryS
   return memorySemaphores;
 }
 
-int getCollectiveDomainNranksPerNode(std::shared_ptr<mscclpp::Communicator> comm,
-                                     const std::vector<mscclpp::Connection>& connections) {
-  const int worldSize = comm->bootstrap()->getNranks();
-  const int nRanksPerNode = comm->bootstrap()->getNranksPerNode();
-  if (worldSize <= nRanksPerNode) {
-    return nRanksPerNode;
+int getIpcDomainNranks(std::shared_ptr<mscclpp::Communicator> comm) {
+  const int envValue = mscclpp::env()->ipcDomainNranks;
+  if (envValue > 0) {
+    return envValue;
   }
-  const bool allPeersUseCudaIpc =
-      std::all_of(connections.begin(), connections.end(),
-                  [](const auto& connection) { return connection.transport() == mscclpp::Transport::CudaIpc; });
-  return allPeersUseCudaIpc ? worldSize : nRanksPerNode;
-}
-
-int getCollectiveDomainNranksPerNode(std::shared_ptr<mscclpp::Communicator> comm) {
-  const int worldSize = comm->bootstrap()->getNranks();
-  const int nRanksPerNode = comm->bootstrap()->getNranksPerNode();
-  return worldSize > nRanksPerNode ? worldSize : nRanksPerNode;
+  return comm->bootstrap()->getNranksPerNode();
 }
 
 std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> setupMemoryChannelDeviceHandles(
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 38362a659..44a214020 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -50,8 +50,13 @@ std::vector<MemoryChannel> setupMemoryChannels(
 std::vector<Connection> setupConnections(std::shared_ptr<Communicator> comm);
 std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> setupMemorySemaphores(
     std::shared_ptr<Communicator> comm, const std::vector<Connection>& connections, int nChannelsPerConnection);
-int getCollectiveDomainNranksPerNode(std::shared_ptr<Communicator> comm, const std::vector<Connection>& connections);
-int getCollectiveDomainNranksPerNode(std::shared_ptr<Communicator> comm);
+
+/// Number of ranks that participate in the same GPU-IPC-reachable peer group (e.g. a single host or
+/// a Multi-Node NVLink fabric, or an AMD XGMI domain). Returns the value of `MSCCLPP_IPC_DOMAIN_NRANKS`
+/// if set to a positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. This is
+/// intentionally independent of `nRanksPerNode` so that algorithms can opt in to MNNVL-like behavior
+/// without changing the meaning of bootstrap-level APIs.
+int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
 
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<MemoryChannel>& memoryChannels);

From 2a2fca8a587a658888fe5a21f5b42cd07bf6cec2 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 1 May 2026 19:06:07 +0000
Subject: [PATCH 08/44] Rename collective ctx/kernel param nRanksPerNode to
 ipcDomainNranks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AlgorithmCtx field and the kernel/host parameters that hold the
collective's IPC peer-group size were named nRanksPerNode, which is
misleading on Multi-Node NVLink (where the value spans multiple hosts)
and on AMD (where the relevant fabric is XGMI, not NVLink). Rename to
ipcDomainNranks throughout the collective algorithms to match the
neutral naming introduced for the env helper.

Scope intentionally limited to src/ext/collectives/. The following are
left untouched on purpose:
  - Bootstrap::getNranksPerNode() — physical-host detection, semantics
    unchanged.
  - Algorithm::Constraint::nRanksPerNode (public API in
    include/mscclpp/algorithm.hpp) and the DSL plan config in
    algorithm_collection_builder.cc — these describe a plan's required
    physical topology.
  - NCCL adapter (src/ext/nccl/) — preserves NCCL ABI compatibility.
  - MAX_NRANKS_PER_NODE — sizing constant for shared-memory arrays.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../allgather/allgather_fullmesh.cu           | 10 +++----
 .../allgather/allgather_fullmesh_2.cu         | 10 +++----
 .../allreduce/allreduce_allpair_packet.cu     | 14 ++++-----
 .../allreduce/allreduce_fullmesh.cu           | 12 ++++----
 .../allreduce_nvls_block_pipeline.cu          | 30 +++++++++----------
 .../allreduce/allreduce_nvls_packet.cu        |  4 +--
 .../allreduce/allreduce_nvls_warp_pipeline.cu | 28 ++++++++---------
 .../allreduce/allreduce_nvls_zero_copy.cu     | 20 ++++++-------
 .../collectives/allreduce/allreduce_packet.cu | 20 ++++++-------
 .../collectives/allreduce/allreduce_rsag.cu   | 24 +++++++--------
 .../allreduce/allreduce_rsag_pipeline.cu      | 22 +++++++-------
 .../allreduce/allreduce_rsag_zero_copy.cu     | 26 ++++++++--------
 .../collectives/include/collective_utils.hpp  | 10 +++----
 13 files changed, 115 insertions(+), 115 deletions(-)

diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index 17054869e..cbe199bcb 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -11,8 +11,8 @@ namespace collective {
 template <bool IsOutOfPlace>
 __global__ void __launch_bounds__(1024, 1)
     allgatherFullmesh(void* buff, void* scratch, void* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
-                      int rank, int nRanksPerNode, [[maybe_unused]] int worldSize, size_t nelems) {
-  const int nPeer = nRanksPerNode - 1;
+                      int rank, int ipcDomainNranks, [[maybe_unused]] int worldSize, size_t nelems) {
+  const int nPeer = ipcDomainNranks - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   // assume (nelems * sizeof(T)) is divisible by 16
   const size_t nInt4 = nelems * sizeof(int) / sizeof(int4);
@@ -129,11 +129,11 @@ CommResult AllgatherFullmesh::allgatherKernelFunc(const std::shared_ptr<void> ct
   if ((char*)input == (char*)output + rank * inputSize) {
     allgatherFullmesh<false><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank,
-        ctx->nRanksPerNode, ctx->workSize, nElem);
+        ctx->ipcDomainNranks, ctx->workSize, nElem);
   } else {
     allgatherFullmesh<true><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank,
-        ctx->nRanksPerNode, ctx->workSize, nElem);
+        ctx->ipcDomainNranks, ctx->workSize, nElem);
   }
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
@@ -150,7 +150,7 @@ std::shared_ptr<void> AllgatherFullmesh::initAllgatherContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
 
   // setup semaphores
   ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
index 9d169d689..6e69f81ca 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
@@ -12,7 +12,7 @@ __device__ DeviceSyncer deviceSyncer;
 template <bool IsOutOfPlace>
 __global__ void __launch_bounds__(1024, 1)
     allgatherFullmesh2(void* sendbuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
-                       size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode,
+                       size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t ipcDomainNranks,
                        size_t nelemsPerGPU) {
   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   const size_t lid = tid % WARP_SIZE;
@@ -20,7 +20,7 @@ __global__ void __launch_bounds__(1024, 1)
 
   const size_t nThread = blockDim.x * gridDim.x;
   const size_t nWarp = nThread / WARP_SIZE;
-  const size_t nPeer = nRanksPerNode - 1;
+  const size_t nPeer = ipcDomainNranks - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   auto memChans = memoryChannels + chanOffset;
 
@@ -140,11 +140,11 @@ CommResult AllgatherFullmesh2::allgatherKernelFunc(const std::shared_ptr<void> c
   if ((char*)input == (char*)output + rank * inputSize) {
     allgatherFullmesh2<false><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize,
-        ctx->nRanksPerNode, nElem);
+        ctx->ipcDomainNranks, nElem);
   } else {
     allgatherFullmesh2<true><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize,
-        ctx->nRanksPerNode, nElem);
+        ctx->ipcDomainNranks, nElem);
   }
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
@@ -159,7 +159,7 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
 
   // setup semaphores
   ctx->memorySemaphores = this->memorySemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 690d0eb4e..5be2f3360 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -14,11 +14,11 @@ namespace collective {
 
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
-                                  size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
+                                  size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks,
                                   int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
                                   uint32_t flagSize) {
   if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
-  const int nPeers = nRanksPerNode - 1;
+  const int nPeers = ipcDomainNranks - 1;
 
   uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
   size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0;
@@ -72,7 +72,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllpairAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize,
                           cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0,
                           int nThreadsPerBlock = 0) {
     using ChannelType = DeviceHandle<MemoryChannel>;
@@ -84,7 +84,7 @@ struct AllpairAdapter {
     }
     allreduceAllPairs<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize);
+        ipcDomainNranks, worldSize, nelems, numScratchBuff, flags, flagSize);
     return cudaGetLastError();
   }
 };
@@ -108,7 +108,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
     blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
   }
   // nBlocks must be at least nPeers for allpair — each block maps to one peer.
-  const int nPeers = algoCtx->nRanksPerNode - 1;
+  const int nPeers = algoCtx->ipcDomainNranks - 1;
   if (nPeers > 0 && blockAndThreadNum.first < nPeers) {
     return CommResult::CommInvalidArgument;
   }
@@ -124,7 +124,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr,
-                nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode,
+                nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->ipcDomainNranks,
                 algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_,
                 this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
@@ -140,7 +140,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index 9d144c621..b95dcb284 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -13,8 +13,8 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(512, 1)
     allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
                       DeviceHandle<MemoryChannel>* memoryOutChannels, size_t channelOutDataOffset, int rank,
-                      int nRanksPerNode, int worldSize, size_t nelems) {
-  const int nPeer = nRanksPerNode - 1;
+                      int ipcDomainNranks, int worldSize, size_t nelems) {
+  const int nPeer = ipcDomainNranks - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   // assume (nelems * sizeof(T)) is divisible by (16 * worldSize)
   const size_t nInt4 = nelems * sizeof(T) / sizeof(int4);
@@ -159,7 +159,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceAllconnectAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t,
-                          size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize,
+                          size_t channelOutDataOffset, size_t, int rank, int ipcDomainNranks, int worldSize,
                           size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks,
                           int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<MemoryChannel>;
@@ -168,7 +168,7 @@ struct AllreduceAllconnectAdapter {
     if (nThreadsPerBlock == 0) nThreadsPerBlock = 512;
     allreduceFullmesh<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels,
-        channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems);
+        channelOutDataOffset, rank, ipcDomainNranks, worldSize, nelems);
     return cudaGetLastError();
   }
 };
@@ -225,7 +225,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(),
-                nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
+                nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize,
                 stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error));
@@ -252,7 +252,7 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
 
   // setup semaphores
   ctx->memorySemaphores = this->outputSemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 2d71cd638..3ecb361fc 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -20,15 +20,15 @@ __global__ void __launch_bounds__(1024, 1)
                                [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
                                [[maybe_unused]] DeviceHandle<SwitchChannel>* switchChannels,
                                [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize,
-                               [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) {
+                               [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
   constexpr int alignment = 16;
-  int nPeers = nRanksPerNode - 1;
-  int nBlocksForCopy = nRanksPerNode * 2;
-  int nBlocksForReduce = nRanksPerNode;
+  int nPeers = ipcDomainNranks - 1;
+  int nBlocksForCopy = ipcDomainNranks * 2;
+  int nBlocksForReduce = ipcDomainNranks;
   int copyReduceRatio = nBlocksForCopy / nBlocksForReduce;
-  size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode;
-  size_t sizePerRank = size / nRanksPerNode;
+  size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks;
+  size_t sizePerRank = size / ipcDomainNranks;
   assert(sizePerRank % alignment == 0);
   uint32_t sizePerBlock =
       ((sizePerRank + (nBlocksForCopy - 1)) / nBlocksForCopy + alignment - 1) / alignment * alignment;
@@ -68,7 +68,7 @@ __global__ void __launch_bounds__(1024, 1)
         deviceSemaphore[bid + 2 * nBlocksForCopy].acquire();
       }
       __syncthreads();
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < ipcDomainNranks; i++) {
         size_t blockOffset = it * unitSize + bid * sizePerBlock + i * sizePerRank;
         uint32_t scratchOffset = scratchIt * unitSize + bid * scratchSizePerBlock + i * scratchSizePerRank;
         char* srcData = (char*)src + blockOffset;
@@ -125,7 +125,7 @@ __global__ void __launch_bounds__(1024, 1)
         channels->wait();
       }
       __syncthreads();
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < ipcDomainNranks; i++) {
         size_t blockOffset = it * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * sizePerBlock + i * sizePerRank;
         uint32_t scratchOffset = scratchIt * unitSize +
                                  (bid - nBlocksForCopy - nBlocksForReduce) * scratchSizePerBlock +
@@ -150,7 +150,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsBlockPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -166,9 +166,9 @@ struct NvlsBlockPipelineAdapter {
 #endif
       {
         using ChannelType = DeviceHandle<BaseMemoryChannel>;
-        allreduceNvlsBlockPipeline<T>
-            <<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
-                                                       nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode);
+        allreduceNvlsBlockPipeline<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+            input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank,
+            ipcDomainNranks);
         return cudaGetLastError();
       }
   }
@@ -200,11 +200,11 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->nRanksPerNode * 5, 1024};
+    blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
-                                ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
+                                ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0,
                                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error));
@@ -222,7 +222,7 @@ std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index d331cc672..2ef0516e3 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -94,7 +94,7 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
 
   // setup channels
   ctx->switchChannels = this->switchChannels_;
@@ -123,7 +123,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr,
-                0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream,
+                0, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream,
                 (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error));
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 9be621f08..1bdac9ada 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -18,15 +18,15 @@ __global__ void __launch_bounds__(1024, 1)
                               [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
                               [[maybe_unused]] DeviceHandle<SwitchChannel>* multicast, [[maybe_unused]] size_t size,
                               [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank,
-                              [[maybe_unused]] int nRanksPerNode) {
+                              [[maybe_unused]] int ipcDomainNranks) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
   constexpr int alignment = 16;
-  int nPeers = nRanksPerNode - 1;
+  int nPeers = ipcDomainNranks - 1;
   int nBlocks = gridDim.x;
   int nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION;
   int bid = blockIdx.x;
-  size_t sizePerRank = size / nRanksPerNode;
-  size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode;
+  size_t sizePerRank = size / ipcDomainNranks;
+  size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks;
   const size_t maxSizePerBlock = ((sizePerRank + nBlocks - 1) / nBlocks + alignment - 1) / alignment * alignment;
   size_t start = bid * maxSizePerBlock;
   size_t end = min(start + maxSizePerBlock, sizePerRank);
@@ -53,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1)
     lastIterSize = sizePerBlock % copyPerIter;
   }
 
-  const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x * 2;
+  const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x * 2;
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_NRANKS_PER_NODE - 1) * 2];
   const int lid = threadIdx.x % WARP_SIZE;
@@ -68,7 +68,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t iterSize = (it == nIter - 1) ? lastIterSize : copyPerIter;
     if (warpId < endCopyWid) {
       int tidInCopy = threadIdx.x;
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < ipcDomainNranks; i++) {
         size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter;
         size_t offsetScratch =
             i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock;
@@ -99,7 +99,7 @@ __global__ void __launch_bounds__(1024, 1)
         channels[tidInRecvCopy + nPeers].wait();
       }
       asm volatile("bar.sync %0, %1;" ::"r"(3), "r"((NRECV_COPY_WARPS)*WARP_SIZE) : "memory");
-      for (int i = 0; i < nRanksPerNode; i++) {
+      for (int i = 0; i < ipcDomainNranks; i++) {
         size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter;
         size_t offsetScratch =
             i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock;
@@ -116,7 +116,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsWarpPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -132,9 +132,9 @@ struct NvlsWarpPipelineAdapter {
 #endif
       {
         using ChannelType = DeviceHandle<BaseMemoryChannel>;
-        allreduceNvlsWarpPipeline<T>
-            <<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
-                                                       nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode);
+        allreduceNvlsWarpPipeline<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+            input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank,
+            ipcDomainNranks);
         return cudaGetLastError();
       }
   }
@@ -165,11 +165,11 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->nRanksPerNode * 4, 1024};
+    blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
-                                ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
+                                ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0,
                                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error));
@@ -187,7 +187,7 @@ std::shared_ptr<void> AllreduceNvlsWarpPipeline::initAllreduceContext(std::share
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 36fcf860b..a9d46d4f5 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -19,12 +19,12 @@ __global__ void __launch_bounds__(1024, 1)
                   [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
                   [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOut,
                   [[maybe_unused]] size_t channelInOffset, [[maybe_unused]] size_t channelOutOffset,
-                  [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) {
+                  [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-  int nPeers = nRanksPerNode - 1;
+  int nPeers = ipcDomainNranks - 1;
   int nBlocks = gridDim.x;
   int bid = blockIdx.x;
-  size_t sizePerRank = size / nRanksPerNode;
+  size_t sizePerRank = size / ipcDomainNranks;
   const size_t minAlign = 16;
   // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore
   size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks;
@@ -40,14 +40,14 @@ __global__ void __launch_bounds__(1024, 1)
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOutPtr = multicastOut + bid;
 
-  const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x;
+  const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x;
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
   const int lid = threadIdx.x % WARP_SIZE;
   // Each warp redundantly loads all entries (same value, benign race) so that
   // every warp has the data its threads will read after __syncwarp(). Required
   // when nPeers > WARP_SIZE (MNNVL/NVL72 → 71 peers).
-  for (int i = lid; i < nRanksPerNode - 1; i += WARP_SIZE) {
+  for (int i = lid; i < ipcDomainNranks - 1; i += WARP_SIZE) {
     channels[i] = memoryChans[i];
   }
   __syncwarp();
@@ -75,7 +75,7 @@ struct NvlsAdapter {
   static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*,
                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
-                          size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize,
+                          size_t channelOutOffset, size_t, int rank, int ipcDomainNranks, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -93,7 +93,7 @@ struct NvlsAdapter {
       using ChannelType = DeviceHandle<mscclpp::BaseMemoryChannel>;
       allreduceNvls<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
                                                                  nvlsOutChannels, channelInOffset, channelOutOffset,
-                                                                 inputSize, rank, nRanksPerNode);
+                                                                 inputSize, rank, ipcDomainNranks);
       return cudaGetLastError();
     }
   }
@@ -145,7 +145,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
   if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) {
-    numBlocksAndThreads = {::min(ctx->nRanksPerNode, MAX_NBLOCKS), 1024};
+    numBlocksAndThreads = {::min(ctx->ipcDomainNranks, MAX_NBLOCKS), 1024};
     // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with
     // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which
     // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS].
@@ -159,7 +159,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
   }
   cudaError_t error =
       allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels,
-                nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize,
+                nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize,
                 inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error));
@@ -183,7 +183,7 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
 
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index d631c35a8..ebb2f618a 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -15,7 +15,7 @@ namespace collective {
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
-                    size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize,
+                    size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize,
                     size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff
 #if defined(ENABLE_NPKIT)
                     ,
@@ -53,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1)
   else
     nelems = nelems / (sizeof(int) / sizeof(T));
 
-  const int nPeers = nRanksPerNode - 1;
+  const int nPeers = ipcDomainNranks - 1;
   const size_t nPkts = nelems / 2;
 
   uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
@@ -156,7 +156,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct PacketAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
-                          size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize,
                           cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff,
                           int nBlocks = 0, int nThreadsPerBlock = 0) {
     using ChannelType = DeviceHandle<MemoryChannel>;
@@ -167,20 +167,20 @@ struct PacketAdapter {
     size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS;
     allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, sharedMemSize, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
+        ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
         NpKit::GetCpuTimestamp());
 #else
     allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
+        ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
 #endif
     return cudaGetLastError();
   }
 };
 
-inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerNode, int worldSize,
+inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int ipcDomainNranks, int worldSize,
                                                           [[maybe_unused]] DataType dtype) {
-  int nBlocks = (nRanksPerNode - 1) * 4;
+  int nBlocks = (ipcDomainNranks - 1) * 4;
   int nThreadsPerBlock = 1024;
   if (inputSize >= 32768) {
     nBlocks = (worldSize - 1) * 8;
@@ -232,7 +232,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize, ctx->nRanksPerNode, dtype);
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->ipcDomainNranks, ctx->workSize, dtype);
   }
 
   size_t sendBytes;
@@ -248,7 +248,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr,
-                channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
+                channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize,
                 stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
@@ -264,7 +264,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index 4c46bf9b3..93e2d0c46 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -31,18 +31,18 @@ namespace collective {
 template <ReduceOp OpType, typename T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
-                  DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int nRanksPerNode,
+                  DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int ipcDomainNranks,
                   int worldSize, size_t nelems) {
   int blockId = blockIdx.x;
-  uint32_t nPeers = nRanksPerNode - 1;
+  uint32_t nPeers = ipcDomainNranks - 1;
 
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
-  uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * nRanksPerNode;
-  uint32_t nelemsPerRank = alignedNelems / nRanksPerNode;
+  uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 *
+                           nelemsPerInt4 * ipcDomainNranks;
+  uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t lastInt4Index = nelems / nelemsPerInt4;
   uint32_t remainder = nelems % nelemsPerInt4;
@@ -59,7 +59,7 @@ __global__ void __launch_bounds__(1024, 1)
     nInt4PerBlock += remainderForBlock;
   }
   if (nInt4PerBlock == 0) return;
-  uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerNode;
+  uint32_t nInt4ForCopy = nInt4PerBlock * ipcDomainNranks;
 
   for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) {
     int rankIdx = idx / nInt4PerBlock;
@@ -84,13 +84,13 @@ __global__ void __launch_bounds__(1024, 1)
     if (offset > lastInt4Index) continue;
     int4 tmp = scratch4[offset];
     for (uint32_t i = 0; i < nPeers; i++) {
-      int rankIdx = (rank + i + 1) % nRanksPerNode;
+      int rankIdx = (rank + i + 1) % ipcDomainNranks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       int4 data = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
       tmp = calVector<T, OpType>(data, tmp);
     }
     for (uint32_t i = 0; i < nPeers; i++) {
-      int rankIdx = (rank + i + 1) % nRanksPerNode;
+      int rankIdx = (rank + i + 1) % ipcDomainNranks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[peerIdx], offset, tmp);
     }
@@ -127,7 +127,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream,
+                          size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream,
                           void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
@@ -137,7 +137,7 @@ struct AllreduceRsAgAdapter {
     }
     allreduceRsAg<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        nRanksPerNode, worldSize, nelems);
+        ipcDomainNranks, worldSize, nelems);
     return cudaGetLastError();
   }
 };
@@ -185,7 +185,7 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr<void> ctx, c
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(),
                                 this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank,
-                                algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0,
+                                algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, nullptr, 0, 0,
                                 numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
@@ -203,7 +203,7 @@ std::shared_ptr<void> AllreduceRsAg::initAllreduceContext(std::shared_ptr<Commun
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
index eabe3dc53..9f63e5905 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
@@ -86,7 +86,7 @@ template <ReduceOp OpType, typename T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
                           DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank,
-                          int nRanksPerNode, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut,
+                          int ipcDomainNranks, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut,
                           uint32_t nblocksForReduce, uint32_t nblocksForRecv) {
   uint32_t bid = blockIdx.x;
   constexpr uint32_t nStepsPerIter = 4;
@@ -94,7 +94,7 @@ __global__ void __launch_bounds__(1024, 1)
   uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter;
   const uint32_t chunkSize = nInt4PerIter * worldSize;
   uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize;
-  uint32_t nPeers = nRanksPerNode - 1;
+  uint32_t nPeers = ipcDomainNranks - 1;
   int4* scratch4 = reinterpret_cast<int4*>((char*)scratch);
   const uint32_t scratchIterStride = 2 * chunkSize;  // one for AS, one for AG
   const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride;
@@ -111,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1)
       __syncthreads();
       uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x;
       for (uint32_t peer = 0; peer < nPeers; peer++) {
-        int remoteRankId = (rank + peer + 1) % nRanksPerNode;
+        int remoteRankId = (rank + peer + 1) % ipcDomainNranks;
         int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1;
         // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot)
         uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter;
@@ -164,7 +164,7 @@ __global__ void __launch_bounds__(1024, 1)
         int4 tmp = loadVec(buff, myChunkOffset, nelems);
         // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer])
         for (uint32_t peer = 0; peer < nPeers; peer++) {
-          int remoteRankId = (rank + peer + 1) % nRanksPerNode;
+          int remoteRankId = (rank + peer + 1) % ipcDomainNranks;
           uint32_t peerSlotOffset =
               baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut;
           int4 data = scratch4[peerSlotOffset];
@@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024, 1)
         uint32_t dstOffset =
             baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut;
         for (uint32_t i = 0; i < nPeers; i++) {
-          int peerIdx = (rank + i + 1) % nRanksPerNode;
+          int peerIdx = (rank + i + 1) % ipcDomainNranks;
           int index = peerIdx < rank ? peerIdx : peerIdx - 1;
           mscclpp::write<int4>(((void**)remoteMemories)[index], dstOffset, tmp);
         }
@@ -203,7 +203,7 @@ __global__ void __launch_bounds__(1024, 1)
       __syncthreads();
       // Copy other ranks' reduced chunks from scratch to result
       for (uint32_t peer = 0; peer < nPeers; peer++) {
-        int remoteRankId = (rank + peer + 1) % nRanksPerNode;
+        int remoteRankId = (rank + peer + 1) % ipcDomainNranks;
         for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) {
           uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv +
                             step * blockDim.x * nblocksForRecv;
@@ -224,7 +224,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
+                          size_t scratchSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
@@ -248,7 +248,7 @@ struct AllreduceRsAgPipelineAdapter {
     }
     allreduceRsAgPipeline<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        nRanksPerNode, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv);
+        ipcDomainNranks, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv);
     return cudaGetLastError();
   }
 };
@@ -288,8 +288,8 @@ CommResult AllreduceRsAgPipeline::allreduceKernelFunc(
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(),
                                 this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_,
-                                algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0,
-                                0, numBlocksAndThreads.first, numBlocksAndThreads.second);
+                                algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, nullptr,
+                                0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
@@ -306,7 +306,7 @@ std::shared_ptr<void> AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index 67eed6d31..ea6643255 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -35,7 +35,7 @@ __device__ mscclpp::DeviceSyncer globalSyncer;
 //
 // This approach requires registering both input and output buffers as remote
 // memories (2 * nPeers handles), but avoids scratch buffer allocation and
-// the extra copy steps of the standard RSAG. nRanksPerNode is accepted at
+// the extra copy steps of the standard RSAG. ipcDomainNranks is accepted at
 // runtime, which allows the same kernel to handle any NVLink-domain size
 // (including Multi-Node NVLink fabrics up to NVL72).
 
@@ -43,18 +43,18 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
                           DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank,
-                          int nRanksPerNode, int worldSize, size_t nelems) {
+                          int ipcDomainNranks, int worldSize, size_t nelems) {
   int blockId = blockIdx.x;
 
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
-  const int NPeers = nRanksPerNode - 1;
+  const int NPeers = ipcDomainNranks - 1;
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
   const uint32_t outputRemoteBufferOffset = NPeers;
-  uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * nRanksPerNode;
-  uint32_t nelemsPerRank = alignedNelems / nRanksPerNode;
+  uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 *
+                           nelemsPerInt4 * ipcDomainNranks;
+  uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4;
 
@@ -87,14 +87,14 @@ __global__ void __launch_bounds__(1024, 1)
     int4 data;
     AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(tmp_raw);
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % nRanksPerNode;
+      int rankIdx = (rank + i + 1) % ipcDomainNranks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       data = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
       acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data);
     }
     int4 tmp = mscclpp::downcastVector<T, AccumT, int4>(acc);
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % nRanksPerNode;
+      int rankIdx = (rank + i + 1) % ipcDomainNranks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp);
     }
@@ -112,7 +112,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgZeroCopyAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream,
+                          size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream,
                           void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
@@ -125,7 +125,7 @@ struct AllreduceRsAgZeroCopyAdapter {
     }
     allreduceRsAgZeroCopy<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        nRanksPerNode, worldSize, nelems);
+        ipcDomainNranks, worldSize, nelems);
     return cudaGetLastError();
   }
 };
@@ -159,8 +159,8 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void
   }
   cudaError_t error =
       allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(),
-                nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream,
-                nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
+                nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize,
+                stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
@@ -186,7 +186,7 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->nRanksPerNode = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
 
   ctx->memorySemaphores = this->semaphores_;
 
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 44a214020..7fa6a81ea 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -27,8 +27,8 @@ namespace mscclpp {
 namespace collective {
 constexpr int NUM_NVLS_CONNECTION = 8;
 // Sized to cover MAX_NRANKS_PER_NODE-scale allreduce algos whose device-side
-// semaphore indices grow as O(nRanksPerNode) (e.g. nvls_block_pipeline uses
-// up to ~5 * nRanksPerNode entries).
+// semaphore indices grow as O(ipcDomainNranks) (e.g. nvls_block_pipeline uses
+// up to ~5 * ipcDomainNranks entries).
 constexpr int NUM_SEMAPHORES = 512;
 
 // Upper bound on the number of NVLink-reachable ranks that participate in a
@@ -54,8 +54,8 @@ std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> setupMemorySemaphores
 /// Number of ranks that participate in the same GPU-IPC-reachable peer group (e.g. a single host or
 /// a Multi-Node NVLink fabric, or an AMD XGMI domain). Returns the value of `MSCCLPP_IPC_DOMAIN_NRANKS`
 /// if set to a positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. This is
-/// intentionally independent of `nRanksPerNode` so that algorithms can opt in to MNNVL-like behavior
-/// without changing the meaning of bootstrap-level APIs.
+/// intentionally independent of `Bootstrap::getNranksPerNode()` so that algorithms can opt in to
+/// MNNVL-like behavior without changing the meaning of bootstrap-level APIs.
 int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
 
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
@@ -86,7 +86,7 @@ class AlgorithmCtx {
  public:
   int rank;
   int workSize;
-  int nRanksPerNode;
+  int ipcDomainNranks;
 
   std::vector<RegisteredMemory> registeredMemories;
   std::vector<MemoryChannel> memoryChannels;

From 2efda4d81964f87092d7dc75d5d89c1f2c9166ee Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 1 May 2026 23:09:22 +0000
Subject: [PATCH 09/44] Restore compile-time templated NRanksPerNode for
 rsag_zero_copy

Recovers the per-thread int4 register array + #pragma unroll for the
{4, 8} rank cases. All NPeers remote reads are issued in parallel so
their latency overlaps instead of being serialized by the runtime
fused load+reduce loop. The runtime-domain (NVL72) fallback is
removed; the algo now returns cudaErrorInvalidValue for unsupported
ipcDomainNranks, and rsag_zero_copy is dropped from the MNNVL
candidate list in the tuning example.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            |  3 -
 .../allreduce/allreduce_rsag_zero_copy.cu     | 60 ++++++++++++-------
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 1d54cfa77..9ad7f22a5 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -246,9 +246,6 @@ def _ar_candidates(self, size: int):
                     if a:
                         out.append(a)
             if size >= 512 << 10:
-                a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
-                if a:
-                    out.append(a)
                 a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
                 if self._nvls and self.symmetric_memory and a:
                     out.append(a)
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index ea6643255..09fa2fe70 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -35,26 +35,32 @@ __device__ mscclpp::DeviceSyncer globalSyncer;
 //
 // This approach requires registering both input and output buffers as remote
 // memories (2 * nPeers handles), but avoids scratch buffer allocation and
-// the extra copy steps of the standard RSAG. ipcDomainNranks is accepted at
-// runtime, which allows the same kernel to handle any NVLink-domain size
-// (including Multi-Node NVLink fabrics up to NVL72).
+// the extra copy steps of the standard RSAG.
+//
+// The kernel is templated on NRanksPerNode so the compiler can keep an int4
+// register array of NPeers elements, #pragma unroll the peer loops, and turn
+// the per-iteration modulo into a single AND. This issues all NPeers remote
+// reads in parallel so their latency is overlapped instead of serialized.
+// Only small fixed sizes ({4, 8}) are instantiated; larger MNNVL domains
+// (where the int4 array would spill out of registers) must use a different
+// algorithm.
 
-template <ReduceOp OpType, typename T, typename AccumT = T>
+template <int NRanksPerNode, ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
-                          DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank,
-                          int ipcDomainNranks, int worldSize, size_t nelems) {
+                          DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int worldSize,
+                          size_t nelems) {
   int blockId = blockIdx.x;
 
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
-  const int NPeers = ipcDomainNranks - 1;
+  constexpr int NPeers = NRanksPerNode - 1;
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
-  const uint32_t outputRemoteBufferOffset = NPeers;
-  uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * ipcDomainNranks;
-  uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks;
+  constexpr uint32_t outputRemoteBufferOffset = NPeers;
+  uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
+                           nelemsPerInt4 * NRanksPerNode;
+  uint32_t nelemsPerRank = alignedNelems / NRanksPerNode;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4;
 
@@ -75,6 +81,7 @@ __global__ void __launch_bounds__(1024, 1)
     memoryChannelsLocal[threadIdx.x].relaxedWait();
   }
   __syncthreads();
+  int4 data[NPeers];
   // AccumInt4: when AccumT != T, use a wider accumulator type.
   // For AccumT == T, this is just int4 (no-op conversion).
   constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T);
@@ -84,17 +91,21 @@ __global__ void __launch_bounds__(1024, 1)
     uint32_t offset = idx + offset4 + rank * nInt4PerRank;
     if (offset >= nInt4Total) continue;
     int4 tmp_raw = buff4[offset];
-    int4 data;
-    AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(tmp_raw);
+#pragma unroll
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % ipcDomainNranks;
+      int rankIdx = (rank + i + 1) % NRanksPerNode;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
-      data = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
-      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data);
+      data[i] = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
+    }
+    AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(tmp_raw);
+#pragma unroll
+    for (int i = 0; i < NPeers; i++) {
+      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data[i]);
     }
     int4 tmp = mscclpp::downcastVector<T, AccumT, int4>(acc);
+#pragma unroll
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % ipcDomainNranks;
+      int rankIdx = (rank + i + 1) % NRanksPerNode;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp);
     }
@@ -123,9 +134,18 @@ struct AllreduceRsAgZeroCopyAdapter {
         nBlocks = 128;
       }
     }
-    allreduceRsAgZeroCopy<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
-        (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        ipcDomainNranks, worldSize, nelems);
+    if (ipcDomainNranks == 4) {
+      allreduceRsAgZeroCopy<4, OpType, T, AccumT>
+          <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
+                                                     switchChannel, remoteMemories, rank, worldSize, nelems);
+    } else if (ipcDomainNranks == 8) {
+      allreduceRsAgZeroCopy<8, OpType, T, AccumT>
+          <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
+                                                     switchChannel, remoteMemories, rank, worldSize, nelems);
+    } else {
+      WARN(ALGO, "AllreduceRsAgZeroCopy only supports ipcDomainNranks of 4 or 8, got: ", ipcDomainNranks);
+      return cudaErrorInvalidValue;
+    }
     return cudaGetLastError();
   }
 };

From 1c298175661003f9d80192349cebbb3575f8d0d3 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 1 May 2026 23:40:11 +0000
Subject: [PATCH 10/44] Revert AllreduceRsAgZeroCopy non-symmetric ctx key tag
 back to ++tag

Commit 533f3299 dropped the static tag counter from
generateAllreduceContextKey, so the tag field of every non-symmetric
key was always zero and calls with the same buffer pointers and size
produced an identical key and reused a stale context. Restore the
pre-MNNVL behavior of returning a unique key per non-symmetric call
so the context cache rebuilds when buffers change.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index 09fa2fe70..a20756aee 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -191,6 +191,7 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void
 AlgorithmCtxKey AllreduceRsAgZeroCopy::generateAllreduceContextKey(const void* inputBuffer, void* outputBuffer,
                                                                    size_t size, DataType, bool symmetricMemory) {
   // For non-symmetric algorithms, we use both input and output buffer pointers in the key.
+  static int tag = 0;
   if (symmetricMemory) {
     size_t inputBytes, outputBytes;
     CUdeviceptr inputBasePtr, outputBasePtr;
@@ -198,7 +199,7 @@ AlgorithmCtxKey AllreduceRsAgZeroCopy::generateAllreduceContextKey(const void* i
     MSCCLPP_CUTHROW(cuMemGetAddressRange(&outputBasePtr, &outputBytes, (CUdeviceptr)outputBuffer));
     return AlgorithmCtxKey{(void*)inputBasePtr, (void*)outputBasePtr, inputBytes, outputBytes, 0};
   }
-  return AlgorithmCtxKey{(void*)inputBuffer, outputBuffer, size, size, 0};
+  return AlgorithmCtxKey{(void*)inputBuffer, outputBuffer, size, size, ++tag};
 }
 
 std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_ptr<Communicator> comm, const void* input,

From 7bc5e0406b4b45eaae5d4cb42ca95ba69b0a2d56 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sat, 2 May 2026 03:19:31 +0000
Subject: [PATCH 11/44] Reset GPU tokens before reuse

Clear recycled TokenPool entries before handing them out so device-to-device semaphores start from a clean counter value.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 include/mscclpp/gpu_utils.hpp | 3 +++
 src/core/gpu_utils.cc         | 7 +++++++
 src/core/utils_internal.cc    | 6 ++++--
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index ecd13c478..f7ec67d05 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr);
 void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
                     cudaMemcpyKind kind = cudaMemcpyDefault);
 void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
+void gpuMemset(void* ptr, int value, size_t bytes);
 
 /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
 /// destroyed.
@@ -300,6 +301,8 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
   detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
 }
 
+inline void memset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); }
+
 /// Check if NVLink SHARP (NVLS) is supported.
 ///
 /// @return True if NVLink SHARP (NVLS) is supported, false otherwise.
diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc
index 09d5025d6..1ce61322c 100644
--- a/src/core/gpu_utils.cc
+++ b/src/core/gpu_utils.cc
@@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
   MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 }
 
+void gpuMemset(void* ptr, int value, size_t bytes) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+}
+
 }  // namespace detail
 
 bool isNvlsSupported() {
diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc
index 9504a52cf..ea867ffff 100644
--- a/src/core/utils_internal.cc
+++ b/src/core/utils_internal.cc
@@ -263,8 +263,10 @@ std::shared_ptr<uint64_t> TokenPool::getToken() {
     for (int bit = 0; bit < UINT64_WIDTH; bit++) {
       if (holes & (1UL << bit)) {
         allocationMap_[i].set(bit);
-        INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", baseAddr_ + i * UINT64_WIDTH + bit);
-        return std::shared_ptr<uint64_t>(baseAddr_ + i * UINT64_WIDTH + bit, deleter);
+        uint64_t* token = baseAddr_ + i * UINT64_WIDTH + bit;
+        mscclpp::memset(token, 0, sizeof(uint64_t));
+        INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", token);
+        return std::shared_ptr<uint64_t>(token, deleter);
       }
     }
   }

From 9a368843691be73d6f40cbb0d1277f6e20d56013 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sat, 2 May 2026 03:32:18 +0000
Subject: [PATCH 12/44] Rename gpuMemset wrapper and zero TokenPool slots in
 deleter

Two follow-ups to commit 7bc5e040:
  * Rename mscclpp::memset to mscclpp::gpuMemset for symmetry with
    gpuMemcpy / gpuMemcpyAsync, and avoid shadowing std::memset for
    callers that pull the namespace in. Also add the missing doc
    comment.
  * Move the per-slot zeroing from getToken() into the deleter so the
    cost is paid on release rather than acquire. This is safe because
    gpuCallocPhysical already zeros the underlying buffer at TokenPool
    construction, so first-time tokens are clean and recycled tokens
    are scrubbed on release.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 include/mscclpp/gpu_utils.hpp | 6 +++++-
 src/core/utils_internal.cc    | 9 +++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index f7ec67d05..b079e0fd9 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -301,7 +301,11 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
   detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
 }
 
-inline void memset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); }
+/// Sets `bytes` of memory at `ptr` to `value` synchronously.
+/// @param ptr Destination address.
+/// @param value Value to set (interpreted as unsigned char per CUDA semantics).
+/// @param bytes Number of bytes to set.
+inline void gpuMemset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); }
 
 /// Check if NVLink SHARP (NVLS) is supported.
 ///
diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc
index ea867ffff..8cc554301 100644
--- a/src/core/utils_internal.cc
+++ b/src/core/utils_internal.cc
@@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
 
 std::shared_ptr<uint64_t> TokenPool::getToken() {
   auto deleter = [self = shared_from_this()](uint64_t* token) {
+    // Zero the slot on release so the next allocator hands out a clean
+    // semaphore counter (matches a freshly-allocated slot).
+    mscclpp::gpuMemset(token, 0, sizeof(uint64_t));
     size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
     size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
     uint64_t mask = 1UL << bit;
@@ -263,10 +266,8 @@ std::shared_ptr<uint64_t> TokenPool::getToken() {
     for (int bit = 0; bit < UINT64_WIDTH; bit++) {
       if (holes & (1UL << bit)) {
         allocationMap_[i].set(bit);
-        uint64_t* token = baseAddr_ + i * UINT64_WIDTH + bit;
-        mscclpp::memset(token, 0, sizeof(uint64_t));
-        INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", token);
-        return std::shared_ptr<uint64_t>(token, deleter);
+        INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", baseAddr_ + i * UINT64_WIDTH + bit);
+        return std::shared_ptr<uint64_t>(baseAddr_ + i * UINT64_WIDTH + bit, deleter);
       }
     }
   }

From 6296803d87a451e96f3ae5b62c2b616740861d8a Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 5 May 2026 04:41:14 +0000
Subject: [PATCH 13/44] Make NVLS non-zero-copy allreduce algorithms
 MNNVL-ready

Both default_allreduce_nvls_warp_pipeline and default_allreduce_nvls_block_pipeline
were only partially MNNVL-aware: their kernels had been updated to use
ipcDomainNranks (with shared-memory channel arrays sized for the global
NVLink-domain bound), but the host-side context init still hard-coded
ctx->ipcDomainNranks = bootstrap->getNranksPerNode(). On a fully populated MNNVL
fabric (e.g. NVL72 where world == ipcDomainNranks but the per-physical-host
nranksPerNode is much smaller), this mismatched the multicast group span and
produced wrong/missing data plus out-of-bounds scratch indexing.

Changes:
- Rename MAX_NRANKS_PER_NODE -> MAX_IPC_DOMAIN_NRANKS to match the rest of the
  IPC-domain naming (getIpcDomainNranks, ipcDomainNranks,
  MSCCLPP_IPC_DOMAIN_NRANKS env var). Pure rename, no semantic change.
- Add validateIpcDomainSpansWorld(comm, algName) helper in collective_utils
  that wraps getIpcDomainNranks() and asserts the IPC-domain == whole-comm
  invariant required by NVLS algorithms (worldSize == ipcDomainNranks,
  rank < ipcDomainNranks, ipcDomainNranks in [2, MAX_IPC_DOMAIN_NRANKS]),
  throwing Error(InvalidUsage) on violation and returning the validated value.
- nvls_zero_copy / nvls_block_pipeline / nvls_warp_pipeline initialize() each
  now call the helper instead of repeating the same ~20-line check inline.
- initAllreduceContext() in both pipelines now uses getIpcDomainNranks(comm)
  instead of bootstrap->getNranksPerNode().
- Per-peer base channel allocation (nBaseChannels_) is sized in initialize() as
  max(64, 4*ipc) for block pipeline and max(64, 8*ipc) for warp pipeline so
  the kernel's per-block channel addressing remains in-bounds at NVL72 scale.
- Block pipeline initialize() also asserts 6*ipcDomainNranks <= NUM_SEMAPHORES.
- allreduceKernelFunc() in both pipelines now validates launch shape and the
  user-supplied scratch buffer size before launching, returning
  CommInvalidArgument with a clear WARN on mismatch:
  - Block: nBlocks must equal 5*ipcDomainNranks (structurally required by the
    kernel's three-phase block partition), nThreads == 1024, inputSize aligned
    to (ipc * 16) bytes, scratchSizePerBlock >= unitSize.
  - Warp: nBlocks >= NUM_NVLS_CONNECTION and a multiple of it (kernel does
    nBlocks / NUM_NVLS_CONNECTION partitioning of the multicast handles),
    2*nBlocks <= nBaseChannels_, nThreads == 1024 (32 warps hard-coded in the
    bar.sync member counts), inputSize divisible by ipcDomainNranks,
    scratchSizePerBlock >= copyPerIter.
- Default nBlocks for warp pipeline is rounded up to a multiple of
  NUM_NVLS_CONNECTION so the structural constraint holds for any
  ipcDomainNranks.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../allgather/allgather_fullmesh.cu           |  2 +-
 .../allreduce/allreduce_fullmesh.cu           |  4 +-
 .../allreduce_nvls_block_pipeline.cu          | 66 ++++++++++++++---
 .../allreduce/allreduce_nvls_warp_pipeline.cu | 72 ++++++++++++++++---
 .../allreduce/allreduce_nvls_zero_copy.cu     |  4 +-
 .../collectives/allreduce/allreduce_packet.cu |  2 +-
 src/ext/collectives/collective_utils.cc       | 23 ++++++
 .../allreduce/allreduce_allpair_packet.hpp    |  2 +-
 .../allreduce_nvls_block_pipeline.hpp         |  2 +
 .../allreduce_nvls_warp_pipeline.hpp          |  2 +
 .../collectives/include/collective_utils.hpp  | 14 +++-
 11 files changed, 166 insertions(+), 27 deletions(-)

diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index cbe199bcb..8ce77fca1 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -28,7 +28,7 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t restNInt4 = nInt4 % nInt4PerChunk;
   const size_t scratchChunkRankOffset = nInt4PerChunk * rank;
 
-  __shared__ DeviceHandle<MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ DeviceHandle<MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
   // Each warp redundantly loads all entries (same value, benign race) so that
   // every warp has the data its threads will read after __syncwarp(). Required
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index b95dcb284..f1d815604 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -49,8 +49,8 @@ __global__ void __launch_bounds__(512, 1)
   const size_t blockOffset = nInt4PerChunk * blockIdx.x;
   const size_t scratchChunkRankOffset = chunkSizePerRank * rank;
 
-  __shared__ DeviceHandle<MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
-  __shared__ DeviceHandle<MemoryChannel> outChannels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ DeviceHandle<MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
+  __shared__ DeviceHandle<MemoryChannel> outChannels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
   // Each warp redundantly loads all entries (same value, benign race) so that
   // every warp has the data its threads will read after __syncwarp(). Required
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 3ecb361fc..4eeb03355 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -1,7 +1,9 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+#include <algorithm>
 #include <mscclpp/algorithm.hpp>
+#include <mscclpp/errors.hpp>
 
 #include "allreduce/allreduce_nvls_block_pipeline.hpp"
 #include "allreduce/common.hpp"
@@ -176,31 +178,73 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  int nBaseChannels = 64;
+  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsBlockPipeline");
+  // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel).
+  if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) {
+    throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) +
+                    " exceeds NUM_SEMAPHORES capacity (" + std::to_string(NUM_SEMAPHORES) + ")",
+                ErrorCode::InvalidUsage);
+  }
+  // The kernel addresses up to `2 * nBlocksForCopy = 4 * ipcDomainNranks` distinct entries
+  // per peer in `memoryChannels`. Scale the per-connection allocation to match.
+  nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
-      setupMemorySemaphores(comm, this->conns_, nBaseChannels);
+      setupMemorySemaphores(comm, this->conns_, nBaseChannels_);
   // setup base memory channels
-  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels);
+  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_);
   this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
 }
 
-CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input,
-                                                           void* output, size_t inputSize, DataType dtype, ReduceOp op,
-                                                           cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                           const std::unordered_map<std::string, uintptr_t>& extras,
-                                                           DataType accumDtype) {
+CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(
+    const std::shared_ptr<void> ctx_void, const void* input, void* output, size_t inputSize, DataType dtype,
+    ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
+    [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   AllreduceFunc allreduce = dispatch<NvlsBlockPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
   }
+  const int requiredBlocks = ctx->ipcDomainNranks * 5;
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
-  if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024};
+  if (blockAndThreadNum.first == 0) blockAndThreadNum.first = requiredBlocks;
+  if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024;
+  if (blockAndThreadNum.first != requiredBlocks) {
+    WARN("AllreduceNvlsBlockPipeline requires nBlocks == 5 * ipcDomainNranks (got %d, expected %d)",
+         blockAndThreadNum.first, requiredBlocks);
+    return CommResult::CommInvalidArgument;
+  }
+  if (blockAndThreadNum.second != 1024) {
+    WARN("AllreduceNvlsBlockPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second);
+    return CommResult::CommInvalidArgument;
+  }
+  // Validate input alignment/divisibility expectations of the kernel.
+  constexpr size_t kKernelAlignment = 16;
+  const size_t perRankBytes = inputSize / ctx->ipcDomainNranks;
+  if (perRankBytes * static_cast<size_t>(ctx->ipcDomainNranks) != inputSize || perRankBytes % kKernelAlignment != 0) {
+    WARN(
+        "AllreduceNvlsBlockPipeline requires inputSize %% (ipcDomainNranks * %zu) == 0 (got inputSize=%zu, "
+        "ipcDomainNranks=%d)",
+        kKernelAlignment, inputSize, ctx->ipcDomainNranks);
+    return CommResult::CommInvalidArgument;
+  }
+  // Validate scratch is large enough for at least one pipeline iteration. The kernel
+  // computes scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) aligned down
+  // to unitSize; if this is 0, maxItersForScratch is 0 and the kernel deadlocks.
+  const size_t unitSize = (inputSize <= static_cast<size_t>(1024) * 1024 * 128) ? (1ULL << 16) : (1ULL << 17);
+  const size_t scratchSizePerRank = this->scratchBufferSize_ / ctx->ipcDomainNranks;
+  const size_t nBlocksForCopy = static_cast<size_t>(ctx->ipcDomainNranks) * 2;
+  const size_t scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) / unitSize * unitSize;
+  if (scratchSizePerBlock < unitSize) {
+    WARN(
+        "AllreduceNvlsBlockPipeline scratch buffer too small for ipcDomainNranks=%d and inputSize=%zu "
+        "(scratchBufferSize=%zu, need at least ~%zu bytes)",
+        ctx->ipcDomainNranks, inputSize, this->scratchBufferSize_,
+        static_cast<size_t>(ctx->ipcDomainNranks) * nBlocksForCopy * unitSize);
+    return CommResult::CommInvalidArgument;
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
@@ -222,7 +266,7 @@ std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 1bdac9ada..05e4f747f 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -1,7 +1,9 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+#include <algorithm>
 #include <mscclpp/algorithm.hpp>
+#include <mscclpp/errors.hpp>
 
 #include "allreduce/allreduce_nvls_warp_pipeline.hpp"
 #include "allreduce/common.hpp"
@@ -55,7 +57,7 @@ __global__ void __launch_bounds__(1024, 1)
 
   const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x * 2;
   auto memoryChans = memoryChannels + chanOffset;
-  __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_NRANKS_PER_NODE - 1) * 2];
+  __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2];
   const int lid = threadIdx.x % WARP_SIZE;
   // Each warp redundantly loads all entries (same value, benign race) so that
   // every warp has the data its threads will read after __syncwarp(). Required
@@ -141,14 +143,18 @@ struct NvlsWarpPipelineAdapter {
 };
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
-  nSwitchChannels_ = 8;
-  int nBaseChannels = 64;
+  nSwitchChannels_ = NUM_NVLS_CONNECTION;
+  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsWarpPipeline");
+  // Each block uses 2 * nPeers `memoryChannels` entries, so every peer needs 2 * nBlocks base channels.
+  // Size for the default nBlocks: 4 * ipcDomainNranks rounded up to a multiple of NUM_NVLS_CONNECTION.
+  const int nBlocks = (4 * ipcDomainNranks_ + NUM_NVLS_CONNECTION - 1) / NUM_NVLS_CONNECTION * NUM_NVLS_CONNECTION;
+  nBaseChannels_ = std::max(64, 2 * nBlocks);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
-      setupMemorySemaphores(comm, this->conns_, nBaseChannels);
+      setupMemorySemaphores(comm, this->conns_, nBaseChannels_);
   // setup base memory channels
-  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels);
+  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_);
   this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
 }
@@ -164,8 +170,58 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
     return CommResult::CommInvalidArgument;
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
-  if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024};
+  if (blockAndThreadNum.first == 0) {
+    // Default to 4 * ipcDomainNranks blocks, rounded up to a multiple of NUM_NVLS_CONNECTION
+    // so that nBlocks / NUM_NVLS_CONNECTION partitioning in the kernel is well-defined.
+    int defaultBlocks = ctx->ipcDomainNranks * 4;
+    defaultBlocks = ((defaultBlocks + NUM_NVLS_CONNECTION - 1) / NUM_NVLS_CONNECTION) * NUM_NVLS_CONNECTION;
+    blockAndThreadNum.first = std::max(defaultBlocks, NUM_NVLS_CONNECTION);
+  }
+  if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024;
+  // The kernel computes nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION and indexes the
+  // multicast handle array with bid / nBlocksPerNvlsConn; both must be safe.
+  if (blockAndThreadNum.first < NUM_NVLS_CONNECTION || blockAndThreadNum.first % NUM_NVLS_CONNECTION != 0) {
+    WARN("AllreduceNvlsWarpPipeline requires nBlocks to be a positive multiple of %d (got %d)", NUM_NVLS_CONNECTION,
+         blockAndThreadNum.first);
+    return CommResult::CommInvalidArgument;
+  }
+  // Each block uses 2 * nPeers consecutive entries in `memoryChannels`, so the per-peer
+  // base-channel allocation must support 2 * nBlocks distinct entries.
+  if (2 * blockAndThreadNum.first > this->nBaseChannels_) {
+    WARN(
+        "AllreduceNvlsWarpPipeline: nBlocks %d exceeds channel allocation (nBaseChannels=%d, "
+        "ipcDomainNranks=%d). Increase MSCCLPP_IPC_DOMAIN_NRANKS-aware sizing or reduce nBlocks.",
+        blockAndThreadNum.first, this->nBaseChannels_, ctx->ipcDomainNranks);
+    return CommResult::CommInvalidArgument;
+  }
+  // The kernel hard-codes 14 + 4 + 14 = 32 warps per block and bar.sync member counts
+  // computed from these constants; deviating from 1024 threads breaks those barriers.
+  if (blockAndThreadNum.second != 1024) {
+    WARN("AllreduceNvlsWarpPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second);
+    return CommResult::CommInvalidArgument;
+  }
+  // Validate input divisibility by ipcDomainNranks (kernel computes size / ipcDomainNranks).
+  if (inputSize % static_cast<size_t>(ctx->ipcDomainNranks) != 0) {
+    WARN("AllreduceNvlsWarpPipeline requires inputSize %% ipcDomainNranks == 0 (got inputSize=%zu, ipcDomainNranks=%d)",
+         inputSize, ctx->ipcDomainNranks);
+    return CommResult::CommInvalidArgument;
+  }
+  // Validate scratch is large enough for at least one pipeline iteration. The kernel
+  // computes scratchSizePerBlock = (scratchSizePerRank / nBlocks) aligned down to copyPerIter;
+  // if this is 0 the modulo offset arithmetic divides by zero.
+  const size_t sizePerRank = inputSize / static_cast<size_t>(ctx->ipcDomainNranks);
+  const size_t maxSizePerBlock = ((sizePerRank + blockAndThreadNum.first - 1) / blockAndThreadNum.first + 15) / 16 * 16;
+  const size_t copyPerIter = (maxSizePerBlock >= 1024 * 64) ? (1024 * 32) : (1024 * 16);
+  const size_t scratchSizePerRank = this->scratchBufferSize_ / static_cast<size_t>(ctx->ipcDomainNranks);
+  const size_t scratchSizePerBlock =
+      (scratchSizePerRank / static_cast<size_t>(blockAndThreadNum.first)) / copyPerIter * copyPerIter;
+  if (scratchSizePerBlock < copyPerIter) {
+    WARN(
+        "AllreduceNvlsWarpPipeline scratch buffer too small for ipcDomainNranks=%d, nBlocks=%d, inputSize=%zu "
+        "(scratchBufferSize=%zu, need at least ~%zu bytes)",
+        ctx->ipcDomainNranks, blockAndThreadNum.first, inputSize, this->scratchBufferSize_,
+        static_cast<size_t>(ctx->ipcDomainNranks) * static_cast<size_t>(blockAndThreadNum.first) * copyPerIter);
+    return CommResult::CommInvalidArgument;
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
@@ -187,7 +243,7 @@ std::shared_ptr<void> AllreduceNvlsWarpPipeline::initAllreduceContext(std::share
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index a9d46d4f5..5d6fc4d37 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include <mscclpp/core.hpp>
+#include <mscclpp/errors.hpp>
 
 #include "allreduce/allreduce_nvls_zero_copy.hpp"
 #include "allreduce/common.hpp"
@@ -42,7 +43,7 @@ __global__ void __launch_bounds__(1024, 1)
 
   const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x;
   auto memoryChans = memoryChannels + chanOffset;
-  __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
   // Each warp redundantly loads all entries (same value, benign race) so that
   // every warp has the data its threads will read after __syncwarp(). Required
@@ -106,6 +107,7 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
   MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
   computeCapabilityMajor_ = deviceProp.major;
   nSwitchChannels_ = 32;
+  validateIpcDomainSpansWorld(comm, "AllreduceNvls");
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index 84e182eb7..cc91370ca 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -78,7 +78,7 @@ __global__ void __launch_bounds__(1024, 1)
   uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
 
   // Put channels into shared memory, read channel info from global memory is unexpectable slow.
-  __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
+  __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = tid % WARP_SIZE;
   // Each warp redundantly loads all entries (same value, benign race) so that
   // every warp has the data its threads will read after __syncwarp(). Required
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index de33009c6..33b6ef779 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -7,6 +7,7 @@
 #include <mscclpp/algorithm.hpp>
 #include <mscclpp/core.hpp>
 #include <mscclpp/env.hpp>
+#include <mscclpp/errors.hpp>
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
 
@@ -78,6 +79,28 @@ int getIpcDomainNranks(std::shared_ptr<mscclpp::Communicator> comm) {
   return comm->bootstrap()->getNranksPerNode();
 }
 
+int validateIpcDomainSpansWorld(std::shared_ptr<mscclpp::Communicator> comm, const char* algName) {
+  const int ipcDomainNranks = getIpcDomainNranks(comm);
+  const int worldSize = comm->bootstrap()->getNranks();
+  const int rank = comm->bootstrap()->getRank();
+  if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) {
+    throw mscclpp::Error(std::string(algName) + ": ipcDomainNranks " + std::to_string(ipcDomainNranks) +
+                             " is out of supported range [2, " + std::to_string(MAX_IPC_DOMAIN_NRANKS) + "]",
+                         mscclpp::ErrorCode::InvalidUsage);
+  }
+  if (worldSize != ipcDomainNranks) {
+    throw mscclpp::Error(std::string(algName) + " requires worldSize == ipcDomainNranks (got worldSize=" +
+                             std::to_string(worldSize) + ", ipcDomainNranks=" + std::to_string(ipcDomainNranks) + ")",
+                         mscclpp::ErrorCode::InvalidUsage);
+  }
+  if (rank < 0 || rank >= ipcDomainNranks) {
+    throw mscclpp::Error(std::string(algName) + ": rank " + std::to_string(rank) + " out of [0, " +
+                             std::to_string(ipcDomainNranks) + ")",
+                         mscclpp::ErrorCode::InvalidUsage);
+  }
+  return ipcDomainNranks;
+}
+
 std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<mscclpp::MemoryChannel>& memoryChannels) {
   std::vector<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> memoryChannelDeviceHandles;
diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
index fe96f7622..79c211b39 100644
--- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
@@ -29,7 +29,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   const int nSegmentsForScratchBuffer_ = 2;
-  // Must be at least MAX_NRANKS_PER_NODE-1 so the adapter can launch one
+  // Must be at least MAX_IPC_DOMAIN_NRANKS-1 so the adapter can launch one
   // block per peer at MNNVL scale.
   const int maxBlockNum_ = 72;
   std::vector<Connection> conns_;
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
index 81b74add4..9a1742db1 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
@@ -29,6 +29,8 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   uint32_t nSwitchChannels_;
+  int ipcDomainNranks_ = 0;
+  int nBaseChannels_ = 0;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
index 8f02a8738..e2aa8c873 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
@@ -29,6 +29,8 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   uint32_t nSwitchChannels_;
+  int ipcDomainNranks_ = 0;
+  int nBaseChannels_ = 0;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 7fa6a81ea..892df3b11 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -26,7 +26,7 @@ namespace mscclpp {
 
 namespace collective {
 constexpr int NUM_NVLS_CONNECTION = 8;
-// Sized to cover MAX_NRANKS_PER_NODE-scale allreduce algos whose device-side
+// Sized to cover MAX_IPC_DOMAIN_NRANKS-scale allreduce algos whose device-side
 // semaphore indices grow as O(ipcDomainNranks) (e.g. nvls_block_pipeline uses
 // up to ~5 * ipcDomainNranks entries).
 constexpr int NUM_SEMAPHORES = 512;
@@ -35,7 +35,7 @@ constexpr int NUM_SEMAPHORES = 512;
 // single collective. Sized to cover Multi-Node NVLink (MNNVL) domains up to
 // GB200 NVL72 (72 GPUs sharing one NVLink fabric). Drives compile-time sizing
 // of shared-memory channel arrays in the allreduce/allgather kernels.
-constexpr int MAX_NRANKS_PER_NODE = 72;
+constexpr int MAX_IPC_DOMAIN_NRANKS = 72;
 
 constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
 
@@ -58,6 +58,16 @@ std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> setupMemorySemaphores
 /// MNNVL-like behavior without changing the meaning of bootstrap-level APIs.
 int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
 
+/// Validates that the IPC domain spans the whole communicator (world size == `ipcDomainNranks`, rank in
+/// range) and that its size is within `[2, MAX_IPC_DOMAIN_NRANKS]`. Used by NVLS allreduce algorithms whose
+/// multicast group spans the whole communicator (see `setupNvlsConnections`) and whose kernels
+/// use the global rank to compute per-rank offsets while sizing per-rank work by
+/// `ipcDomainNranks`. These assumptions only hold when the IPC-reachable peer group is exactly
+/// the whole communicator (e.g. a fully populated MNNVL fabric like NVL72). Returns the validated
+/// `ipcDomainNranks`; throws `Error(InvalidUsage)` on violation. `algName` is used as a prefix
+/// in error messages.
+int validateIpcDomainSpansWorld(std::shared_ptr<Communicator> comm, const char* algName);
+
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<MemoryChannel>& memoryChannels);
 

From 9aeeaf0f127768fdd3974a4cf5b66200654d3414 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 18:51:29 +0000
Subject: [PATCH 14/44] Simplify torch-integration tuning example for MPI-only
 multi-node testing

Use mpi4py for bootstrap and local-rank discovery; drop the torchrun /
gloo / manual MSCCLPP_MASTER_ADDR paths and the netifaces dependency.
Add MNNVL/multi-node algorithm selection (rsag, rsag_zero_copy,
nvls_zero_copy) and route barrier / timing-sync allreduces through the
configured symmetric_memory flag so they work across hosts.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 132 ++----------------
 1 file changed, 14 insertions(+), 118 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 9ad7f22a5..8d1efd533 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -1,30 +1,16 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
-# mpirun -np 2 --hostfile <hostfile> python3 examples/torch-integration/customized_comm_with_tuning.py
+# mpirun -np 8 python3 examples/torch-integration/customized_comm_with_tuning.py
+# mpirun -np 16 --hostfile <hostfile> python3 examples/torch-integration/customized_comm_with_tuning.py
 
-import gc
-import fcntl
-import ipaddress
 import os
-import socket
-import struct
-import sys
-import traceback
 
+from mpi4py import MPI
 
-def _get_bootstrap_world_size():
-    for name in ("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS"):
-        value = os.environ.get(name)
-        if value is not None:
-            return int(value)
-    return None
-
-
-_bootstrap_world_size = _get_bootstrap_world_size()
-if _bootstrap_world_size and _bootstrap_world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ:
-    os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_bootstrap_world_size)
+_world_size = MPI.COMM_WORLD.Get_size()
+if _world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ:
+    os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_world_size)
 
 import torch
 import mscclpp
@@ -54,46 +40,6 @@ def _load_algorithms(scratch: torch.Tensor, rank: int):
     )
 
 
-def _interfaces_for_ip(ip: str):
-    target = ipaddress.ip_address(ip)
-    for iface in os.listdir("/sys/class/net"):
-        try:
-            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
-                req = struct.pack("256s", iface.encode()[:15])
-                addr = socket.inet_ntoa(fcntl.ioctl(sock.fileno(), 0x8915, req)[20:24])
-        except OSError:
-            continue
-        if ipaddress.ip_address(addr) == target:
-            return iface
-    return None
-
-
-def _resolve_interface(master_addr: str):
-    for env_name in ("MSCCLPP_INTERFACE", "MSCCLPP_SOCKET_IFNAME", "NCCL_SOCKET_IFNAME"):
-        value = os.environ.get(env_name)
-        if value:
-            iface = value.split(",")[0].strip()
-            if iface in os.listdir("/sys/class/net"):
-                return iface
-            raise ValueError(f"Interface {iface} from {env_name} does not exist")
-    return _interfaces_for_ip(master_addr)
-
-
-def _get_env_int(*names: str, default=None):
-    for name in names:
-        value = os.environ.get(name)
-        if value is not None:
-            return int(value)
-    return default
-
-
-def _running_under_mpi() -> bool:
-    return any(
-        name in os.environ
-        for name in ("OMPI_COMM_WORLD_RANK", "PMI_RANK", "PMIX_RANK", "MPI_LOCALRANKID", "SLURM_PROCID")
-    )
-
-
 def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
     if op == torch.distributed.ReduceOp.SUM:
         return mscclpp.ReduceOp.SUM
@@ -157,11 +103,6 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
     def _algo(self, collective: str, name: str):
         return self._algos.get((collective, name))
 
-    def _nblocks_limit(self, algo_name: str, size: int) -> int:
-        if algo_name == "default_allreduce_packet" and size < (1 << 20):
-            return 56
-        return self._NBLOCKS_LIMIT.get(algo_name, 128)
-
     def _default_ar_config(self):
         """Fallback allreduce config for barrier / timing sync."""
         pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
@@ -337,7 +278,7 @@ def _tune_size(self, collective: str, target_size: int):
         run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
 
         for algo in cands:
-            nb_limit = self._nblocks_limit(algo.name, target_size)
+            nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
             for nb in self._CANDIDATE_NBLOCKS:
                 if nb > nb_limit:
                     continue
@@ -435,11 +376,7 @@ def destroy(self):
 # -- Benchmarks (standalone) --------------------------------------------------
 
 
-def _bench_sizes(low=None, high=None):
-    if low is None:
-        low = _get_env_int("MSCCLPP_BENCH_LOW_SIZE", default=5 * 1024)
-    if high is None:
-        high = _get_env_int("MSCCLPP_BENCH_HIGH_SIZE", default=80 << 20)
+def _bench_sizes(low=5 * 1024, high=80 << 20):
     sizes, c = [], low
     while c <= high:
         sizes.append(c)
@@ -539,30 +476,11 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10,
 
 
 def init_dist() -> mscclpp.CommGroup:
-    addr = os.environ.get("MSCCLPP_MASTER_ADDR")
-    rank = _get_env_int("RANK", "OMPI_COMM_WORLD_RANK", "PMI_RANK", "SLURM_PROCID")
-    world = _get_env_int("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS")
-    if addr and rank is not None and world is not None:
-        port = os.environ.get("MSCCLPP_MASTER_PORT", "29500")
-        iface = _resolve_interface(addr)
-        if not iface:
-            raise ValueError(f"No interface for {addr}")
-        return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
-    if _running_under_mpi():
-        try:
-            from mpi4py import MPI
-        except ModuleNotFoundError as exc:
-            raise RuntimeError("mpi4py is required to launch this example with mpirun") from exc
-
-        return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD)
-    import torch.distributed as dist
-
-    dist.init_process_group(backend="gloo")
-    return mscclpp.CommGroup(torch_group=dist.group.WORLD)
+    return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD)
 
 
 def main():
-    local = _get_env_int("LOCAL_RANK", "OMPI_COMM_WORLD_LOCAL_RANK", "MPI_LOCALRANKID", "SLURM_LOCALID", default=0)
+    local = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED).Get_rank()
     torch.cuda.set_device(local)
 
     dtype_str = os.environ.get("DTYPE", "float16")
@@ -570,22 +488,12 @@ def main():
     accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
     accum_str = os.environ.get("ACCUM_DTYPE")
     accum_dtype = accum_map.get(accum_str) if accum_str else None
-    n_warmup = _get_env_int("MSCCLPP_BENCH_WARMUP", default=10)
-    n_graph_launches = _get_env_int("MSCCLPP_BENCH_GRAPH_LAUNCHES", default=10)
-    n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100)
 
     comm_group = init_dist()
-    cc = CustomizedComm(comm_group, symmetric_memory=False)
+    cc = CustomizedComm(comm_group)
 
     print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
-    benchmark_allreduce(
-        cc,
-        dtype=dtype,
-        accum_dtype=accum_dtype,
-        n_warmup=n_warmup,
-        n_graph_launches=n_graph_launches,
-        n_iter=n_iter,
-    )
+    benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
     cc.barrier()
     torch.cuda.synchronize()
 
@@ -593,25 +501,13 @@ def main():
         if cc.rank == 0:
             print("Skipping allgather benchmark on multi-node: this example's allgather path is single-node only.")
     else:
-        benchmark_allgather(cc, dtype=dtype, n_warmup=n_warmup, n_graph_launches=n_graph_launches, n_iter=n_iter)
+        benchmark_allgather(cc, dtype=dtype)
         cc.barrier()
         torch.cuda.synchronize()
 
     cc.destroy()
-    del cc
-    del comm_group
-    gc.collect()
     print(f"rank {local} completed successfully.")
 
 
 if __name__ == "__main__":
-    exit_code = 0
-    try:
-        main()
-    except Exception:
-        exit_code = 1
-        traceback.print_exc()
-    finally:
-        sys.stdout.flush()
-        sys.stderr.flush()
-        os._exit(exit_code)
+    main()

From 905b23d9a8d34071140fcd17f89b413001c44f58 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 19:00:22 +0000
Subject: [PATCH 15/44] Drop non-MNNVL multi_node regime from torch-integration
 example

The example is now MNNVL-only: a run is either single-host (everything
fits in one node) or multi-host MNNVL (one cross-host NVLink domain).
Plain multi-node-without-MNNVL had its own algorithm branch that this
example will never exercise, so remove the multi_node flag and the
intermediate mnnvl_domain variable.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 24 ++++---------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 8d1efd533..6da9d7134 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -83,9 +83,7 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.world_size = comm.nranks
         self.nranks_per_node = comm.nranks_per_node
         nvlink_domain_nranks = int(os.environ.get("MSCCLPP_IPC_DOMAIN_NRANKS", "0"))
-        self.mnnvl_domain = self.world_size > 1 and nvlink_domain_nranks >= self.world_size
-        self.multi_node = self.world_size > self.nranks_per_node and not self.mnnvl_domain
-        self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > self.nranks_per_node
+        self.multi_host_mnnvl = nvlink_domain_nranks >= self.world_size and self.world_size > self.nranks_per_node
         self.symmetric_memory = symmetric_memory
         self._nvls = mscclpp.is_nvls_supported()
 
@@ -108,7 +106,7 @@ def _default_ar_config(self):
         pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
         if self._nvls and pkt:
             return (pkt, 0, 0)
-        if self.multi_node or self.multi_host_mnnvl:
+        if self.multi_host_mnnvl:
             rsag = self._algo("allreduce", "default_allreduce_rsag")
             if rsag:
                 return (rsag, 0, 0)
@@ -194,18 +192,6 @@ def _ar_candidates(self, size: int):
                 if a:
                     out.append(a)
             return out
-        if self.multi_node:
-            a = self._algo("allreduce", "default_allreduce_nvls_packet")
-            if self._nvls and a:
-                out.append(a)
-            a = self._algo("allreduce", "default_allreduce_packet")
-            if a:
-                out.append(a)
-            if size >= 512 << 10:
-                a = self._algo("allreduce", "default_allreduce_rsag")
-                if a:
-                    out.append(a)
-            return out
         if size <= 4 << 20:
             a = self._algo("allreduce", "default_allreduce_packet")
             if a:
@@ -230,7 +216,7 @@ def _ar_candidates(self, size: int):
         return out
 
     def _ag_candidates(self):
-        if self.multi_node or self.multi_host_mnnvl:
+        if self.multi_host_mnnvl:
             return []
         a = self._algo("allgather", "default_allgather_fullmesh2")
         return [a] if a else []
@@ -356,7 +342,7 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, acc
         )
 
     def all_gather(self, output_tensor, input_tensor, stream=None):
-        if self.multi_node or self.multi_host_mnnvl:
+        if self.multi_host_mnnvl:
             raise RuntimeError("all_gather in this example currently supports only single-node runs")
         sz = _round_pow2(input_tensor.nbytes)
         if sz not in self._tune_cache["allgather"]:
@@ -497,7 +483,7 @@ def main():
     cc.barrier()
     torch.cuda.synchronize()
 
-    if cc.multi_node or cc.multi_host_mnnvl:
+    if cc.multi_host_mnnvl:
         if cc.rank == 0:
             print("Skipping allgather benchmark on multi-node: this example's allgather path is single-node only.")
     else:

From 4a0d5b29d509b00268a64f6e0a5b4db602e8cb46 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 21:14:36 +0000
Subject: [PATCH 16/44] Simplify torch-integration tuning example

- Drop the multi_host_mnnvl-specific rsag fallback in _default_ar_config;
  fall through to default_allreduce_packet when NVLS is unavailable.
- Add SYMMETRIC_MEMORY env var so the tuning sweep can include the
  zero-copy NVLS / RSAG candidates without editing the source.
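- Make _algo() raise on miss (direct dict lookup) and drop the
  defensive 'if a:' guards in _ar_candidates / _ag_candidates /
  _default_ar_config; merge existence checks into the platform
  conditions (self._nvls, self.symmetric_memory).
- Always pass sym=True for the internal barrier allreduce in
  _barrier_internal instead of forwarding self.symmetric_memory.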

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 73 +++++++------------
 1 file changed, 26 insertions(+), 47 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 6da9d7134..18fdd6f14 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -99,17 +99,12 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self._time_buf = None
 
     def _algo(self, collective: str, name: str):
-        return self._algos.get((collective, name))
+        return self._algos[(collective, name)]
 
     def _default_ar_config(self):
         """Fallback allreduce config for barrier / timing sync."""
-        pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
-        if self._nvls and pkt:
-            return (pkt, 0, 0)
-        if self.multi_host_mnnvl:
-            rsag = self._algo("allreduce", "default_allreduce_rsag")
-            if rsag:
-                return (rsag, 0, 0)
+        if self._nvls:
+            return (self._algo("allreduce", "default_allreduce_nvls_packet"), 0, 0)
         return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
 
     # -- low-level execute --
@@ -157,7 +152,7 @@ def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
 
     def _barrier_internal(self):
         a, nb, nt = self._default_ar_config()
-        self._exec_ar(self._barrier_tensor, a, nb, nt, sym=self.symmetric_memory)
+        self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
 
     # -- lazy tuning --
 
@@ -173,53 +168,33 @@ def _ar_candidates(self, size: int):
         if self.multi_host_mnnvl:
             if size <= 4 << 20:
                 if size <= 128 << 10:
-                    a = self._algo("allreduce", "default_allreduce_allpair_packet")
-                    if a:
-                        out.append(a)
-                if size <= 64 << 10:
-                    a = self._algo("allreduce", "default_allreduce_nvls_packet")
-                    if self._nvls and a:
-                        out.append(a)
+                    out.append(self._algo("allreduce", "default_allreduce_allpair_packet"))
+                if size <= 64 << 10 and self._nvls:
+                    out.append(self._algo("allreduce", "default_allreduce_nvls_packet"))
                 if size > 128 << 10:
-                    a = self._algo("allreduce", "default_allreduce_packet")
-                    if a:
-                        out.append(a)
+                    out.append(self._algo("allreduce", "default_allreduce_packet"))
             if size >= 512 << 10:
-                a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
-                if self._nvls and self.symmetric_memory and a:
-                    out.append(a)
-                a = self._algo("allreduce", "default_allreduce_rsag")
-                if a:
-                    out.append(a)
+                if self._nvls and self.symmetric_memory:
+                    out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy"))
+                out.append(self._algo("allreduce", "default_allreduce_rsag"))
             return out
         if size <= 4 << 20:
-            a = self._algo("allreduce", "default_allreduce_packet")
-            if a:
-                out.append(a)
-            a = self._algo("allreduce", "default_allreduce_allpair_packet")
-            if a:
-                out.append(a)
-            a = self._algo("allreduce", "default_allreduce_nvls_packet")
-            if self._nvls and a:
-                out.append(a)
+            out.append(self._algo("allreduce", "default_allreduce_packet"))
+            out.append(self._algo("allreduce", "default_allreduce_allpair_packet"))
+            if self._nvls:
+                out.append(self._algo("allreduce", "default_allreduce_nvls_packet"))
         if size >= 512 << 10:
-            a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
-            if a:
-                out.append(a)
-            a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
-            if self._nvls and self.symmetric_memory and a:
-                out.append(a)
+            out.append(self._algo("allreduce", "default_allreduce_rsag_zero_copy"))
+            if self._nvls and self.symmetric_memory:
+                out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy"))
         if torch.version.hip is not None:
-            a = self._algo("allreduce", "default_allreduce_fullmesh")
-            if a:
-                out.append(a)
+            out.append(self._algo("allreduce", "default_allreduce_fullmesh"))
         return out
 
     def _ag_candidates(self):
         if self.multi_host_mnnvl:
             return []
-        a = self._algo("allgather", "default_allgather_fullmesh2")
-        return [a] if a else []
+        return [self._algo("allgather", "default_allgather_fullmesh2")]
 
     def _run_tune(self, collective, algo, buf, size, nb, nt):
         """Single tune invocation for either collective."""
@@ -474,11 +449,15 @@ def main():
     accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
     accum_str = os.environ.get("ACCUM_DTYPE")
     accum_dtype = accum_map.get(accum_str) if accum_str else None
+    symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "0") == "1"
 
     comm_group = init_dist()
-    cc = CustomizedComm(comm_group)
+    cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory)
 
-    print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
+    print(
+        f"rank {local} starting benchmarks with dtype={dtype} "
+        f"accum_dtype={accum_dtype} symmetric_memory={symmetric_memory}..."
+    )
     benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
     cc.barrier()
     torch.cuda.synchronize()

From 307a4718884a59dd2acead9ca899a1667598b470 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 21:37:09 +0000
Subject: [PATCH 17/44] Shorten verbose comments and use THROW in
 validateIpcDomainSpansWorld

- Collapse the duplicated 3-line warp-strided-load comment in 5 kernels
  (allgather_fullmesh, allreduce_fullmesh, allreduce_packet,
  allreduce_nvls_zero_copy, allreduce_nvls_warp_pipeline) into a single
  one-line 'Peer count may exceed WARP_SIZE on MNNVL.' note.
- Drop the algName parameter from validateIpcDomainSpansWorld; switch
  its 3 throws to use the THROW logger macro (LogSubsys::ALGO), which
  already captures file/line/function. Update the 3 callsites
  (nvls_block_pipeline, nvls_warp_pipeline, nvls_zero_copy) and trim the
  Doxygen comment accordingly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../allgather/allgather_fullmesh.cu           |  4 +---
 .../allreduce/allreduce_fullmesh.cu           |  4 +---
 .../allreduce_nvls_block_pipeline.cu          |  2 +-
 .../allreduce/allreduce_nvls_warp_pipeline.cu |  6 ++----
 .../allreduce/allreduce_nvls_zero_copy.cu     |  6 ++----
 .../collectives/allreduce/allreduce_packet.cu |  4 +---
 src/ext/collectives/collective_utils.cc       | 20 +++++++++----------
 .../collectives/include/collective_utils.hpp  | 10 +++-------
 8 files changed, 21 insertions(+), 35 deletions(-)

diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index 8ce77fca1..a4196c6cd 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -30,9 +30,7 @@ __global__ void __launch_bounds__(1024, 1)
 
   __shared__ DeviceHandle<MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  // Each warp redundantly loads all entries (same value, benign race) so that
-  // every warp has the data its threads will read after __syncwarp(). Required
-  // when nPeer > WARP_SIZE (MNNVL/NVL72 scale).
+  // Peer count may exceed WARP_SIZE on MNNVL.
   for (int i = lid; i < nPeer; i += WARP_SIZE) {
     channels[i] = memoryChans[i];
   }
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index f1d815604..ef7ecf74d 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -52,9 +52,7 @@ __global__ void __launch_bounds__(512, 1)
   __shared__ DeviceHandle<MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   __shared__ DeviceHandle<MemoryChannel> outChannels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  // Each warp redundantly loads all entries (same value, benign race) so that
-  // every warp has the data its threads will read after __syncwarp(). Required
-  // when nPeer > WARP_SIZE (MNNVL/NVL72 scale).
+  // Peer count may exceed WARP_SIZE on MNNVL.
   for (int i = lid; i < nPeer; i += WARP_SIZE) {
     channels[i] = memoryChans[i];
     outChannels[i] = memoryOutChans[i];
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 4eeb03355..8c4a1e236 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -178,7 +178,7 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsBlockPipeline");
+  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm);
   // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel).
   if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) {
     throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) +
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 05e4f747f..950c287bf 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -59,9 +59,7 @@ __global__ void __launch_bounds__(1024, 1)
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2];
   const int lid = threadIdx.x % WARP_SIZE;
-  // Each warp redundantly loads all entries (same value, benign race) so that
-  // every warp has the data its threads will read after __syncwarp(). Required
-  // when nPeers*2 > WARP_SIZE (MNNVL scale).
+  // Peer count may exceed WARP_SIZE on MNNVL.
   for (int i = lid; i < nPeers * 2; i += WARP_SIZE) {
     channels[i] = memoryChans[i];
   }
@@ -144,7 +142,7 @@ struct NvlsWarpPipelineAdapter {
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
-  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsWarpPipeline");
+  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm);
   // The warp-pipeline kernel addresses 2 * nPeers entries per block in `memoryChannels`,
   // so per-peer base channel allocation must be at least `2 * nBlocks`. Default
   // nBlocks = 4 * ipcDomainNranks (see allreduceKernelFunc), so size accordingly.
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 5d6fc4d37..6ab0cd639 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -45,9 +45,7 @@ __global__ void __launch_bounds__(1024, 1)
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
-  // Each warp redundantly loads all entries (same value, benign race) so that
-  // every warp has the data its threads will read after __syncwarp(). Required
-  // when nPeers > WARP_SIZE (MNNVL/NVL72 → 71 peers).
+  // Peer count may exceed WARP_SIZE on MNNVL.
   for (int i = lid; i < ipcDomainNranks - 1; i += WARP_SIZE) {
     channels[i] = memoryChans[i];
   }
@@ -107,7 +105,7 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
   MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
   computeCapabilityMajor_ = deviceProp.major;
   nSwitchChannels_ = 32;
-  validateIpcDomainSpansWorld(comm, "AllreduceNvls");
+  validateIpcDomainSpansWorld(comm);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index cc91370ca..7bc9a85f1 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -80,9 +80,7 @@ __global__ void __launch_bounds__(1024, 1)
   // Put channels into shared memory, read channel info from global memory is unexpectable slow.
   __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = tid % WARP_SIZE;
-  // Each warp redundantly loads all entries (same value, benign race) so that
-  // every warp has the data its threads will read after __syncwarp(). Required
-  // when nPeers > WARP_SIZE (MNNVL/NVL72 scale).
+  // Peer count may exceed WARP_SIZE on MNNVL.
   for (int i = lid; i < nPeers; i += WARP_SIZE) {
     channels[i] = memoryChannels[i];
   }
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index 33b6ef779..e4eb7142c 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -11,6 +11,8 @@
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
 
+#include "logger.hpp"
+
 namespace mscclpp {
 namespace collective {
 std::vector<mscclpp::RegisteredMemory> setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
@@ -79,24 +81,22 @@ int getIpcDomainNranks(std::shared_ptr<mscclpp::Communicator> comm) {
   return comm->bootstrap()->getNranksPerNode();
 }
 
-int validateIpcDomainSpansWorld(std::shared_ptr<mscclpp::Communicator> comm, const char* algName) {
+int validateIpcDomainSpansWorld(std::shared_ptr<mscclpp::Communicator> comm) {
   const int ipcDomainNranks = getIpcDomainNranks(comm);
   const int worldSize = comm->bootstrap()->getNranks();
   const int rank = comm->bootstrap()->getRank();
   if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) {
-    throw mscclpp::Error(std::string(algName) + ": ipcDomainNranks " + std::to_string(ipcDomainNranks) +
-                             " is out of supported range [2, " + std::to_string(MAX_IPC_DOMAIN_NRANKS) + "]",
-                         mscclpp::ErrorCode::InvalidUsage);
+    THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "ipcDomainNranks ",
+          ipcDomainNranks, " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]");
   }
   if (worldSize != ipcDomainNranks) {
-    throw mscclpp::Error(std::string(algName) + " requires worldSize == ipcDomainNranks (got worldSize=" +
-                             std::to_string(worldSize) + ", ipcDomainNranks=" + std::to_string(ipcDomainNranks) + ")",
-                         mscclpp::ErrorCode::InvalidUsage);
+    THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+          "requires worldSize == ipcDomainNranks (got worldSize=", worldSize, ", ipcDomainNranks=", ipcDomainNranks,
+          ")");
   }
   if (rank < 0 || rank >= ipcDomainNranks) {
-    throw mscclpp::Error(std::string(algName) + ": rank " + std::to_string(rank) + " out of [0, " +
-                             std::to_string(ipcDomainNranks) + ")",
-                         mscclpp::ErrorCode::InvalidUsage);
+    THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ",
+          ipcDomainNranks, ")");
   }
   return ipcDomainNranks;
 }
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 892df3b11..6b0c6ab48 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -60,13 +60,9 @@ int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
 
 /// Validates that the IPC domain spans the whole communicator and that the local rank fits within
 /// the supported `[2, MAX_IPC_DOMAIN_NRANKS]` range. Used by NVLS allreduce algorithms whose
-/// multicast group spans the whole communicator (see `setupNvlsConnections`) and whose kernels
-/// use the global rank to compute per-rank offsets while sizing per-rank work by
-/// `ipcDomainNranks`. These assumptions only hold when the IPC-reachable peer group is exactly
-/// the whole communicator (e.g. a fully populated MNNVL fabric like NVL72). Returns the validated
-/// `ipcDomainNranks`; throws `Error(InvalidUsage)` on violation. `algName` is used as a prefix
-/// in error messages.
-int validateIpcDomainSpansWorld(std::shared_ptr<Communicator> comm, const char* algName);
+/// multicast group spans the whole communicator. Returns the validated `ipcDomainNranks`; throws
+/// `Error(InvalidUsage)` on violation.
+int validateIpcDomainSpansWorld(std::shared_ptr<Communicator> comm);
 
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<MemoryChannel>& memoryChannels);

From f0c6ac081f23425e3a91c1493a1f4c7f40909600 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 21:49:48 +0000
Subject: [PATCH 18/44] Fold validateIpcDomainSpansWorld into
 getIpcDomainNranks

getIpcDomainNranks now performs the range / world-size / rank checks
itself and throws on violation, so the separate
validateIpcDomainSpansWorld helper is unnecessary. Update the 3 NVLS
callsites (block_pipeline, warp_pipeline, nvls_zero_copy) to call
getIpcDomainNranks directly. The non-NVLS callers also pick up the
strict validation, which is fine because they are only invoked in
single-host or multi-host MNNVL scenarios where worldSize ==
ipcDomainNranks (the NCCL adapter's multi-node path returns nullptr,
falling back to NCCL/RCCL).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../allreduce/allreduce_nvls_block_pipeline.cu    |  2 +-
 .../allreduce/allreduce_nvls_warp_pipeline.cu     |  2 +-
 .../allreduce/allreduce_nvls_zero_copy.cu         |  2 +-
 src/ext/collectives/collective_utils.cc           |  9 +--------
 src/ext/collectives/include/collective_utils.hpp  | 15 ++++-----------
 5 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 8c4a1e236..f5c0d2f85 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -178,7 +178,7 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm);
+  ipcDomainNranks_ = getIpcDomainNranks(comm);
   // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel).
   if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) {
     throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) +
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 950c287bf..02b899aa2 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -142,7 +142,7 @@ struct NvlsWarpPipelineAdapter {
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
-  ipcDomainNranks_ = validateIpcDomainSpansWorld(comm);
+  ipcDomainNranks_ = getIpcDomainNranks(comm);
   // The warp-pipeline kernel addresses 2 * nPeers entries per block in `memoryChannels`,
   // so per-peer base channel allocation must be at least `2 * nBlocks`. Default
   // nBlocks = 4 * ipcDomainNranks (see allreduceKernelFunc), so size accordingly.
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 6ab0cd639..115a229ae 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -105,7 +105,7 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
   MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
   computeCapabilityMajor_ = deviceProp.major;
   nSwitchChannels_ = 32;
-  validateIpcDomainSpansWorld(comm);
+  getIpcDomainNranks(comm);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index e4eb7142c..6acfd7ce0 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -75,14 +75,7 @@ std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> setupMemoryS
 
 int getIpcDomainNranks(std::shared_ptr<mscclpp::Communicator> comm) {
   const int envValue = mscclpp::env()->ipcDomainNranks;
-  if (envValue > 0) {
-    return envValue;
-  }
-  return comm->bootstrap()->getNranksPerNode();
-}
-
-int validateIpcDomainSpansWorld(std::shared_ptr<mscclpp::Communicator> comm) {
-  const int ipcDomainNranks = getIpcDomainNranks(comm);
+  const int ipcDomainNranks = (envValue > 0) ? envValue : comm->bootstrap()->getNranksPerNode();
   const int worldSize = comm->bootstrap()->getNranks();
   const int rank = comm->bootstrap()->getRank();
   if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) {
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 6b0c6ab48..280a63328 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -51,18 +51,11 @@ std::vector<Connection> setupConnections(std::shared_ptr<Communicator> comm);
 std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> setupMemorySemaphores(
     std::shared_ptr<Communicator> comm, const std::vector<Connection>& connections, int nChannelsPerConnection);
 
-/// Number of ranks that participate in the same GPU-IPC-reachable peer group (e.g. a single host or
-/// a Multi-Node NVLink fabric, or an AMD XGMI domain). Returns the value of `MSCCLPP_IPC_DOMAIN_NRANKS`
-/// if set to a positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. This is
-/// intentionally independent of `Bootstrap::getNranksPerNode()` so that algorithms can opt in to
-/// MNNVL-like behavior without changing the meaning of bootstrap-level APIs.
-int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
-
-/// Validates that the IPC domain spans the whole communicator and that the local rank fits within
-/// the supported `[2, MAX_IPC_DOMAIN_NRANKS]` range. Used by NVLS allreduce algorithms whose
-/// multicast group spans the whole communicator. Returns the validated `ipcDomainNranks`; throws
+/// Returns the IPC-reachable peer-group size, validated to span the whole communicator and
+/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. Reads `MSCCLPP_IPC_DOMAIN_NRANKS` if set to a
+/// positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws
 /// `Error(InvalidUsage)` on violation.
-int validateIpcDomainSpansWorld(std::shared_ptr<Communicator> comm);
+int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
 
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<MemoryChannel>& memoryChannels);

From bde23ce38e6399e52d4662018935863d5654fd4a Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 22:16:08 +0000
Subject: [PATCH 19/44] Revert verbose RSAG zero-copy comment; rename
 NRanksPerNode template param

- Restore the original two-line note about the templated peer-loop
  unrolling instead of the multi-paragraph rationale block.
- Rename the kernel template parameter from NRanksPerNode to NRanks.
  The IPC domain can span multiple physical hosts under MNNVL, so the
  'PerNode' suffix is misleading; NRanks matches the runtime
  ipcDomainNranks parameter that drives template dispatch.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../allreduce/allreduce_rsag_zero_copy.cu     | 25 +++++++------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index a20756aee..c678c2670 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -35,17 +35,10 @@ __device__ mscclpp::DeviceSyncer globalSyncer;
 //
 // This approach requires registering both input and output buffers as remote
 // memories (2 * nPeers handles), but avoids scratch buffer allocation and
-// the extra copy steps of the standard RSAG.
-//
-// The kernel is templated on NRanksPerNode so the compiler can keep an int4
-// register array of NPeers elements, #pragma unroll the peer loops, and turn
-// the per-iteration modulo into a single AND. This issues all NPeers remote
-// reads in parallel so their latency is overlapped instead of serialized.
-// Only small fixed sizes ({4, 8}) are instantiated; larger MNNVL domains
-// (where the int4 array would spill out of registers) must use a different
-// algorithm.
+// the extra copy steps of the standard RSAG. The NRanks template
+// parameter enables compile-time unrolling of peer loops (supports 4 or 8).
 
-template <int NRanksPerNode, ReduceOp OpType, typename T, typename AccumT = T>
+template <int NRanks, ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
                           DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int worldSize,
@@ -55,12 +48,12 @@ __global__ void __launch_bounds__(1024, 1)
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
-  constexpr int NPeers = NRanksPerNode - 1;
+  constexpr int NPeers = NRanks - 1;
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
   constexpr uint32_t outputRemoteBufferOffset = NPeers;
-  uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * NRanksPerNode;
-  uint32_t nelemsPerRank = alignedNelems / NRanksPerNode;
+  uint32_t alignedNelems =
+      ((nelems + NRanks - 1) / NRanks + nelemsPerInt4 - 1) / nelemsPerInt4 * nelemsPerInt4 * NRanks;
+  uint32_t nelemsPerRank = alignedNelems / NRanks;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4;
 
@@ -93,7 +86,7 @@ __global__ void __launch_bounds__(1024, 1)
     int4 tmp_raw = buff4[offset];
 #pragma unroll
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % NRanksPerNode;
+      int rankIdx = (rank + i + 1) % NRanks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       data[i] = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
     }
@@ -105,7 +98,7 @@ __global__ void __launch_bounds__(1024, 1)
     int4 tmp = mscclpp::downcastVector<T, AccumT, int4>(acc);
 #pragma unroll
     for (int i = 0; i < NPeers; i++) {
-      int rankIdx = (rank + i + 1) % NRanksPerNode;
+      int rankIdx = (rank + i + 1) % NRanks;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp);
     }

From 095cfff11d00e93a09f24ee391161a6f1209dc1b Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 22:23:18 +0000
Subject: [PATCH 20/44] Revert RSAG nBlocks default to 64

The 128-block default fires only when the caller passes nBlocks=0 or
nThreadsPerBlock=0 (i.e. no tuning). Tuning explicitly drives nBlocks
via the adapter, so the historical default of 64 is fine. Keep
nChannelsPerConnection_=128 so the tuner can still request up to 128
blocks for MNNVL configs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/ext/collectives/allreduce/allreduce_rsag.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index 93e2d0c46..4dcceb48e 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -133,7 +133,7 @@ struct AllreduceRsAgAdapter {
     size_t nelems = inputSize / sizeof(T);
     if (nBlocks == 0 || nThreadsPerBlock == 0) {
       nThreadsPerBlock = 1024;
-      nBlocks = 128;
+      nBlocks = 64;
     }
     allreduceRsAg<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,

From 639b80de7b5fd031a283967c77e2de65103ce379 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 22:31:15 +0000
Subject: [PATCH 21/44] Tie AllreduceAllpairPacket maxBlockNum_ to
 MAX_IPC_DOMAIN_NRANKS - 1

The hard-coded 72 was off by one from what the comment claims is the
minimum (MAX_IPC_DOMAIN_NRANKS - 1 = 71). Express the value via the
constant so the relationship is self-documenting and any future change
to MAX_IPC_DOMAIN_NRANKS propagates automatically.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../collectives/include/allreduce/allreduce_allpair_packet.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
index 79c211b39..d2ea7259e 100644
--- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
@@ -4,6 +4,7 @@
 #include <mscclpp/algorithm.hpp>
 
 #include "allreduce/common.hpp"
+#include "collective_utils.hpp"
 
 namespace mscclpp {
 namespace collective {
@@ -31,7 +32,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
   const int nSegmentsForScratchBuffer_ = 2;
   // Must be at least MAX_IPC_DOMAIN_NRANKS-1 so the adapter can launch one
   // block per peer at MNNVL scale.
-  const int maxBlockNum_ = 72;
+  const int maxBlockNum_ = MAX_IPC_DOMAIN_NRANKS - 1;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
   std::vector<RegisteredMemory> registeredMemories_;

From e8caab7c8e866d6ff89e86d7bcf7c0f011f19021 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 23:04:41 +0000
Subject: [PATCH 22/44] Strip preflight validation blocks from NVLS pipeline
 allreduce kernels

allreduce_nvls_block_pipeline.cu and allreduce_nvls_warp_pipeline.cu
were carrying ~45 lines of per-call invariant-checking added during the
MNNVL work. Restore main's simple defaulting pattern (just
`if (nBlocks == 0 || nThreadsPerBlock == 0) set defaults`); incorrect
inputs will manifest as CUDA errors via the existing error-handling
path. Also drop the unreachable
`6 * ipcDomainNranks > NUM_SEMAPHORES` throw in the block_pipeline
initialize (max ipcDomainNranks=72, NUM_SEMAPHORES=512), the now-unused
`<mscclpp/errors.hpp>` include, and trim the verbose comments around
`nBaseChannels_` sizing in both files.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../allreduce_nvls_block_pipeline.cu          | 48 +--------------
 .../allreduce/allreduce_nvls_warp_pipeline.cu | 59 +------------------
 2 files changed, 6 insertions(+), 101 deletions(-)

diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index f5c0d2f85..9d3316e4c 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -3,7 +3,6 @@
 
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
-#include <mscclpp/errors.hpp>
 
 #include "allreduce/allreduce_nvls_block_pipeline.hpp"
 #include "allreduce/common.hpp"
@@ -179,14 +178,7 @@ struct NvlsBlockPipelineAdapter {
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
   ipcDomainNranks_ = getIpcDomainNranks(comm);
-  // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel).
-  if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) {
-    throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) +
-                    " exceeds NUM_SEMAPHORES capacity (" + std::to_string(NUM_SEMAPHORES) + ")",
-                ErrorCode::InvalidUsage);
-  }
-  // The kernel addresses up to `2 * nBlocksForCopy = 4 * ipcDomainNranks` distinct entries
-  // per peer in `memoryChannels`. Scale the per-connection allocation to match.
+  // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel).
   nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_);
   this->conns_ = setupConnections(comm);
   // setup semaphores
@@ -208,43 +200,9 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
   }
-  const int requiredBlocks = ctx->ipcDomainNranks * 5;
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
-  if (blockAndThreadNum.first == 0) blockAndThreadNum.first = requiredBlocks;
-  if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024;
-  if (blockAndThreadNum.first != requiredBlocks) {
-    WARN("AllreduceNvlsBlockPipeline requires nBlocks == 5 * ipcDomainNranks (got %d, expected %d)",
-         blockAndThreadNum.first, requiredBlocks);
-    return CommResult::CommInvalidArgument;
-  }
-  if (blockAndThreadNum.second != 1024) {
-    WARN("AllreduceNvlsBlockPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second);
-    return CommResult::CommInvalidArgument;
-  }
-  // Validate input alignment/divisibility expectations of the kernel.
-  constexpr size_t kKernelAlignment = 16;
-  const size_t perRankBytes = inputSize / ctx->ipcDomainNranks;
-  if (perRankBytes * static_cast<size_t>(ctx->ipcDomainNranks) != inputSize || perRankBytes % kKernelAlignment != 0) {
-    WARN(
-        "AllreduceNvlsBlockPipeline requires inputSize %% (ipcDomainNranks * %zu) == 0 (got inputSize=%zu, "
-        "ipcDomainNranks=%d)",
-        kKernelAlignment, inputSize, ctx->ipcDomainNranks);
-    return CommResult::CommInvalidArgument;
-  }
-  // Validate scratch is large enough for at least one pipeline iteration. The kernel
-  // computes scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) aligned down
-  // to unitSize; if this is 0, maxItersForScratch is 0 and the kernel deadlocks.
-  const size_t unitSize = (inputSize <= static_cast<size_t>(1024) * 1024 * 128) ? (1ULL << 16) : (1ULL << 17);
-  const size_t scratchSizePerRank = this->scratchBufferSize_ / ctx->ipcDomainNranks;
-  const size_t nBlocksForCopy = static_cast<size_t>(ctx->ipcDomainNranks) * 2;
-  const size_t scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) / unitSize * unitSize;
-  if (scratchSizePerBlock < unitSize) {
-    WARN(
-        "AllreduceNvlsBlockPipeline scratch buffer too small for ipcDomainNranks=%d and inputSize=%zu "
-        "(scratchBufferSize=%zu, need at least ~%zu bytes)",
-        ctx->ipcDomainNranks, inputSize, this->scratchBufferSize_,
-        static_cast<size_t>(ctx->ipcDomainNranks) * nBlocksForCopy * unitSize);
-    return CommResult::CommInvalidArgument;
+  if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
+    blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 02b899aa2..73ecdab9d 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -3,7 +3,6 @@
 
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
-#include <mscclpp/errors.hpp>
 
 #include "allreduce/allreduce_nvls_warp_pipeline.hpp"
 #include "allreduce/common.hpp"
@@ -143,9 +142,7 @@ struct NvlsWarpPipelineAdapter {
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
   ipcDomainNranks_ = getIpcDomainNranks(comm);
-  // The warp-pipeline kernel addresses 2 * nPeers entries per block in `memoryChannels`,
-  // so per-peer base channel allocation must be at least `2 * nBlocks`. Default
-  // nBlocks = 4 * ipcDomainNranks (see allreduceKernelFunc), so size accordingly.
+  // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks.
   nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_);
   this->conns_ = setupConnections(comm);
   // setup semaphores
@@ -168,58 +165,8 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
     return CommResult::CommInvalidArgument;
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
-  if (blockAndThreadNum.first == 0) {
-    // Default to 4 * ipcDomainNranks blocks, rounded up to a multiple of NUM_NVLS_CONNECTION
-    // so that nBlocks / NUM_NVLS_CONNECTION partitioning in the kernel is well-defined.
-    int defaultBlocks = ctx->ipcDomainNranks * 4;
-    defaultBlocks = ((defaultBlocks + NUM_NVLS_CONNECTION - 1) / NUM_NVLS_CONNECTION) * NUM_NVLS_CONNECTION;
-    blockAndThreadNum.first = std::max(defaultBlocks, NUM_NVLS_CONNECTION);
-  }
-  if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024;
-  // The kernel computes nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION and indexes the
-  // multicast handle array with bid / nBlocksPerNvlsConn; both must be safe.
-  if (blockAndThreadNum.first < NUM_NVLS_CONNECTION || blockAndThreadNum.first % NUM_NVLS_CONNECTION != 0) {
-    WARN("AllreduceNvlsWarpPipeline requires nBlocks to be a positive multiple of %d (got %d)", NUM_NVLS_CONNECTION,
-         blockAndThreadNum.first);
-    return CommResult::CommInvalidArgument;
-  }
-  // Each block uses 2 * nPeers consecutive entries in `memoryChannels`, so the per-peer
-  // base-channel allocation must support 2 * nBlocks distinct entries.
-  if (2 * blockAndThreadNum.first > this->nBaseChannels_) {
-    WARN(
-        "AllreduceNvlsWarpPipeline: nBlocks %d exceeds channel allocation (nBaseChannels=%d, "
-        "ipcDomainNranks=%d). Increase MSCCLPP_IPC_DOMAIN_NRANKS-aware sizing or reduce nBlocks.",
-        blockAndThreadNum.first, this->nBaseChannels_, ctx->ipcDomainNranks);
-    return CommResult::CommInvalidArgument;
-  }
-  // The kernel hard-codes 14 + 4 + 14 = 32 warps per block and bar.sync member counts
-  // computed from these constants; deviating from 1024 threads breaks those barriers.
-  if (blockAndThreadNum.second != 1024) {
-    WARN("AllreduceNvlsWarpPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second);
-    return CommResult::CommInvalidArgument;
-  }
-  // Validate input divisibility by ipcDomainNranks (kernel computes size / ipcDomainNranks).
-  if (inputSize % static_cast<size_t>(ctx->ipcDomainNranks) != 0) {
-    WARN("AllreduceNvlsWarpPipeline requires inputSize %% ipcDomainNranks == 0 (got inputSize=%zu, ipcDomainNranks=%d)",
-         inputSize, ctx->ipcDomainNranks);
-    return CommResult::CommInvalidArgument;
-  }
-  // Validate scratch is large enough for at least one pipeline iteration. The kernel
-  // computes scratchSizePerBlock = (scratchSizePerRank / nBlocks) aligned down to copyPerIter;
-  // if this is 0 the modulo offset arithmetic divides by zero.
-  const size_t sizePerRank = inputSize / static_cast<size_t>(ctx->ipcDomainNranks);
-  const size_t maxSizePerBlock = ((sizePerRank + blockAndThreadNum.first - 1) / blockAndThreadNum.first + 15) / 16 * 16;
-  const size_t copyPerIter = (maxSizePerBlock >= 1024 * 64) ? (1024 * 32) : (1024 * 16);
-  const size_t scratchSizePerRank = this->scratchBufferSize_ / static_cast<size_t>(ctx->ipcDomainNranks);
-  const size_t scratchSizePerBlock =
-      (scratchSizePerRank / static_cast<size_t>(blockAndThreadNum.first)) / copyPerIter * copyPerIter;
-  if (scratchSizePerBlock < copyPerIter) {
-    WARN(
-        "AllreduceNvlsWarpPipeline scratch buffer too small for ipcDomainNranks=%d, nBlocks=%d, inputSize=%zu "
-        "(scratchBufferSize=%zu, need at least ~%zu bytes)",
-        ctx->ipcDomainNranks, blockAndThreadNum.first, inputSize, this->scratchBufferSize_,
-        static_cast<size_t>(ctx->ipcDomainNranks) * static_cast<size_t>(blockAndThreadNum.first) * copyPerIter);
-    return CommResult::CommInvalidArgument;
+  if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
+    blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,

From 7d80a333603bb18e63798242682520c7e9a43c8b Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 6 May 2026 23:43:37 +0000
Subject: [PATCH 23/44] Default torch example SYMMETRIC_MEMORY env to 1

The non-symmetric rsag_zero_copy path uses an incrementing tag in its
context key, so cross-rank memory registration handshakes happen on
every call rather than being cached. On a single host with 8 GPUs and
sizes >= 512 KB this becomes the only candidate (since nvls_zero_copy
is filtered out without symmetric memory) and degrades into an apparent
hang. Defaulting SYMMETRIC_MEMORY=1 lets a plain `mpirun ...`
invocation work out of the box; users can still override with
`SYMMETRIC_MEMORY=0` to exercise the non-symmetric path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 examples/torch-integration/customized_comm_with_tuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 18fdd6f14..cbfb419d4 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -449,7 +449,7 @@ def main():
     accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
     accum_str = os.environ.get("ACCUM_DTYPE")
     accum_dtype = accum_map.get(accum_str) if accum_str else None
-    symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "0") == "1"
+    symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "1") == "1"
 
     comm_group = init_dist()
     cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory)

From d1b04a3b26567f7e27c0eefb52b8e4dcc273874a Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 7 May 2026 00:38:31 +0000
Subject: [PATCH 24/44] NVLS zero-copy allreduce: support FP16 accumulator for
 FP8 inputs

multimem.ld_reduce on FP8 inputs accumulates in FP32 by default. The
ISA also exposes an .acc::f16 variant that keeps the reduction in
FP16, which is faster but lower precision. Plumb AccumT through:

- include/mscclpp/switch_channel_device.hpp:
  Extend SwitchChannelDeviceHandle::multimemLoadReduce with an optional
  AccumT template parameter. When VectorType is one of the FP8 vector
  types (f8_e4m3x{4,8,16} / f8_e5m2x{4,8,16}) and AccumT is __half,
  emit the .acc::f16 form of the instruction; otherwise unchanged.

- src/ext/collectives/include/allreduce/common.hpp:
  Make handleMultiLoadReduceStore template on AccumT and forward it to
  multimemLoadReduce<vectorType, AccumT>(...).

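- src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu:
  Template allreduceNvls and NvlsAdapter on AccumT and forward to
  handleMultiLoadReduceStore<T, AccumT>; the existing dispatch<>
  machinery already plumbs AccumT through from the algorithm context.

- examples/torch-integration/customized_comm_with_tuning.py:
  Replace the nested if/else in _ar_candidates with the table-driven
  _AR_CANDIDATES_MNNVL / _AR_CANDIDATES_SINGLE lists (inclusive size
  bounds plus an optional predicate). The MNNVL table additionally
  offers default_allreduce_rsag_zero_copy for sizes >= 512 KB, and
  default_allreduce_packet is now also a candidate at exactly 128 KB.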

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 51 ++++++-----
 include/mscclpp/switch_channel_device.hpp     | 84 ++++++++++++++-----
 .../allreduce/allreduce_nvls_zero_copy.cu     | 12 +--
 .../collectives/include/allreduce/common.hpp  |  7 +-
 4 files changed, 100 insertions(+), 54 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index cbfb419d4..44a5c9c10 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -76,6 +76,25 @@ class CustomizedComm:
         "default_allreduce_fullmesh": 64,
         "default_allgather_fullmesh2": 32,
     }
+    # (algo_name, min_size, max_size, predicate)
+    # Boundaries are inclusive on both ends. max_size=None means unbounded.
+    # predicate=None means always applicable; otherwise a callable taking `self`.
+    _AR_CANDIDATES_MNNVL = [
+        ("default_allreduce_allpair_packet", 0, 128 << 10, None),
+        ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls),
+        ("default_allreduce_packet", 128 << 10, 4 << 20, None),
+        ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory),
+        ("default_allreduce_rsag_zero_copy", 512 << 10, None, None),
+        ("default_allreduce_rsag", 512 << 10, None, None),
+    ]
+    _AR_CANDIDATES_SINGLE = [
+        ("default_allreduce_packet", 0, 4 << 20, None),
+        ("default_allreduce_allpair_packet", 0, 4 << 20, None),
+        ("default_allreduce_nvls_packet", 0, 4 << 20, lambda c: c._nvls),
+        ("default_allreduce_rsag_zero_copy", 512 << 10, None, None),
+        ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory),
+        ("default_allreduce_fullmesh", 0, None, lambda c: torch.version.hip is not None),
+    ]
 
     def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.comm = comm
@@ -164,32 +183,12 @@ def _ensure_tune_bufs(self):
         return self._tune_buf
 
     def _ar_candidates(self, size: int):
-        out = []
-        if self.multi_host_mnnvl:
-            if size <= 4 << 20:
-                if size <= 128 << 10:
-                    out.append(self._algo("allreduce", "default_allreduce_allpair_packet"))
-                if size <= 64 << 10 and self._nvls:
-                    out.append(self._algo("allreduce", "default_allreduce_nvls_packet"))
-                if size > 128 << 10:
-                    out.append(self._algo("allreduce", "default_allreduce_packet"))
-            if size >= 512 << 10:
-                if self._nvls and self.symmetric_memory:
-                    out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy"))
-                out.append(self._algo("allreduce", "default_allreduce_rsag"))
-            return out
-        if size <= 4 << 20:
-            out.append(self._algo("allreduce", "default_allreduce_packet"))
-            out.append(self._algo("allreduce", "default_allreduce_allpair_packet"))
-            if self._nvls:
-                out.append(self._algo("allreduce", "default_allreduce_nvls_packet"))
-        if size >= 512 << 10:
-            out.append(self._algo("allreduce", "default_allreduce_rsag_zero_copy"))
-            if self._nvls and self.symmetric_memory:
-                out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy"))
-        if torch.version.hip is not None:
-            out.append(self._algo("allreduce", "default_allreduce_fullmesh"))
-        return out
+        table = self._AR_CANDIDATES_MNNVL if self.multi_host_mnnvl else self._AR_CANDIDATES_SINGLE
+        return [
+            self._algo("allreduce", name)
+            for name, lo, hi, pred in table
+            if size >= lo and (hi is None or size <= hi) and (pred is None or pred(self))
+        ]
 
     def _ag_candidates(self):
         if self.multi_host_mnnvl:
diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp
index b52b65723..7b749f7a9 100644
--- a/include/mscclpp/switch_channel_device.hpp
+++ b/include/mscclpp/switch_channel_device.hpp
@@ -37,7 +37,11 @@ struct SwitchChannelDeviceHandle {
     SwitchChannelDeviceHandle::multimemStore(val, reinterpret_cast<T*>(mcPtr) + index);
   }
 
-  template <typename VectorType>
+  /// Vectorized multimem load+reduce. The optional `AccumT` template parameter selects the
+  /// accumulator: when `AccumT == __half` and `VectorType` is an FP8 vector type, the
+  /// `.acc::f16` variant of the instruction is used (faster but lower precision than the
+  /// default FP32 accumulator). For all other types `AccumT` is ignored.
+  template <typename VectorType, typename AccumT = void>
   MSCCLPP_DEVICE_INLINE static VectorType multimemLoadReduce(VectorType* ptr) {
     VectorType val;
     if constexpr (std::is_same_v<VectorType, i32x1>) {
@@ -81,29 +85,71 @@ struct SwitchChannelDeviceHandle {
           : "l"(ptr)
           : "memory");
     } else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e4m3x4 %0, [%1];"
+            : "=r"(val.words[0])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];"
-          : "=r"(val.words[0]), "=r"(val.words[1])
-          : "l"(ptr)
-          : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e4m3x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e4m3x16>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
-          : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
-          : "l"(ptr)
-          : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e5m2x4>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e5m2x4 %0, [%1];"
+            : "=r"(val.words[0])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e5m2x8>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];"
-          : "=r"(val.words[0]), "=r"(val.words[1])
-          : "l"(ptr)
-          : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e5m2x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];"
+            : "=r"(val.words[0]), "=r"(val.words[1])
+            : "l"(ptr)
+            : "memory");
+      }
     } else if constexpr (std::is_same_v<VectorType, f8_e5m2x16>) {
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
-          : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
-          : "l"(ptr)
-          : "memory");
+      if constexpr (std::is_same_v<AccumT, __half>) {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      } else {
+        asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
+            : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
+            : "l"(ptr)
+            : "memory");
+      }
     } else {
       static_assert(dependentFalse<VectorType>, "Not supported type");
     }
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 115a229ae..99146779c 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -14,7 +14,7 @@ namespace collective {
 
 constexpr int MAX_NBLOCKS = 32;
 
-template <typename T>
+template <typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceNvls([[maybe_unused]] mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>* memoryChannels,
                   [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
@@ -58,8 +58,8 @@ __global__ void __launch_bounds__(1024, 1)
   T* src = (T*)multicastPtr->mcPtr;
   T* dst = (T*)multicastOutPtr->mcPtr;
   if (curBlockSize > 0) {
-    handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize,
-                               threadIdx.x, blockDim.x);
+    handleMultiLoadReduceStore<T, AccumT>(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset,
+                                          curBlockSize, threadIdx.x, blockDim.x);
   }
   __syncthreads();
   if (threadIdx.x < nPeers) {
@@ -90,9 +90,9 @@ struct NvlsAdapter {
 #endif
     {
       using ChannelType = DeviceHandle<mscclpp::BaseMemoryChannel>;
-      allreduceNvls<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
-                                                                 nvlsOutChannels, channelInOffset, channelOutOffset,
-                                                                 inputSize, rank, ipcDomainNranks);
+      allreduceNvls<T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+          (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize,
+          rank, ipcDomainNranks);
       return cudaGetLastError();
     }
   }
diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp
index 93b18e262..22513ace5 100644
--- a/src/ext/collectives/include/allreduce/common.hpp
+++ b/src/ext/collectives/include/allreduce/common.hpp
@@ -36,7 +36,7 @@ MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() {
   }
 }
 
-template <typename T>
+template <typename T, typename AccumT = T>
 MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t srcOffset, size_t dstOffset, size_t size,
                                                       int tid, int nThreads) {
   // nvls can only handle 4 bytes alignment
@@ -54,7 +54,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src
   vectorType* src4 = (vectorType*)src;
   vectorType* dst4 = (vectorType*)dst;
   for (size_t idx = tid; idx < nVec; idx += nThreads) {
-    auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx);
+    auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<vectorType, AccumT>(src4 + srcOffset4 + idx);
     mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx);
   }
   // handle rest of data
@@ -64,7 +64,8 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src
   const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType);
   const size_t endIdx = (srcOffset + size) / sizeof(restVectorType);
   for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) {
-    auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx);
+    auto val =
+        mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<restVectorType, AccumT>((restVectorType*)src + idx);
     mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx);
   }
 }

From 113d859d13f08ca9533e1ba5a0d4b645c26028ee Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 8 May 2026 03:00:53 +0000
Subject: [PATCH 25/44] Gate FP8 NVLS arch check on defined(__CUDA_ARCH__)

---
 src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 99146779c..ef6d216ca 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -83,7 +83,8 @@ struct NvlsAdapter {
       // fp8_e4m3b15 is a software-only type with no hardware NVLS support.
       return cudaErrorNotSupported;
     } else
-#if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)
+#if defined(__CUDA_ARCH__) && \
+    ((!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000))
         if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
       return cudaErrorNotSupported;
     } else

From 9ff7e1c2c38ebf798728fe77eef2f6c5e31989e5 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 8 May 2026 03:43:34 +0000
Subject: [PATCH 26/44] Drop FP8 arch preprocessor guard from NvlsAdapter

---
 .../collectives/allreduce/allreduce_nvls_zero_copy.cu    | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index ef6d216ca..63cbd057d 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -82,14 +82,7 @@ struct NvlsAdapter {
     } else if constexpr (std::is_same_v<T, __fp8_e4m3b15>) {
       // fp8_e4m3b15 is a software-only type with no hardware NVLS support.
       return cudaErrorNotSupported;
-    } else
-#if defined(__CUDA_ARCH__) && \
-    ((!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000))
-        if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
-      return cudaErrorNotSupported;
-    } else
-#endif
-    {
+    } else {
       using ChannelType = DeviceHandle<mscclpp::BaseMemoryChannel>;
       allreduceNvls<T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
           (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize,

From 654bcfa6ba403cc0c40f31cf2224b3f2fa524569 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 8 May 2026 03:54:32 +0000
Subject: [PATCH 27/44] Guard fp8 multimemLoadReduce branches with an arch-specific __CUDA_ARCH__ >= 1000 check

---
 include/mscclpp/switch_channel_device.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp
index 7b749f7a9..841b7f320 100644
--- a/include/mscclpp/switch_channel_device.hpp
+++ b/include/mscclpp/switch_channel_device.hpp
@@ -84,7 +84,9 @@ struct SwitchChannelDeviceHandle {
           : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
           : "l"(ptr)
           : "memory");
-    } else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
+    }
+#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000)
+    else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
       if constexpr (std::is_same_v<AccumT, __half>) {
         asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e4m3x4 %0, [%1];"
             : "=r"(val.words[0])
@@ -150,7 +152,9 @@ struct SwitchChannelDeviceHandle {
             : "l"(ptr)
             : "memory");
       }
-    } else {
+    }
+#endif
+    else {
       static_assert(dependentFalse<VectorType>, "Not supported type");
     }
     return val;

From 5516bdbb6be2b307754053898db6b940c83cb011 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 8 May 2026 04:22:50 +0000
Subject: [PATCH 28/44] Replace static_assert with runtime assert in multimem fallback branches

---
 include/mscclpp/switch_channel_device.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp
index 841b7f320..4e0396dd3 100644
--- a/include/mscclpp/switch_channel_device.hpp
+++ b/include/mscclpp/switch_channel_device.hpp
@@ -155,7 +155,7 @@ struct SwitchChannelDeviceHandle {
     }
 #endif
     else {
-      static_assert(dependentFalse<VectorType>, "Not supported type");
+      assert(false && "Unsupported vector type for multimemLoadReduce");
     }
     return val;
   };
@@ -219,7 +219,7 @@ struct SwitchChannelDeviceHandle {
                    "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
                    : "memory");
     } else {
-      static_assert(dependentFalse<VectorType>, "Not supported type");
+      assert(false && "Unsupported vector type for multimemStore");
     }
   };
 
@@ -244,7 +244,7 @@ struct SwitchChannelDeviceHandle {
     } else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __half2>) {
       asm volatile("multimem.red.relaxed.sys.global.add.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory");
     } else {
-      static_assert(dependentFalse<T>, "Not supported type");
+      assert(false && "Unsupported vector type for multimemStoreReduce");
     }
   };
 #endif  // defined(MSCCLPP_DEVICE_CUDA)

From e208cc326b95c9590c09b1bce35683d9580087a7 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 8 May 2026 04:30:05 +0000
Subject: [PATCH 29/44] Guard fp8 multimemStore branches with an arch-specific __CUDA_ARCH__ >= 1000 check

---
 include/mscclpp/switch_channel_device.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp
index 4e0396dd3..e95dfcf51 100644
--- a/include/mscclpp/switch_channel_device.hpp
+++ b/include/mscclpp/switch_channel_device.hpp
@@ -198,7 +198,9 @@ struct SwitchChannelDeviceHandle {
       asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
                    "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
                    : "memory");
-    } else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
+    }
+#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000)
+    else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
       asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory");
     } else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
       asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4  [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]),
@@ -218,7 +220,9 @@ struct SwitchChannelDeviceHandle {
       asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
                    "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
                    : "memory");
-    } else {
+    }
+#endif
+    else {
       assert(false && "Unsupported vector type for multimemStore");
     }
   };

From 825fc124a547c0fa1e5b6df9c79c42a1aebf627f Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sat, 9 May 2026 03:16:33 +0000
Subject: [PATCH 30/44] Add bootstrap barrier after NVLS packet channel setup to address hang issue

---
 src/ext/collectives/allreduce/allreduce_nvls_packet.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index 2ef0516e3..56455b6ea 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -83,6 +83,7 @@ void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator> comm) {
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
   this->switchChannels_ =
       setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
+  comm->bootstrap()->barrier();
 }
 
 AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {

From 224b3deb84fb2977318c56acdb7131c4e9f49eeb Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 13 May 2026 01:22:51 +0000
Subject: [PATCH 31/44] Clean up completed communicator receives

Erase completed receive bookkeeping from the communicator once the deferred receive future finishes, while preserving ordered receive chaining for repeated rank/tag operations.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/core/communicator.cc | 100 ++++++++++++++++++++++++---------------
 1 file changed, 63 insertions(+), 37 deletions(-)

diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index c95ca4213..97fadbbd0 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -3,10 +3,60 @@
 
 #include "communicator.hpp"
 
+#include <utility>
+
 #include "api.h"
 
 namespace mscclpp {
 
+namespace {
+
+template <typename Fn>
+class ScopeGuard {
+ public:
+  explicit ScopeGuard(Fn fn) : fn_(std::move(fn)) {}
+  ScopeGuard(const ScopeGuard&) = delete;
+  ScopeGuard& operator=(const ScopeGuard&) = delete;
+  ~ScopeGuard() { fn_(); }
+
+ private:
+  Fn fn_;
+};
+
+template <typename Fn>
+ScopeGuard<Fn> makeScopeGuard(Fn fn) {
+  return ScopeGuard<Fn>(std::move(fn));
+}
+
+template <typename T, typename Impl, typename Fn>
+std::shared_future<T> makeOrderedRecvFuture(Impl* impl, int remoteRank, int tag, Fn fn) {
+  auto thisRecvItem = std::make_shared<std::weak_ptr<BaseRecvItem>>();
+  auto future = std::async(std::launch::deferred, [impl, remoteRank, tag, thisRecvItem,
+                                                   lastRecvItem = impl->getLastRecvItem(remoteRank, tag),
+                                                   fn = std::move(fn)]() mutable {
+    [[maybe_unused]] auto cleanup = makeScopeGuard([impl, remoteRank, tag, thisRecvItem]() {
+      auto item = thisRecvItem->lock();
+      auto it = impl->lastRecvItems_.find({remoteRank, tag});
+      if (item && it != impl->lastRecvItems_.end() && it->second == item) {
+        impl->lastRecvItems_.erase(it);
+      }
+    });
+
+    if (lastRecvItem) {
+      // Recursive call to the previous receive items
+      lastRecvItem->wait();
+    }
+    return fn();
+  });
+  auto sharedFuture = std::shared_future<T>(std::move(future));
+  auto recvItem = std::make_shared<RecvItem<T>>(sharedFuture);
+  *thisRecvItem = recvItem;
+  impl->setLastRecvItem(remoteRank, tag, recvItem);
+  return sharedFuture;
+}
+
+}  // namespace
+
 Communicator::Impl::Impl(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context)
     : bootstrap_(bootstrap) {
   if (!context) {
@@ -83,19 +133,11 @@ MSCCLPP_API_CPP std::shared_future<RegisteredMemory> Communicator::recvMemory(in
     locRecvMemList.push_back(std::move(locRecvMem));
     return future;
   }
-  auto future = std::async(std::launch::deferred,
-                           [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() {
-                             if (lastRecvItem) {
-                               // Recursive call to the previous receive items
-                               lastRecvItem->wait();
-                             }
-                             std::vector<char> data;
-                             bootstrap()->recv(data, remoteRank, tag);
-                             return RegisteredMemory::deserialize(data);
-                           });
-  auto shared_future = std::shared_future<RegisteredMemory>(std::move(future));
-  pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<RegisteredMemory>>(shared_future));
-  return shared_future;
+  return makeOrderedRecvFuture<RegisteredMemory>(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag]() {
+    std::vector<char> data;
+    bootstrap()->recv(data, remoteRank, tag);
+    return RegisteredMemory::deserialize(data);
+  });
 }
 
 MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpoint& localEndpoint, int remoteRank,
@@ -112,12 +154,8 @@ MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpo
 
   bootstrap()->send(localEndpoint.serialize(), remoteRank, tag);
 
-  auto future = std::async(std::launch::deferred, [this, remoteRank, tag, localEndpoint,
-                                                   lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() mutable {
-    if (lastRecvItem) {
-      // Recursive call to the previous receive items
-      lastRecvItem->wait();
-    }
+  return makeOrderedRecvFuture<Connection>(pimpl_.get(), remoteRank, tag,
+                                           [this, remoteRank, tag, localEndpoint]() mutable {
     std::vector<char> data;
     bootstrap()->recv(data, remoteRank, tag);
     auto remoteEndpoint = Endpoint::deserialize(data);
@@ -125,9 +163,6 @@ MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpo
     pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag};
     return connection;
   });
-  auto shared_future = std::shared_future<Connection>(std::move(future));
-  pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<Connection>>(shared_future));
-  return shared_future;
 }
 
 MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const EndpointConfig& localConfig, int remoteRank,
@@ -141,21 +176,12 @@ MSCCLPP_API_CPP std::shared_future<Semaphore> Communicator::buildSemaphore(const
   SemaphoreStub localStub(connection);
   bootstrap()->send(localStub.serialize(), remoteRank, tag);
 
-  auto future =
-      std::async(std::launch::deferred, [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag),
-                                         localStub = localStub]() mutable {
-        if (lastRecvItem) {
-          // Recursive call to the previous receive items
-          lastRecvItem->wait();
-        }
-        std::vector<char> data;
-        bootstrap()->recv(data, remoteRank, tag);
-        auto remoteStub = SemaphoreStub::deserialize(data);
-        return Semaphore(localStub, remoteStub);
-      });
-  auto shared_future = std::shared_future<Semaphore>(std::move(future));
-  pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<Semaphore>>(shared_future));
-  return shared_future;
+  return makeOrderedRecvFuture<Semaphore>(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag, localStub]() mutable {
+    std::vector<char> data;
+    bootstrap()->recv(data, remoteRank, tag);
+    auto remoteStub = SemaphoreStub::deserialize(data);
+    return Semaphore(localStub, remoteStub);
+  });
 }
 
 MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) {

From 7724e49f316d0059933a7bcfe2ff9f53ac9bc043 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 13 May 2026 03:26:53 +0000
Subject: [PATCH 32/44] Fix lint and ROCm error alias

Agent-Logs-Url: https://github.com/microsoft/mscclpp/sessions/0f0e525d-a69c-4ff7-8913-983243b5cbf7

Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com>
---
 include/mscclpp/gpu.hpp  |  1 +
 src/core/communicator.cc | 48 ++++++++++++++++++++--------------------
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
index b8d096e2b..b289bd4d3 100644
--- a/include/mscclpp/gpu.hpp
+++ b/include/mscclpp/gpu.hpp
@@ -31,6 +31,7 @@ using CUmemorytype = hipMemoryType;
 constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
 constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
 constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice;
+constexpr auto cudaErrorInvalidValue = hipErrorInvalidValue;
 constexpr auto cudaSuccess = hipSuccess;
 constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
 constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index 97fadbbd0..81cd7bbe4 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -31,23 +31,23 @@ ScopeGuard<Fn> makeScopeGuard(Fn fn) {
 template <typename T, typename Impl, typename Fn>
 std::shared_future<T> makeOrderedRecvFuture(Impl* impl, int remoteRank, int tag, Fn fn) {
   auto thisRecvItem = std::make_shared<std::weak_ptr<BaseRecvItem>>();
-  auto future = std::async(std::launch::deferred, [impl, remoteRank, tag, thisRecvItem,
-                                                   lastRecvItem = impl->getLastRecvItem(remoteRank, tag),
-                                                   fn = std::move(fn)]() mutable {
-    [[maybe_unused]] auto cleanup = makeScopeGuard([impl, remoteRank, tag, thisRecvItem]() {
-      auto item = thisRecvItem->lock();
-      auto it = impl->lastRecvItems_.find({remoteRank, tag});
-      if (item && it != impl->lastRecvItems_.end() && it->second == item) {
-        impl->lastRecvItems_.erase(it);
-      }
-    });
-
-    if (lastRecvItem) {
-      // Recursive call to the previous receive items
-      lastRecvItem->wait();
-    }
-    return fn();
-  });
+  auto future = std::async(std::launch::deferred,
+                           [impl, remoteRank, tag, thisRecvItem, lastRecvItem = impl->getLastRecvItem(remoteRank, tag),
+                            fn = std::move(fn)]() mutable {
+                             [[maybe_unused]] auto cleanup = makeScopeGuard([impl, remoteRank, tag, thisRecvItem]() {
+                               auto item = thisRecvItem->lock();
+                               auto it = impl->lastRecvItems_.find({remoteRank, tag});
+                               if (item && it != impl->lastRecvItems_.end() && it->second == item) {
+                                 impl->lastRecvItems_.erase(it);
+                               }
+                             });
+
+                             if (lastRecvItem) {
+                               // Recursive call to the previous receive items
+                               lastRecvItem->wait();
+                             }
+                             return fn();
+                           });
   auto sharedFuture = std::shared_future<T>(std::move(future));
   auto recvItem = std::make_shared<RecvItem<T>>(sharedFuture);
   *thisRecvItem = recvItem;
@@ -156,13 +156,13 @@ MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpo
 
   return makeOrderedRecvFuture<Connection>(pimpl_.get(), remoteRank, tag,
                                            [this, remoteRank, tag, localEndpoint]() mutable {
-    std::vector<char> data;
-    bootstrap()->recv(data, remoteRank, tag);
-    auto remoteEndpoint = Endpoint::deserialize(data);
-    auto connection = context()->connect(localEndpoint, remoteEndpoint);
-    pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag};
-    return connection;
-  });
+                                             std::vector<char> data;
+                                             bootstrap()->recv(data, remoteRank, tag);
+                                             auto remoteEndpoint = Endpoint::deserialize(data);
+                                             auto connection = context()->connect(localEndpoint, remoteEndpoint);
+                                             pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag};
+                                             return connection;
+                                           });
 }
 
 MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const EndpointConfig& localConfig, int remoteRank,

From dbebde2b5801ba54220c0f25f253417be81686c3 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 15 May 2026 22:26:53 +0000
Subject: [PATCH 33/44] Configure IPC domain per communicator

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py               | 10 ++++------
 include/mscclpp/core.hpp                         | 12 ++++++++++++
 include/mscclpp/env.hpp                          |  6 ------
 python/csrc/core_py.cpp                          |  2 ++
 python/mscclpp/_core/comm.py                     |  3 +++
 src/core/communicator.cc                         |  9 +++++++++
 src/core/env.cpp                                 |  4 +---
 src/core/include/communicator.hpp                |  1 +
 src/ext/collectives/collective_utils.cc          | 16 +++++++---------
 src/ext/collectives/include/collective_utils.hpp |  5 ++---
 10 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 44a5c9c10..1243ca91a 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -8,10 +8,6 @@
 
 from mpi4py import MPI
 
-_world_size = MPI.COMM_WORLD.Get_size()
-if _world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ:
-    os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_world_size)
-
 import torch
 import mscclpp
 import mscclpp.ext
@@ -101,8 +97,10 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.rank = comm.my_rank
         self.world_size = comm.nranks
         self.nranks_per_node = comm.nranks_per_node
-        nvlink_domain_nranks = int(os.environ.get("MSCCLPP_IPC_DOMAIN_NRANKS", "0"))
-        self.multi_host_mnnvl = nvlink_domain_nranks >= self.world_size and self.world_size > self.nranks_per_node
+        if comm.communicator.get_ipc_domain_n_ranks() == 0 and self.world_size > 1:
+            comm.communicator.set_ipc_domain_n_ranks(self.world_size)
+        self.ipc_domain_n_ranks = comm.communicator.get_ipc_domain_n_ranks()
+        self.multi_host_mnnvl = self.ipc_domain_n_ranks >= self.world_size and self.world_size > self.nranks_per_node
         self.symmetric_memory = symmetric_memory
         self._nvls = mscclpp.is_nvls_supported()
 
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 45b56bcc0..481f1d3c5 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -821,6 +821,18 @@ class Communicator {
   /// @return The context held by this communicator.
   std::shared_ptr<Context> context();
 
+  /// Set the IPC-domain rank count for collective algorithms using this communicator.
+  ///
+  /// The value describes how many ranks are in one GPU-IPC-reachable peer group, such as a Multi-Node NVLink
+  /// fabric. Set to 0 to use the default `bootstrap()->getNranksPerNode()` value.
+  ///
+  /// @param ipcDomainNranks Number of ranks in the communicator's IPC domain, or 0 to use the default.
+  void setIpcDomainNranks(int ipcDomainNranks);
+
+  /// Get the IPC-domain rank count override for this communicator.
+  /// @return The configured IPC-domain rank count, or 0 if the communicator uses `bootstrap()->getNranksPerNode()`.
+  int getIpcDomainNranks() const;
+
   /// Register a region of GPU memory for use in this communicator's context.
   ///
   /// @param ptr Base pointer to the memory.
diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index 0dd63ed74..a6dd306b6 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -119,12 +119,6 @@ class Env {
   /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
   const int ibGidIndex;
 
-  /// Env name: `MSCCLPP_IPC_DOMAIN_NRANKS`. Number of ranks that share a single GPU-IPC-reachable peer
-  /// group (e.g. a Multi-Node NVLink fabric such as GB200 NVL72, or an AMD XGMI domain). This hint is
-  /// consumed only by the collective algorithms; it does not affect `Bootstrap::getNranksPerNode()` or
-  /// any other layer. If unset or non-positive, algorithms fall back to `bootstrap->getNranksPerNode()`.
-  const int ipcDomainNranks;
-
  private:
   Env();
 
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp
index a94f9863a..d748c6a00 100644
--- a/python/csrc/core_py.cpp
+++ b/python/csrc/core_py.cpp
@@ -282,6 +282,8 @@ void register_core(nb::module_& m) {
            nb::arg("context") = nullptr)
       .def("bootstrap", &Communicator::bootstrap)
       .def("context", &Communicator::context)
+      .def("set_ipc_domain_n_ranks", &Communicator::setIpcDomainNranks, nb::arg("n_ranks"))
+      .def("get_ipc_domain_n_ranks", &Communicator::getIpcDomainNranks)
       .def(
           "register_memory",
           [](Communicator* self, uintptr_t ptr, size_t size, TransportFlags transports) {
diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py
index d42349ddb..f1940eae7 100644
--- a/python/mscclpp/_core/comm.py
+++ b/python/mscclpp/_core/comm.py
@@ -35,6 +35,7 @@ def __init__(
         interfaceIpPortTrio: str = "",
         rank: int = None,
         size: int = None,
+        ipc_domain_n_ranks: int = 0,
     ):
         if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
             uniq_id = None
@@ -70,9 +71,11 @@ def __init__(
         else:
             raise RuntimeError("Either the interface or mpi_group need to be specified")
         self.communicator = CppCommunicator(self.bootstrap)
+        self.communicator.set_ipc_domain_n_ranks(ipc_domain_n_ranks)
         self.my_rank = self.bootstrap.get_rank()
         self.nranks = self.bootstrap.get_n_ranks()
         self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
+        self.ipc_domain_n_ranks = self.communicator.get_ipc_domain_n_ranks()
 
     def barrier(self):
         self.bootstrap.barrier()
diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index 1ca029d67..2272175e7 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -81,6 +81,15 @@ MSCCLPP_API_CPP std::shared_ptr<Bootstrap> Communicator::bootstrap() { return pi
 
 MSCCLPP_API_CPP std::shared_ptr<Context> Communicator::context() { return pimpl_->context_; }
 
+MSCCLPP_API_CPP void Communicator::setIpcDomainNranks(int ipcDomainNranks) {
+  if (ipcDomainNranks < 0) {
+    throw Error("ipcDomainNranks must be non-negative", ErrorCode::InvalidUsage);
+  }
+  pimpl_->ipcDomainNranks_ = ipcDomainNranks;
+}
+
+MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const { return pimpl_->ipcDomainNranks_; }
+
 MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) {
   return context()->registerMemory(ptr, size, transports);
 }
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 18d548b02..7a42471bf 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -67,8 +67,7 @@ Env::Env()
       ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
       forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
       forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
-      ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)),
-      ipcDomainNranks(readEnv<int>("MSCCLPP_IPC_DOMAIN_NRANKS", 0)) {}
+      ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -98,7 +97,6 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
     logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
     logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
-    logEnv("MSCCLPP_IPC_DOMAIN_NRANKS", globalEnv->ipcDomainNranks);
   }
   return globalEnv;
 }
diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp
index f15e20f74..b9f519b9b 100644
--- a/src/core/include/communicator.hpp
+++ b/src/core/include/communicator.hpp
@@ -60,6 +60,7 @@ struct Communicator::Impl {
   std::shared_ptr<Bootstrap> bootstrap_;
   std::shared_ptr<Context> context_;
   std::unordered_map<const BaseConnection*, ConnectionInfo> connectionInfos_;
+  int ipcDomainNranks_ = 0;
 
   // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair.
   // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready.
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index 6acfd7ce0..192fac8d3 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -6,7 +6,6 @@
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
 #include <mscclpp/core.hpp>
-#include <mscclpp/env.hpp>
 #include <mscclpp/errors.hpp>
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
@@ -73,23 +72,22 @@ std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> setupMemoryS
   return memorySemaphores;
 }
 
-int getIpcDomainNranks(std::shared_ptr<mscclpp::Communicator> comm) {
-  const int envValue = mscclpp::env()->ipcDomainNranks;
-  const int ipcDomainNranks = (envValue > 0) ? envValue : comm->bootstrap()->getNranksPerNode();
+int getIpcDomainNranks(std::shared_ptr<Communicator> comm) {
+  const int commValue = comm->getIpcDomainNranks();
+  const int ipcDomainNranks = (commValue > 0) ? commValue : comm->bootstrap()->getNranksPerNode();
   const int worldSize = comm->bootstrap()->getNranks();
   const int rank = comm->bootstrap()->getRank();
   if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) {
-    THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "ipcDomainNranks ",
-          ipcDomainNranks, " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]");
+    THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "ipcDomainNranks ", ipcDomainNranks,
+          " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]");
   }
   if (worldSize != ipcDomainNranks) {
-    THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+    THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage,
           "requires worldSize == ipcDomainNranks (got worldSize=", worldSize, ", ipcDomainNranks=", ipcDomainNranks,
           ")");
   }
   if (rank < 0 || rank >= ipcDomainNranks) {
-    THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ",
-          ipcDomainNranks, ")");
+    THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ", ipcDomainNranks, ")");
   }
   return ipcDomainNranks;
 }
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 280a63328..217c7f550 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -52,9 +52,8 @@ std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> setupMemorySemaphores
     std::shared_ptr<Communicator> comm, const std::vector<Connection>& connections, int nChannelsPerConnection);
 
 /// Returns the IPC-reachable peer-group size, validated to span the whole communicator and
-/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. Reads `MSCCLPP_IPC_DOMAIN_NRANKS` if set to a
-/// positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws
-/// `Error(InvalidUsage)` on violation.
+/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. Reads the communicator's IPC-domain override
+/// if set; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws `Error(InvalidUsage)` on violation.
 int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
 
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(

From 93b43547cc003784771bbbffdd554abeab75aaad Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 15 May 2026 23:15:40 +0000
Subject: [PATCH 34/44] temp solution

---
 .../customized_comm_with_tuning.py            |  9 ++++-----
 include/mscclpp/core.hpp                      |  4 ++--
 src/core/communicator.cc                      |  6 +++---
 .../allgather/allgather_fullmesh.cu           |  2 +-
 .../allgather/allgather_fullmesh_2.cu         |  2 +-
 .../allreduce/allreduce_allpair_packet.cu     |  2 +-
 .../allreduce/allreduce_fullmesh.cu           |  2 +-
 .../allreduce_nvls_block_pipeline.cu          |  4 ++--
 .../allreduce/allreduce_nvls_packet.cu        |  2 +-
 .../allreduce/allreduce_nvls_warp_pipeline.cu |  4 ++--
 .../allreduce/allreduce_nvls_zero_copy.cu     |  3 +--
 .../collectives/allreduce/allreduce_packet.cu |  2 +-
 .../collectives/allreduce/allreduce_rsag.cu   |  2 +-
 .../allreduce/allreduce_rsag_pipeline.cu      |  2 +-
 .../allreduce/allreduce_rsag_zero_copy.cu     |  2 +-
 src/ext/collectives/collective_utils.cc       | 20 -------------------
 .../collectives/include/collective_utils.hpp  |  5 -----
 17 files changed, 23 insertions(+), 50 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 1243ca91a..d0da8c689 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -97,8 +97,6 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.rank = comm.my_rank
         self.world_size = comm.nranks
         self.nranks_per_node = comm.nranks_per_node
-        if comm.communicator.get_ipc_domain_n_ranks() == 0 and self.world_size > 1:
-            comm.communicator.set_ipc_domain_n_ranks(self.world_size)
         self.ipc_domain_n_ranks = comm.communicator.get_ipc_domain_n_ranks()
         self.multi_host_mnnvl = self.ipc_domain_n_ranks >= self.world_size and self.world_size > self.nranks_per_node
         self.symmetric_memory = symmetric_memory
@@ -433,8 +431,8 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10,
 # -- Bootstrap & main ---------------------------------------------------------
 
 
-def init_dist() -> mscclpp.CommGroup:
-    return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD)
+def init_dist(ipc_domain_n_ranks: int = 0) -> mscclpp.CommGroup:
+    return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD, ipc_domain_n_ranks=ipc_domain_n_ranks)
 
 
 def main():
@@ -447,8 +445,9 @@ def main():
     accum_str = os.environ.get("ACCUM_DTYPE")
     accum_dtype = accum_map.get(accum_str) if accum_str else None
     symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "1") == "1"
+    ipc_domain_n_ranks = int(os.environ.get("IPC_DOMAIN_NRANKS", "0"))
 
-    comm_group = init_dist()
+    comm_group = init_dist(ipc_domain_n_ranks=ipc_domain_n_ranks)
     cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory)
 
     print(
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 481f1d3c5..832323ad6 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -829,8 +829,8 @@ class Communicator {
   /// @param ipcDomainNranks Number of ranks in the communicator's IPC domain, or 0 to use the default.
   void setIpcDomainNranks(int ipcDomainNranks);
 
-  /// Get the IPC-domain rank count override for this communicator.
-  /// @return The configured IPC-domain rank count, or 0 if the communicator uses `bootstrap()->getNranksPerNode()`.
+  /// Get the effective IPC-domain rank count for this communicator.
+  /// @return The configured IPC-domain rank count, or `bootstrap()->getNranksPerNode()` if no override is set.
   int getIpcDomainNranks() const;
 
   /// Register a region of GPU memory for use in this communicator's context.
diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index 2272175e7..9bbbff3be 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -3,8 +3,6 @@
 
 #include "communicator.hpp"
 
-#include <utility>
-
 #include "api.h"
 
 namespace mscclpp {
@@ -88,7 +86,9 @@ MSCCLPP_API_CPP void Communicator::setIpcDomainNranks(int ipcDomainNranks) {
   pimpl_->ipcDomainNranks_ = ipcDomainNranks;
 }
 
-MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const { return pimpl_->ipcDomainNranks_; }
+MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const {
+  return (pimpl_->ipcDomainNranks_ > 0) ? pimpl_->ipcDomainNranks_ : pimpl_->bootstrap_->getNranksPerNode();
+}
 
 MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) {
   return context()->registerMemory(ptr, size, transports);
diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index a4196c6cd..8b5cf3b70 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -148,7 +148,7 @@ std::shared_ptr<void> AllgatherFullmesh::initAllgatherContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   // setup semaphores
   ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
index 6e69f81ca..de9d93840 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
@@ -159,7 +159,7 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   // setup semaphores
   ctx->memorySemaphores = this->memorySemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 5be2f3360..6c4f972f2 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -140,7 +140,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index ef7ecf74d..a54270703 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -250,7 +250,7 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   // setup semaphores
   ctx->memorySemaphores = this->outputSemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 9d3316e4c..07418f744 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -177,7 +177,7 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  ipcDomainNranks_ = getIpcDomainNranks(comm);
+  ipcDomainNranks_ = comm->getIpcDomainNranks();
   // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel).
   nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_);
   this->conns_ = setupConnections(comm);
@@ -224,7 +224,7 @@ std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index 56455b6ea..cb9ad17eb 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -95,7 +95,7 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   // setup channels
   ctx->switchChannels = this->switchChannels_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 73ecdab9d..a06692947 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -141,7 +141,7 @@ struct NvlsWarpPipelineAdapter {
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
-  ipcDomainNranks_ = getIpcDomainNranks(comm);
+  ipcDomainNranks_ = comm->getIpcDomainNranks();
   // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks.
   nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_);
   this->conns_ = setupConnections(comm);
@@ -188,7 +188,7 @@ std::shared_ptr<void> AllreduceNvlsWarpPipeline::initAllreduceContext(std::share
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 63cbd057d..36095e73c 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -99,7 +99,6 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
   MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
   computeCapabilityMajor_ = deviceProp.major;
   nSwitchChannels_ = 32;
-  getIpcDomainNranks(comm);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -177,7 +176,7 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index 7bc9a85f1..f88389dca 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -263,7 +263,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index 4dcceb48e..43ff56106 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -203,7 +203,7 @@ std::shared_ptr<void> AllreduceRsAg::initAllreduceContext(std::shared_ptr<Commun
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
index 9f63e5905..1e59c7e45 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
@@ -306,7 +306,7 @@ std::shared_ptr<void> AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode();
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index c678c2670..f8d612793 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -200,7 +200,7 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = getIpcDomainNranks(comm);
+  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
 
   ctx->memorySemaphores = this->semaphores_;
 
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index 192fac8d3..c3856a88e 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -72,26 +72,6 @@ std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> setupMemoryS
   return memorySemaphores;
 }
 
-int getIpcDomainNranks(std::shared_ptr<Communicator> comm) {
-  const int commValue = comm->getIpcDomainNranks();
-  const int ipcDomainNranks = (commValue > 0) ? commValue : comm->bootstrap()->getNranksPerNode();
-  const int worldSize = comm->bootstrap()->getNranks();
-  const int rank = comm->bootstrap()->getRank();
-  if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) {
-    THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "ipcDomainNranks ", ipcDomainNranks,
-          " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]");
-  }
-  if (worldSize != ipcDomainNranks) {
-    THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage,
-          "requires worldSize == ipcDomainNranks (got worldSize=", worldSize, ", ipcDomainNranks=", ipcDomainNranks,
-          ")");
-  }
-  if (rank < 0 || rank >= ipcDomainNranks) {
-    THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ", ipcDomainNranks, ")");
-  }
-  return ipcDomainNranks;
-}
-
 std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<mscclpp::MemoryChannel>& memoryChannels) {
   std::vector<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> memoryChannelDeviceHandles;
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 217c7f550..c1cad4121 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -51,11 +51,6 @@ std::vector<Connection> setupConnections(std::shared_ptr<Communicator> comm);
 std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> setupMemorySemaphores(
     std::shared_ptr<Communicator> comm, const std::vector<Connection>& connections, int nChannelsPerConnection);
 
-/// Returns the IPC-reachable peer-group size, validated to span the whole communicator and
-/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. Reads the communicator's IPC-domain override
-/// if set; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws `Error(InvalidUsage)` on violation.
-int getIpcDomainNranks(std::shared_ptr<Communicator> comm);
-
 std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
     const std::vector<MemoryChannel>& memoryChannels);
 

From 0744e806fc1f0d9cd8c33231ef1517fe42348395 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sat, 16 May 2026 00:39:49 +0000
Subject: [PATCH 35/44] detect ipc domain automatically

---
 CMakeLists.txt                                |  1 +
 .../customized_comm_with_tuning.py            |  9 +--
 include/mscclpp/core.hpp                      | 19 ++---
 python/csrc/core_py.cpp                       |  3 +-
 python/mscclpp/_core/comm.py                  |  4 +-
 src/core/bootstrap/bootstrap.cc               | 23 ++++++
 src/core/communicator.cc                      | 11 ---
 src/core/include/communicator.hpp             |  2 -
 src/core/include/utils_internal.hpp           |  1 +
 src/core/utils_internal.cc                    | 77 +++++++++++++++++++
 .../allgather/allgather_fullmesh.cu           |  2 +-
 .../allgather/allgather_fullmesh_2.cu         |  2 +-
 .../allreduce/allreduce_allpair_packet.cu     |  2 +-
 .../allreduce/allreduce_fullmesh.cu           |  2 +-
 .../allreduce_nvls_block_pipeline.cu          |  4 +-
 .../allreduce/allreduce_nvls_packet.cu        |  2 +-
 .../allreduce/allreduce_nvls_warp_pipeline.cu |  4 +-
 .../allreduce/allreduce_nvls_zero_copy.cu     |  2 +-
 .../collectives/allreduce/allreduce_packet.cu |  2 +-
 .../collectives/allreduce/allreduce_rsag.cu   |  2 +-
 .../allreduce/allreduce_rsag_pipeline.cu      |  2 +-
 .../allreduce/allreduce_rsag_zero_copy.cu     |  2 +-
 test/mp_unit/bootstrap_tests.cc               |  1 +
 23 files changed, 130 insertions(+), 49 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49154e0b0..3f9bf8e07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -206,6 +206,7 @@ if(MSCCLPP_USE_CUDA)
     else()
         set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
     endif()
+    list(APPEND GPU_LIBRARIES CUDA::nvml)
 else()
     set(CMAKE_HIP_STANDARD 17)
     set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index d0da8c689..6cef88feb 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -97,7 +97,7 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.rank = comm.my_rank
         self.world_size = comm.nranks
         self.nranks_per_node = comm.nranks_per_node
-        self.ipc_domain_n_ranks = comm.communicator.get_ipc_domain_n_ranks()
+        self.ipc_domain_n_ranks = comm.ipc_domain_n_ranks
         self.multi_host_mnnvl = self.ipc_domain_n_ranks >= self.world_size and self.world_size > self.nranks_per_node
         self.symmetric_memory = symmetric_memory
         self._nvls = mscclpp.is_nvls_supported()
@@ -431,8 +431,8 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10,
 # -- Bootstrap & main ---------------------------------------------------------
 
 
-def init_dist(ipc_domain_n_ranks: int = 0) -> mscclpp.CommGroup:
-    return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD, ipc_domain_n_ranks=ipc_domain_n_ranks)
+def init_dist() -> mscclpp.CommGroup:
+    return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD)
 
 
 def main():
@@ -445,9 +445,8 @@ def main():
     accum_str = os.environ.get("ACCUM_DTYPE")
     accum_dtype = accum_map.get(accum_str) if accum_str else None
     symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "1") == "1"
-    ipc_domain_n_ranks = int(os.environ.get("IPC_DOMAIN_NRANKS", "0"))
 
-    comm_group = init_dist(ipc_domain_n_ranks=ipc_domain_n_ranks)
+    comm_group = init_dist()
     cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory)
 
     print(
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 832323ad6..4c14f1eec 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -46,6 +46,10 @@ class Bootstrap {
   /// @return The total number of ranks per node.
   virtual int getNranksPerNode() const = 0;
 
+  /// Return the number of ranks in this rank's GPU IPC domain.
+  /// @return The number of ranks in the GPU IPC domain.
+  virtual int getNranksPerIpcDomain() const;
+
   /// Send arbitrary data to another process.
   ///
   /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size,
@@ -144,6 +148,9 @@ class TcpBootstrap : public Bootstrap {
   /// Return the total number of ranks per node.
   int getNranksPerNode() const override;
 
+  /// Return the number of ranks in this rank's GPU IPC domain.
+  int getNranksPerIpcDomain() const override;
+
   /// Send arbitrary data to another process.
   ///
   /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size,
@@ -821,18 +828,6 @@ class Communicator {
   /// @return The context held by this communicator.
   std::shared_ptr<Context> context();
 
-  /// Set the IPC-domain rank count for collective algorithms using this communicator.
-  ///
-  /// The value describes how many ranks are in one GPU-IPC-reachable peer group, such as a Multi-Node NVLink
-  /// fabric. Set to 0 to use the default `bootstrap()->getNranksPerNode()` value.
-  ///
-  /// @param ipcDomainNranks Number of ranks in the communicator's IPC domain, or 0 to use the default.
-  void setIpcDomainNranks(int ipcDomainNranks);
-
-  /// Get the effective IPC-domain rank count for this communicator.
-  /// @return The configured IPC-domain rank count, or `bootstrap()->getNranksPerNode()` if no override is set.
-  int getIpcDomainNranks() const;
-
   /// Register a region of GPU memory for use in this communicator's context.
   ///
   /// @param ptr Base pointer to the memory.
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp
index d748c6a00..7e9af6c1f 100644
--- a/python/csrc/core_py.cpp
+++ b/python/csrc/core_py.cpp
@@ -56,6 +56,7 @@ void register_core(nb::module_& m) {
       .def("get_rank", &Bootstrap::getRank)
       .def("get_n_ranks", &Bootstrap::getNranks)
       .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
+      .def("get_n_ranks_per_ipc_domain", &Bootstrap::getNranksPerIpcDomain)
       .def(
           "send",
           [](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) {
@@ -282,8 +283,6 @@ void register_core(nb::module_& m) {
            nb::arg("context") = nullptr)
       .def("bootstrap", &Communicator::bootstrap)
       .def("context", &Communicator::context)
-      .def("set_ipc_domain_n_ranks", &Communicator::setIpcDomainNranks, nb::arg("n_ranks"))
-      .def("get_ipc_domain_n_ranks", &Communicator::getIpcDomainNranks)
       .def(
           "register_memory",
           [](Communicator* self, uintptr_t ptr, size_t size, TransportFlags transports) {
diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py
index f1940eae7..875e07f18 100644
--- a/python/mscclpp/_core/comm.py
+++ b/python/mscclpp/_core/comm.py
@@ -35,7 +35,6 @@ def __init__(
         interfaceIpPortTrio: str = "",
         rank: int = None,
         size: int = None,
-        ipc_domain_n_ranks: int = 0,
     ):
         if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
             uniq_id = None
@@ -71,11 +70,10 @@ def __init__(
         else:
             raise RuntimeError("Either the interface or mpi_group need to be specified")
         self.communicator = CppCommunicator(self.bootstrap)
-        self.communicator.set_ipc_domain_n_ranks(ipc_domain_n_ranks)
         self.my_rank = self.bootstrap.get_rank()
         self.nranks = self.bootstrap.get_n_ranks()
         self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
-        self.ipc_domain_n_ranks = self.communicator.get_ipc_domain_n_ranks()
+        self.ipc_domain_n_ranks = self.bootstrap.get_n_ranks_per_ipc_domain()
 
     def barrier(self):
         self.bootstrap.barrier()
diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc
index b3032e502..a58357519 100644
--- a/src/core/bootstrap/bootstrap.cc
+++ b/src/core/bootstrap/bootstrap.cc
@@ -50,6 +50,8 @@ MSCCLPP_API_CPP void Bootstrap::groupBarrier(const std::vector<int>& ranks) {
   }
 }
 
+MSCCLPP_API_CPP int Bootstrap::getNranksPerIpcDomain() const { return getNranksPerNode(); }
+
 MSCCLPP_API_CPP void Bootstrap::send(const std::vector<char>& data, int peer, int tag) {
   size_t size = data.size();
   send((void*)&size, sizeof(size_t), peer, tag);
@@ -83,6 +85,7 @@ class TcpBootstrap::Impl {
   int getRank();
   int getNranks();
   int getNranksPerNode();
+  int getNranksPerIpcDomain();
   void allGather(void* allData, int size);
   void broadcast(void* data, int size, int root);
   void send(void* data, int size, int peer, int tag);
@@ -95,6 +98,7 @@ class TcpBootstrap::Impl {
   int rank_;
   int nRanks_;
   int nRanksPerNode_;
+  int nRanksPerIpcDomain_;
   bool netInitialized;
   std::unique_ptr<Socket> listenSockRoot_;
   std::unique_ptr<Socket> listenSock_;
@@ -148,6 +152,7 @@ TcpBootstrap::Impl::Impl(int rank, int nRanks)
     : rank_(rank),
       nRanks_(nRanks),
       nRanksPerNode_(0),
+      nRanksPerIpcDomain_(0),
       netInitialized(false),
       peerCommAddresses_(nRanks, SocketAddress()),
       barrierArr_(nRanks, 0),
@@ -451,6 +456,22 @@ int TcpBootstrap::Impl::getNranksPerNode() {
   return nRanksPerNode_;
 }
 
+int TcpBootstrap::Impl::getNranksPerIpcDomain() {
+  if (nRanksPerIpcDomain_ > 0) return nRanksPerIpcDomain_;
+  std::vector<uint64_t> ipcDomainHashes(nRanks_);
+  ipcDomainHashes[rank_] = getIpcDomainHash();
+  allGather(ipcDomainHashes.data(), sizeof(uint64_t));
+
+  int nRanksPerIpcDomain = 0;
+  for (int i = 0; i < nRanks_; ++i) {
+    if (ipcDomainHashes[i] == ipcDomainHashes[rank_]) {
+      ++nRanksPerIpcDomain;
+    }
+  }
+  nRanksPerIpcDomain_ = nRanksPerIpcDomain;
+  return nRanksPerIpcDomain_;
+}
+
 void TcpBootstrap::Impl::allGather(void* allData, int size) {
   char* data = static_cast<char*>(allData);
   int rank = rank_;
@@ -592,6 +613,8 @@ MSCCLPP_API_CPP int TcpBootstrap::getNranks() const { return pimpl_->getNranks()
 
 MSCCLPP_API_CPP int TcpBootstrap::getNranksPerNode() const { return pimpl_->getNranksPerNode(); }
 
+MSCCLPP_API_CPP int TcpBootstrap::getNranksPerIpcDomain() const { return pimpl_->getNranksPerIpcDomain(); }
+
 MSCCLPP_API_CPP void TcpBootstrap::send(void* data, int size, int peer, int tag) {
   pimpl_->send(data, size, peer, tag);
 }
diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index 9bbbff3be..41e46bc50 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -79,17 +79,6 @@ MSCCLPP_API_CPP std::shared_ptr<Bootstrap> Communicator::bootstrap() { return pi
 
 MSCCLPP_API_CPP std::shared_ptr<Context> Communicator::context() { return pimpl_->context_; }
 
-MSCCLPP_API_CPP void Communicator::setIpcDomainNranks(int ipcDomainNranks) {
-  if (ipcDomainNranks < 0) {
-    throw Error("ipcDomainNranks must be non-negative", ErrorCode::InvalidUsage);
-  }
-  pimpl_->ipcDomainNranks_ = ipcDomainNranks;
-}
-
-MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const {
-  return (pimpl_->ipcDomainNranks_ > 0) ? pimpl_->ipcDomainNranks_ : pimpl_->bootstrap_->getNranksPerNode();
-}
-
 MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) {
   return context()->registerMemory(ptr, size, transports);
 }
diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp
index b9f519b9b..333cc9823 100644
--- a/src/core/include/communicator.hpp
+++ b/src/core/include/communicator.hpp
@@ -60,8 +60,6 @@ struct Communicator::Impl {
   std::shared_ptr<Bootstrap> bootstrap_;
   std::shared_ptr<Context> context_;
   std::unordered_map<const BaseConnection*, ConnectionInfo> connectionInfos_;
-  int ipcDomainNranks_ = 0;
-
   // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair.
   // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready.
   std::unordered_map<std::pair<int, int>, std::shared_ptr<BaseRecvItem>, PairHash> lastRecvItems_;
diff --git a/src/core/include/utils_internal.hpp b/src/core/include/utils_internal.hpp
index c5c67e26c..c6934194d 100644
--- a/src/core/include/utils_internal.hpp
+++ b/src/core/include/utils_internal.hpp
@@ -37,6 +37,7 @@ int64_t busIdToInt64(const std::string busId);
 uint64_t getHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
+uint64_t getIpcDomainHash();
 void getRandomData(void* buffer, size_t bytes);
 
 struct netIf {
diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc
index 8cc554301..2e620b660 100644
--- a/src/core/utils_internal.cc
+++ b/src/core/utils_internal.cc
@@ -6,6 +6,10 @@
 #include <signal.h>
 #include <unistd.h>
 
+#if defined(MSCCLPP_USE_CUDA)
+#include <nvml.h>
+#endif
+
 #include <cstring>
 #include <fstream>
 #include <iostream>
@@ -175,6 +179,79 @@ uint64_t getPidHash(void) {
   return *pidHash;
 }
 
+#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN)
+namespace {
+
+class NvmlState {
+ public:
+  NvmlState() : initialized_(nvmlInit_v2() == NVML_SUCCESS) {}
+
+  ~NvmlState() {
+    if (initialized_) {
+      (void)nvmlShutdown();
+    }
+  }
+
+  bool isInitialized() const { return initialized_; }
+
+ private:
+  bool initialized_ = false;
+};
+
+uint64_t getFabricHash(const nvmlGpuFabricInfo_t& fabricInfo) {
+  char hashData[NVML_GPU_FABRIC_UUID_LEN + sizeof(fabricInfo.cliqueId)];
+  std::memcpy(hashData, fabricInfo.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+  std::memcpy(hashData + NVML_GPU_FABRIC_UUID_LEN, &fabricInfo.cliqueId, sizeof(fabricInfo.cliqueId));
+  return getHash(hashData, sizeof(hashData));
+}
+
+bool tryGetNvmlIpcDomainHash(uint64_t& ipcDomainHash) {
+  // Use the current CUDA device; callers must set the rank's device before querying.
+  int deviceId;
+  if (cudaGetDevice(&deviceId) != cudaSuccess) {
+    return false;
+  }
+
+  char pciBusId[] = "00000000:00:00.0";
+  if (cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) {
+    return false;
+  }
+
+  static NvmlState nvml;
+  if (!nvml.isInitialized()) {
+    return false;
+  }
+
+  nvmlDevice_t nvmlDevice;
+  if (nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS) {
+    return false;
+  }
+
+  nvmlGpuFabricInfo_t fabricInfo = {};
+  if (nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS) {
+    return false;
+  }
+  if (fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) {
+    return false;
+  }
+
+  ipcDomainHash = getFabricHash(fabricInfo);
+  return true;
+}
+
+}  // namespace
+#endif
+
+uint64_t getIpcDomainHash(void) {
+#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN)
+  uint64_t ipcDomainHash;
+  if (tryGetNvmlIpcDomainHash(ipcDomainHash)) {
+    return ipcDomainHash;
+  }
+#endif
+  return getHostHash();
+}
+
 int parseStringList(const char* string, netIf* ifList, int maxList) {
   if (!string) return 0;
 
diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index 8b5cf3b70..84dd4d473 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -148,7 +148,7 @@ std::shared_ptr<void> AllgatherFullmesh::initAllgatherContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
index de9d93840..5a353922f 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
@@ -159,7 +159,7 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = this->memorySemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 6c4f972f2..29ef2055b 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -140,7 +140,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index a54270703..b158f817c 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -250,7 +250,7 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = this->outputSemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 07418f744..890e50f58 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -177,7 +177,7 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  ipcDomainNranks_ = comm->getIpcDomainNranks();
+  ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain();
   // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel).
   nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_);
   this->conns_ = setupConnections(comm);
@@ -224,7 +224,7 @@ std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index cb9ad17eb..e8ecfb737 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -95,7 +95,7 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels = this->switchChannels_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index a06692947..68efc2ab0 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -141,7 +141,7 @@ struct NvlsWarpPipelineAdapter {
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
-  ipcDomainNranks_ = comm->getIpcDomainNranks();
+  ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain();
   // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks.
   nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_);
   this->conns_ = setupConnections(comm);
@@ -188,7 +188,7 @@ std::shared_ptr<void> AllreduceNvlsWarpPipeline::initAllreduceContext(std::share
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 36095e73c..a6f699b2e 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -176,7 +176,7 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index f88389dca..a0bc0e26e 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -263,7 +263,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index 43ff56106..22e3a4ee4 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -203,7 +203,7 @@ std::shared_ptr<void> AllreduceRsAg::initAllreduceContext(std::shared_ptr<Commun
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
index 1e59c7e45..bedf15c50 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
@@ -306,7 +306,7 @@ std::shared_ptr<void> AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index f8d612793..10d3a35c2 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -200,7 +200,7 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->getIpcDomainNranks();
+  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->semaphores_;
 
diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc
index c28087a45..eb6985a8e 100644
--- a/test/mp_unit/bootstrap_tests.cc
+++ b/test/mp_unit/bootstrap_tests.cc
@@ -127,6 +127,7 @@ class MPIBootstrap : public mscclpp::Bootstrap {
     MPI_Comm_size(shmcomm, &shmrank);
     return shmrank;
   }
+  int getNranksPerIpcDomain() const override { return getNranksPerNode(); }  // IPC domain == node here
   void allGather(void* sendbuf, int size) override {
     MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD);
   }

From 94af88d88d0a648411baf319f7449db1efaa592c Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sat, 16 May 2026 01:24:56 +0000
Subject: [PATCH 36/44] Fix tuning example hang

Avoid probing invalid packet allreduce configurations and reduce the default tuning sweep so the 8-rank tuning example completes reliably.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../customized_comm_with_tuning.py            | 33 ++++++++++++-------
 .../collectives/allreduce/allreduce_packet.cu |  7 +++-
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 6cef88feb..0a07ca325 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -54,12 +54,16 @@ def _round_pow2(size: int) -> int:
 # -- CustomizedComm -----------------------------------------------------------
 
 
+def _env_int(name: str, default: int) -> int:
+    return int(os.environ.get(name, default))
+
+
 class CustomizedComm:
     """Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
 
-    _TUNE_N_WARMUP = 5
-    _TUNE_N_GRAPH_LAUNCHES = 10
-    _TUNE_N_OPS_PER_GRAPH = 100
+    _TUNE_N_WARMUP = _env_int("TUNE_N_WARMUP", 2)
+    _TUNE_N_GRAPH_LAUNCHES = _env_int("TUNE_N_GRAPH_LAUNCHES", 3)
+    _TUNE_N_OPS_PER_GRAPH = _env_int("TUNE_N_OPS_PER_GRAPH", 20)
     _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 112, 128]
     _CANDIDATE_NTHREADS = [512, 768, 1024]
     _NBLOCKS_LIMIT = {
@@ -78,16 +82,16 @@ class CustomizedComm:
     _AR_CANDIDATES_MNNVL = [
         ("default_allreduce_allpair_packet", 0, 128 << 10, None),
         ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls),
-        ("default_allreduce_packet", 128 << 10, 4 << 20, None),
+        ("default_allreduce_packet", 128 << 10, 512 << 10, None),
         ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory),
         ("default_allreduce_rsag_zero_copy", 512 << 10, None, None),
         ("default_allreduce_rsag", 512 << 10, None, None),
     ]
     _AR_CANDIDATES_SINGLE = [
-        ("default_allreduce_packet", 0, 4 << 20, None),
-        ("default_allreduce_allpair_packet", 0, 4 << 20, None),
-        ("default_allreduce_nvls_packet", 0, 4 << 20, lambda c: c._nvls),
-        ("default_allreduce_rsag_zero_copy", 512 << 10, None, None),
+        ("default_allreduce_packet", 0, 512 << 10, None),
+        ("default_allreduce_allpair_packet", 0, 128 << 10, None),
+        ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls),
+        ("default_allreduce_rsag_zero_copy", 512 << 10, None, lambda c: not (c._nvls and c.symmetric_memory)),
         ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory),
         ("default_allreduce_fullmesh", 0, None, lambda c: torch.version.hip is not None),
     ]
@@ -224,6 +228,11 @@ def _run_tune(self, collective, algo, buf, size, nb, nt):
                 symmetric_memory=False,
             )
 
+    def _is_tune_config_supported(self, algo, nb, nt):
+        if algo.name in ("default_allreduce_packet", "default_allreduce_allpair_packet"):
+            return nb >= self.world_size - 1 and nt in (512, 1024)
+        return True
+
     def _tune_size(self, collective: str, target_size: int):
         """Auto-tune one (collective, target_size) pair and cache result."""
         buf = self._ensure_tune_bufs()
@@ -239,13 +248,15 @@ def _tune_size(self, collective: str, target_size: int):
                 if nb > nb_limit:
                     continue
                 for nt in self._CANDIDATE_NTHREADS:
+                    if not self._is_tune_config_supported(algo, nb, nt):
+                        continue
                     # Feasibility — sync result across ranks so all agree
                     ret = run(algo, nb, nt)
-                    torch.cuda.synchronize()
                     self._time_buf[0] = float(ret)
                     self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=self.symmetric_memory)
                     if self._time_buf[0].item() != 0:
                         continue
+                    torch.cuda.synchronize()
                     used.add(algo)
 
                     # Warmup
@@ -341,7 +352,7 @@ def _bench_sizes(low=5 * 1024, high=80 << 20):
 
 
 def benchmark_allreduce(
-    comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100
+    comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=5, n_graph_launches=5, n_iter=50
 ):
     sizes = _bench_sizes()
     if comm.rank == 0:
@@ -382,7 +393,7 @@ def benchmark_allreduce(
             print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}")
 
 
-def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100):
+def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=5, n_graph_launches=5, n_iter=50):
     sizes = _bench_sizes()
     if comm.rank == 0:
         print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}")
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index a0bc0e26e..801bed626 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -231,7 +231,12 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize, ctx->ipcDomainNranks, dtype);
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->ipcDomainNranks, ctx->workSize, dtype);
+  } else {
+    const int nPeers = ctx->workSize - 1;
+    if (nPeers > 0 && blockAndThreadNum.first < nPeers) {
+      return CommResult::CommInvalidArgument;
+    }
   }
 
   size_t sendBytes;

From f32cfb1fb87be2adce4a33b695ccec43441bf3bc Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sat, 16 May 2026 19:29:18 +0000
Subject: [PATCH 37/44] Simplify tuning example and retune sweep defaults

---
 .../customized_comm_with_tuning.py            | 27 ++++++-------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 0a07ca325..040fda584 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -54,16 +54,12 @@ def _round_pow2(size: int) -> int:
 # -- CustomizedComm -----------------------------------------------------------
 
 
-def _env_int(name: str, default: int) -> int:
-    return int(os.environ.get(name, default))
-
-
 class CustomizedComm:
     """Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
 
-    _TUNE_N_WARMUP = _env_int("TUNE_N_WARMUP", 2)
-    _TUNE_N_GRAPH_LAUNCHES = _env_int("TUNE_N_GRAPH_LAUNCHES", 3)
-    _TUNE_N_OPS_PER_GRAPH = _env_int("TUNE_N_OPS_PER_GRAPH", 20)
+    _TUNE_N_WARMUP = 3
+    _TUNE_N_GRAPH_LAUNCHES = 5
+    _TUNE_N_OPS_PER_GRAPH = 50
     _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 112, 128]
     _CANDIDATE_NTHREADS = [512, 768, 1024]
     _NBLOCKS_LIMIT = {
@@ -88,10 +84,10 @@ class CustomizedComm:
         ("default_allreduce_rsag", 512 << 10, None, None),
     ]
     _AR_CANDIDATES_SINGLE = [
-        ("default_allreduce_packet", 0, 512 << 10, None),
-        ("default_allreduce_allpair_packet", 0, 128 << 10, None),
-        ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls),
-        ("default_allreduce_rsag_zero_copy", 512 << 10, None, lambda c: not (c._nvls and c.symmetric_memory)),
+        ("default_allreduce_packet", 0, 4 << 20, None),
+        ("default_allreduce_allpair_packet", 0, 512 << 10, None),
+        ("default_allreduce_nvls_packet", 0, 512 << 10, lambda c: c._nvls),
+        ("default_allreduce_rsag_zero_copy", 512 << 10, None, None),
         ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory),
         ("default_allreduce_fullmesh", 0, None, lambda c: torch.version.hip is not None),
     ]
@@ -228,11 +224,6 @@ def _run_tune(self, collective, algo, buf, size, nb, nt):
                 symmetric_memory=False,
             )
 
-    def _is_tune_config_supported(self, algo, nb, nt):
-        if algo.name in ("default_allreduce_packet", "default_allreduce_allpair_packet"):
-            return nb >= self.world_size - 1 and nt in (512, 1024)
-        return True
-
     def _tune_size(self, collective: str, target_size: int):
         """Auto-tune one (collective, target_size) pair and cache result."""
         buf = self._ensure_tune_bufs()
@@ -248,15 +239,13 @@ def _tune_size(self, collective: str, target_size: int):
                 if nb > nb_limit:
                     continue
                 for nt in self._CANDIDATE_NTHREADS:
-                    if not self._is_tune_config_supported(algo, nb, nt):
-                        continue
                     # Feasibility — sync result across ranks so all agree
                     ret = run(algo, nb, nt)
+                    torch.cuda.synchronize()
                     self._time_buf[0] = float(ret)
                     self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=self.symmetric_memory)
                     if self._time_buf[0].item() != 0:
                         continue
-                    torch.cuda.synchronize()
                     used.add(algo)
 
                     # Warmup

From 594dc79657bc15cbfa9762d986bb4756726794c2 Mon Sep 17 00:00:00 2001
From: Qinghua Zhou <qinghuazhou@microsoft.com>
Date: Sat, 16 May 2026 23:19:25 +0000
Subject: [PATCH 38/44] Address NVLS review feedback

Handle unsupported FP8 NVLS paths safely, tighten IPC-domain guards, align IPC-domain naming, and add IPC-domain fabric hash logging.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 include/mscclpp/switch_channel_device.hpp     |  6 +-
 src/core/bootstrap/bootstrap.cc               |  2 +
 src/core/include/execution_kernel.hpp         | 10 ++-
 .../allgather/allgather_fullmesh.cu           | 10 +--
 .../allgather/allgather_fullmesh_2.cu         | 12 ++--
 .../allreduce/allreduce_allpair_packet.cu     | 29 +++++----
 .../allreduce/allreduce_fullmesh.cu           | 12 ++--
 .../allreduce_nvls_block_pipeline.cu          | 32 ++++-----
 .../allreduce/allreduce_nvls_packet.cu        |  4 +-
 .../allreduce/allreduce_nvls_warp_pipeline.cu | 30 ++++-----
 .../allreduce/allreduce_nvls_zero_copy.cu     | 33 ++++++----
 .../collectives/allreduce/allreduce_packet.cu | 38 ++++++-----
 .../collectives/allreduce/allreduce_rsag.cu   | 26 ++++----
 .../allreduce/allreduce_rsag_pipeline.cu      | 24 +++----
 .../allreduce/allreduce_rsag_zero_copy.cu     | 18 +++--
 .../allreduce_nvls_block_pipeline.hpp         |  2 +-
 .../allreduce_nvls_warp_pipeline.hpp          |  2 +-
 .../collectives/include/allreduce/common.hpp  | 65 +++++++++++--------
 .../collectives/include/collective_utils.hpp  |  8 +--
 19 files changed, 203 insertions(+), 160 deletions(-)

diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp
index e95dfcf51..df22bd3aa 100644
--- a/include/mscclpp/switch_channel_device.hpp
+++ b/include/mscclpp/switch_channel_device.hpp
@@ -155,7 +155,7 @@ struct SwitchChannelDeviceHandle {
     }
 #endif
     else {
-      assert(false && "Unsupported vector type for multimemLoadReduce");
+      static_assert(dependentFalse<VectorType>, "Unsupported vector type for multimemLoadReduce");
     }
     return val;
   };
@@ -223,7 +223,7 @@ struct SwitchChannelDeviceHandle {
     }
 #endif
     else {
-      assert(false && "Unsupported vector type for multimemStore");
+      static_assert(dependentFalse<VectorType>, "Unsupported vector type for multimemStore");
     }
   };
 
@@ -248,7 +248,7 @@ struct SwitchChannelDeviceHandle {
     } else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __half2>) {
       asm volatile("multimem.red.relaxed.sys.global.add.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory");
     } else {
-      assert(false && "Unsupported vector type for multimemStoreReduce");
+      static_assert(dependentFalse<TValue>, "Unsupported vector type for multimemStoreReduce");
     }
   };
 #endif  // defined(MSCCLPP_DEVICE_CUDA)
diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc
index a58357519..ffdd9c1cc 100644
--- a/src/core/bootstrap/bootstrap.cc
+++ b/src/core/bootstrap/bootstrap.cc
@@ -468,6 +468,8 @@ int TcpBootstrap::Impl::getNranksPerIpcDomain() {
       ++nRanksPerIpcDomain;
     }
   }
+  INFO(MSCCLPP_INIT, "rank %d IPC domain fabric hash 0x%016llx nRanksPerIpcDomain %d", rank_,
+       static_cast<unsigned long long>(ipcDomainHashes[rank_]), nRanksPerIpcDomain);
   nRanksPerIpcDomain_ = nRanksPerIpcDomain;
   return nRanksPerIpcDomain_;
 }
diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp
index cb808bc8c..e9095ada6 100644
--- a/src/core/include/execution_kernel.hpp
+++ b/src/core/include/execution_kernel.hpp
@@ -525,7 +525,15 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3
   if constexpr (std::is_same_v<T, uint8_t>) {
     assert(false && "MULTI_LOAD_REDUCE_STORE is not supported for uint8_t data type");
     return;
-  } else {
+  }
+#if defined(__FP8_TYPES_EXIST__) && \
+    (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000))
+  else if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
+    assert(false && "FP8 MULTI_LOAD_REDUCE_STORE requires sm_100a or newer");
+    return;
+  }
+#endif
+  else {
     static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes");
     const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize);
     if (size <= 0) {
diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index 84dd4d473..570a2d612 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -11,8 +11,8 @@ namespace collective {
 template <bool IsOutOfPlace>
 __global__ void __launch_bounds__(1024, 1)
     allgatherFullmesh(void* buff, void* scratch, void* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
-                      int rank, int ipcDomainNranks, [[maybe_unused]] int worldSize, size_t nelems) {
-  const int nPeer = ipcDomainNranks - 1;
+                      int rank, int nRanksPerIpcDomain, [[maybe_unused]] int worldSize, size_t nelems) {
+  const int nPeer = nRanksPerIpcDomain - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   // assume (nelems * sizeof(T)) is divisible by 16
   const size_t nInt4 = nelems * sizeof(int) / sizeof(int4);
@@ -127,11 +127,11 @@ CommResult AllgatherFullmesh::allgatherKernelFunc(const std::shared_ptr<void> ct
   if ((char*)input == (char*)output + rank * inputSize) {
     allgatherFullmesh<false><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank,
-        ctx->ipcDomainNranks, ctx->workSize, nElem);
+        ctx->nRanksPerIpcDomain, ctx->workSize, nElem);
   } else {
     allgatherFullmesh<true><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank,
-        ctx->ipcDomainNranks, ctx->workSize, nElem);
+        ctx->nRanksPerIpcDomain, ctx->workSize, nElem);
   }
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
@@ -148,7 +148,7 @@ std::shared_ptr<void> AllgatherFullmesh::initAllgatherContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
index 5a353922f..f344824f7 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
@@ -12,15 +12,15 @@ __device__ DeviceSyncer deviceSyncer;
 template <bool IsOutOfPlace>
 __global__ void __launch_bounds__(1024, 1)
     allgatherFullmesh2(void* sendbuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
-                       size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t ipcDomainNranks,
-                       size_t nelemsPerGPU) {
+                       size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize,
+                       size_t nRanksPerIpcDomain, size_t nelemsPerGPU) {
   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   const size_t lid = tid % WARP_SIZE;
   const size_t wid = tid / WARP_SIZE;
 
   const size_t nThread = blockDim.x * gridDim.x;
   const size_t nWarp = nThread / WARP_SIZE;
-  const size_t nPeer = ipcDomainNranks - 1;
+  const size_t nPeer = nRanksPerIpcDomain - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   auto memChans = memoryChannels + chanOffset;
 
@@ -140,11 +140,11 @@ CommResult AllgatherFullmesh2::allgatherKernelFunc(const std::shared_ptr<void> c
   if ((char*)input == (char*)output + rank * inputSize) {
     allgatherFullmesh2<false><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize,
-        ctx->ipcDomainNranks, nElem);
+        ctx->nRanksPerIpcDomain, nElem);
   } else {
     allgatherFullmesh2<true><<<numBlocksAndThreads.first, numBlocksAndThreads.second, 0, stream>>>(
         (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize,
-        ctx->ipcDomainNranks, nElem);
+        ctx->nRanksPerIpcDomain, nElem);
   }
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
@@ -159,7 +159,7 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = this->memorySemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 29ef2055b..47c4f61d9 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -14,11 +14,11 @@ namespace collective {
 
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
-                                  size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks,
+                                  size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain,
                                   int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
                                   uint32_t flagSize) {
   if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
-  const int nPeers = ipcDomainNranks - 1;
+  const int nPeers = nRanksPerIpcDomain - 1;
 
   uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
   size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0;
@@ -72,19 +72,17 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllpairAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
-                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
                           cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0,
                           int nThreadsPerBlock = 0) {
     using ChannelType = DeviceHandle<MemoryChannel>;
     const size_t nelems = inputSize / sizeof(T);
     // Round nBlocks to multiple of nPeers so every block maps to a valid peer.
-    const int nPeers = worldSize - 1;
-    if (nPeers > 0) {
-      nBlocks = (nBlocks / nPeers) * nPeers;
-    }
+    const int nPeers = nRanksPerIpcDomain - 1;
+    if (nPeers > 0) nBlocks = (nBlocks / nPeers) * nPeers;  // no peers: skip to avoid division by zero
     allreduceAllPairs<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        ipcDomainNranks, worldSize, nelems, numScratchBuff, flags, flagSize);
+        nRanksPerIpcDomain, worldSize, nelems, numScratchBuff, flags, flagSize);
     return cudaGetLastError();
   }
 };
@@ -103,13 +101,18 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
                                                        const std::unordered_map<std::string, uintptr_t>&,
                                                        DataType accumDtype) {
   auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
+  if (algoCtx->workSize != algoCtx->nRanksPerIpcDomain) {
+    WARN("AllreduceAllpairPacket requires workSize to match nRanksPerIpcDomain, got workSize=%d, nRanksPerIpcDomain=%d",
+         algoCtx->workSize, algoCtx->nRanksPerIpcDomain);
+    return CommResult::CommInvalidArgument;
+  }
   std::pair<int, int> blockAndThreadNum{nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->nRanksPerIpcDomain);
   }
   // nBlocks must be at least nPeers for allpair — each block maps to one peer.
-  const int nPeers = algoCtx->ipcDomainNranks - 1;
-  if (nPeers > 0 && blockAndThreadNum.first < nPeers) {
+  const int nPeers = algoCtx->nRanksPerIpcDomain - 1;
+  if (blockAndThreadNum.first < nPeers) {
     return CommResult::CommInvalidArgument;
   }
   size_t sendBytes;
@@ -124,7 +127,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr,
-                nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->ipcDomainNranks,
+                nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerIpcDomain,
                 algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_,
                 this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
@@ -140,7 +143,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index b158f817c..2790295e4 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -13,8 +13,8 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(512, 1)
     allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
                       DeviceHandle<MemoryChannel>* memoryOutChannels, size_t channelOutDataOffset, int rank,
-                      int ipcDomainNranks, int worldSize, size_t nelems) {
-  const int nPeer = ipcDomainNranks - 1;
+                      int nRanksPerIpcDomain, int worldSize, size_t nelems) {
+  const int nPeer = nRanksPerIpcDomain - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   // assume (nelems * sizeof(T)) is divisible by (16 * worldSize)
   const size_t nInt4 = nelems * sizeof(T) / sizeof(int4);
@@ -157,7 +157,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceAllconnectAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t,
-                          size_t channelOutDataOffset, size_t, int rank, int ipcDomainNranks, int worldSize,
+                          size_t channelOutDataOffset, size_t, int rank, int nRanksPerIpcDomain, int worldSize,
                           size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks,
                           int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<MemoryChannel>;
@@ -166,7 +166,7 @@ struct AllreduceAllconnectAdapter {
     if (nThreadsPerBlock == 0) nThreadsPerBlock = 512;
     allreduceFullmesh<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels,
-        channelOutDataOffset, rank, ipcDomainNranks, worldSize, nelems);
+        channelOutDataOffset, rank, nRanksPerIpcDomain, worldSize, nelems);
     return cudaGetLastError();
   }
 };
@@ -223,7 +223,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(),
-                nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize,
+                nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize,
                 stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error));
@@ -250,7 +250,7 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup semaphores
   ctx->memorySemaphores = this->outputSemaphores_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 890e50f58..347ce8b41 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -21,15 +21,15 @@ __global__ void __launch_bounds__(1024, 1)
                                [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
                                [[maybe_unused]] DeviceHandle<SwitchChannel>* switchChannels,
                                [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize,
-                               [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) {
+                               [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
   constexpr int alignment = 16;
-  int nPeers = ipcDomainNranks - 1;
-  int nBlocksForCopy = ipcDomainNranks * 2;
-  int nBlocksForReduce = ipcDomainNranks;
+  int nPeers = nRanksPerIpcDomain - 1;
+  int nBlocksForCopy = nRanksPerIpcDomain * 2;
+  int nBlocksForReduce = nRanksPerIpcDomain;
   int copyReduceRatio = nBlocksForCopy / nBlocksForReduce;
-  size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks;
-  size_t sizePerRank = size / ipcDomainNranks;
+  size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain;
+  size_t sizePerRank = size / nRanksPerIpcDomain;
   assert(sizePerRank % alignment == 0);
   uint32_t sizePerBlock =
       ((sizePerRank + (nBlocksForCopy - 1)) / nBlocksForCopy + alignment - 1) / alignment * alignment;
@@ -69,7 +69,7 @@ __global__ void __launch_bounds__(1024, 1)
         deviceSemaphore[bid + 2 * nBlocksForCopy].acquire();
       }
       __syncthreads();
-      for (int i = 0; i < ipcDomainNranks; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t blockOffset = it * unitSize + bid * sizePerBlock + i * sizePerRank;
         uint32_t scratchOffset = scratchIt * unitSize + bid * scratchSizePerBlock + i * scratchSizePerRank;
         char* srcData = (char*)src + blockOffset;
@@ -126,7 +126,7 @@ __global__ void __launch_bounds__(1024, 1)
         channels->wait();
       }
       __syncthreads();
-      for (int i = 0; i < ipcDomainNranks; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t blockOffset = it * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * sizePerBlock + i * sizePerRank;
         uint32_t scratchOffset = scratchIt * unitSize +
                                  (bid - nBlocksForCopy - nBlocksForReduce) * scratchSizePerBlock +
@@ -151,7 +151,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsBlockPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -169,7 +169,7 @@ struct NvlsBlockPipelineAdapter {
         using ChannelType = DeviceHandle<BaseMemoryChannel>;
         allreduceNvlsBlockPipeline<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
             input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank,
-            ipcDomainNranks);
+            nRanksPerIpcDomain);
         return cudaGetLastError();
       }
   }
@@ -177,9 +177,9 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain();
-  // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel).
-  nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_);
+  nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain();
+  // Per-peer channel allocation must hold up to 4 * nRanksPerIpcDomain entries (see kernel).
+  nBaseChannels_ = std::max(64, 4 * nRanksPerIpcDomain_);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -202,11 +202,11 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024};
+    blockAndThreadNum = {ctx->nRanksPerIpcDomain * 5, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
-                                ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0,
+                                ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize, stream, nullptr, 0, 0,
                                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error));
@@ -224,7 +224,7 @@ std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index e8ecfb737..f16e8b05f 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -95,7 +95,7 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels = this->switchChannels_;
@@ -124,7 +124,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr,
-                0, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream,
+                0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize, stream,
                 (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error));
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index 68efc2ab0..ba447d32a 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -19,15 +19,15 @@ __global__ void __launch_bounds__(1024, 1)
                               [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
                               [[maybe_unused]] DeviceHandle<SwitchChannel>* multicast, [[maybe_unused]] size_t size,
                               [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank,
-                              [[maybe_unused]] int ipcDomainNranks) {
+                              [[maybe_unused]] int nRanksPerIpcDomain) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
   constexpr int alignment = 16;
-  int nPeers = ipcDomainNranks - 1;
+  int nPeers = nRanksPerIpcDomain - 1;
   int nBlocks = gridDim.x;
   int nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION;
   int bid = blockIdx.x;
-  size_t sizePerRank = size / ipcDomainNranks;
-  size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks;
+  size_t sizePerRank = size / nRanksPerIpcDomain;
+  size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain;
   const size_t maxSizePerBlock = ((sizePerRank + nBlocks - 1) / nBlocks + alignment - 1) / alignment * alignment;
   size_t start = bid * maxSizePerBlock;
   size_t end = min(start + maxSizePerBlock, sizePerRank);
@@ -54,7 +54,7 @@ __global__ void __launch_bounds__(1024, 1)
     lastIterSize = sizePerBlock % copyPerIter;
   }
 
-  const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x * 2;
+  const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x * 2;
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ DeviceHandle<BaseMemoryChannel> channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2];
   const int lid = threadIdx.x % WARP_SIZE;
@@ -67,7 +67,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t iterSize = (it == nIter - 1) ? lastIterSize : copyPerIter;
     if (warpId < endCopyWid) {
       int tidInCopy = threadIdx.x;
-      for (int i = 0; i < ipcDomainNranks; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter;
         size_t offsetScratch =
             i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock;
@@ -98,7 +98,7 @@ __global__ void __launch_bounds__(1024, 1)
         channels[tidInRecvCopy + nPeers].wait();
       }
       asm volatile("bar.sync %0, %1;" ::"r"(3), "r"((NRECV_COPY_WARPS)*WARP_SIZE) : "memory");
-      for (int i = 0; i < ipcDomainNranks; i++) {
+      for (int i = 0; i < nRanksPerIpcDomain; i++) {
         size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter;
         size_t offsetScratch =
             i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock;
@@ -115,7 +115,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsWarpPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -133,7 +133,7 @@ struct NvlsWarpPipelineAdapter {
         using ChannelType = DeviceHandle<BaseMemoryChannel>;
         allreduceNvlsWarpPipeline<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
             input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank,
-            ipcDomainNranks);
+            nRanksPerIpcDomain);
         return cudaGetLastError();
       }
   }
@@ -141,9 +141,9 @@ struct NvlsWarpPipelineAdapter {
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
-  ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain();
-  // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks.
-  nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_);
+  nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain();
+  // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * nRanksPerIpcDomain.
+  nBaseChannels_ = std::max(64, 8 * nRanksPerIpcDomain_);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -166,11 +166,11 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
   }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024};
+    blockAndThreadNum = {ctx->nRanksPerIpcDomain * 4, 1024};
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                 ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
-                                ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0,
+                                ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize, stream, nullptr, 0, 0,
                                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error));
@@ -188,7 +188,7 @@ std::shared_ptr<void> AllreduceNvlsWarpPipeline::initAllreduceContext(std::share
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   // setup channels
   ctx->switchChannels =
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index a6f699b2e..32fc61423 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -20,12 +20,12 @@ __global__ void __launch_bounds__(1024, 1)
                   [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
                   [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOut,
                   [[maybe_unused]] size_t channelInOffset, [[maybe_unused]] size_t channelOutOffset,
-                  [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) {
+                  [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-  int nPeers = ipcDomainNranks - 1;
+  int nPeers = nRanksPerIpcDomain - 1;
   int nBlocks = gridDim.x;
   int bid = blockIdx.x;
-  size_t sizePerRank = size / ipcDomainNranks;
+  size_t sizePerRank = size / nRanksPerIpcDomain;
   const size_t minAlign = 16;
   // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore
   size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks;
@@ -41,12 +41,12 @@ __global__ void __launch_bounds__(1024, 1)
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOutPtr = multicastOut + bid;
 
-  const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x;
+  const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x;
   auto memoryChans = memoryChannels + chanOffset;
   __shared__ mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel> channels[MAX_IPC_DOMAIN_NRANKS - 1];
   const int lid = threadIdx.x % WARP_SIZE;
   // Peer count may exceed WARP_SIZE on MNNVL.
-  for (int i = lid; i < ipcDomainNranks - 1; i += WARP_SIZE) {
+  for (int i = lid; i < nRanksPerIpcDomain - 1; i += WARP_SIZE) {
     channels[i] = memoryChans[i];
   }
   __syncwarp();
@@ -74,7 +74,7 @@ struct NvlsAdapter {
   static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*,
                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
-                          size_t channelOutOffset, size_t, int rank, int ipcDomainNranks, int, size_t inputSize,
+                          size_t channelOutOffset, size_t, int rank, int nRanksPerIpcDomain, int, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
@@ -86,7 +86,7 @@ struct NvlsAdapter {
       using ChannelType = DeviceHandle<mscclpp::BaseMemoryChannel>;
       allreduceNvls<T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
           (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize,
-          rank, ipcDomainNranks);
+          rank, nRanksPerIpcDomain);
       return cudaGetLastError();
     }
   }
@@ -120,6 +120,13 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
     return CommResult::CommInvalidArgument;
   }
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+#if defined(__FP8_TYPES_EXIST__)
+  bool isFp8Dtype = dtype == mscclpp::DataType::FLOAT8_E4M3FN || dtype == mscclpp::DataType::FLOAT8_E5M2;
+  if (isFp8Dtype && computeCapabilityMajor_ < 10) {
+    WARN("FP8 NVLS allreduce requires compute capability 10.x or newer.");
+    return CommResult::CommInvalidArgument;
+  }
+#endif
   AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
@@ -138,7 +145,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
   if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) {
-    numBlocksAndThreads = {::min(ctx->ipcDomainNranks, MAX_NBLOCKS), 1024};
+    numBlocksAndThreads = {::min(ctx->nRanksPerIpcDomain, MAX_NBLOCKS), 1024};
     // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with
     // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which
     // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS].
@@ -152,9 +159,13 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
   }
   cudaError_t error =
       allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels,
-                nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize,
-                inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
+                nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerIpcDomain,
+                ctx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
+    if (error == cudaErrorNotSupported) {
+      WARN("AllreduceNvls does not support the requested data type.");
+      return CommResult::CommInvalidArgument;
+    }
     WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
   }
@@ -176,7 +187,7 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   size_t sendBytes, recvBytes;
   CUdeviceptr sendBasePtr, recvBasePtr;
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index 801bed626..d20625eea 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -15,7 +15,7 @@ namespace collective {
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
-                    size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize,
+                    size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize,
                     size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff
 #if defined(ENABLE_NPKIT)
                     ,
@@ -53,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1)
   else
     nelems = nelems / (sizeof(int) / sizeof(T));
 
-  const int nPeers = ipcDomainNranks - 1;
+  const int nPeers = nRanksPerIpcDomain - 1;
   const size_t nPkts = nelems / 2;
 
   uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
@@ -154,31 +154,32 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct PacketAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
-                          size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize,
+                          size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
                           cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff,
                           int nBlocks = 0, int nThreadsPerBlock = 0) {
     using ChannelType = DeviceHandle<MemoryChannel>;
     const size_t nelems = inputSize / sizeof(T);
-    // Optimize the number of blocks to be multiple of (worldSize - 1)
-    nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1);
+    // Round nBlocks down to a multiple of the IPC-domain peer count; skip when there are no peers.
+    const int nPeers = nRanksPerIpcDomain - 1;
+    if (nPeers > 0) nBlocks = nBlocks / nPeers * nPeers;
 #if defined(ENABLE_NPKIT)
     size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS;
     allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, sharedMemSize, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
-        NpKit::GetCpuTimestamp());
+        nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff,
+        NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
 #else
     allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
-        ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
+        nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
 #endif
     return cudaGetLastError();
   }
 };
 
-inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int ipcDomainNranks, int worldSize,
+inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerIpcDomain, int worldSize,
                                                           [[maybe_unused]] DataType dtype) {
-  int nBlocks = (ipcDomainNranks - 1) * 4;
+  int nBlocks = (nRanksPerIpcDomain - 1) * 4;
   int nThreadsPerBlock = 1024;
   if (inputSize >= 32768) {
     nBlocks = (worldSize - 1) * 8;
@@ -229,12 +230,17 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
                                                 const std::unordered_map<std::string, uintptr_t>&,
                                                 DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+  if (ctx->workSize != ctx->nRanksPerIpcDomain) {
+    WARN(ALGO, "AllreducePacket requires workSize to match nRanksPerIpcDomain, got workSize=", ctx->workSize,
+         ", nRanksPerIpcDomain=", ctx->nRanksPerIpcDomain);
+    return CommResult::CommInvalidArgument;
+  }
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->ipcDomainNranks, ctx->workSize, dtype);
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->nRanksPerIpcDomain, ctx->workSize, dtype);
   } else {
-    const int nPeers = ctx->workSize - 1;
-    if (nPeers > 0 && blockAndThreadNum.first < nPeers) {
+    const int nPeers = ctx->nRanksPerIpcDomain - 1;
+    if (blockAndThreadNum.first < nPeers) {
       return CommResult::CommInvalidArgument;
     }
   }
@@ -252,8 +258,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
   }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr,
-                channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize,
-                stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
+                channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize,
+                inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error));
@@ -268,7 +274,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
   const int nChannelsPerConnection = maxBlockNum_;
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   ctx->memorySemaphores = this->memorySemaphores_;
   ctx->registeredMemories = this->registeredMemories_;
   ctx->registeredMemories.pop_back();  // remove the local memory from previous context
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index 22e3a4ee4..f07e0e2c8 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -31,18 +31,18 @@ namespace collective {
 template <ReduceOp OpType, typename T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
-                  DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int ipcDomainNranks,
+                  DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int nRanksPerIpcDomain,
                   int worldSize, size_t nelems) {
   int blockId = blockIdx.x;
-  uint32_t nPeers = ipcDomainNranks - 1;
+  uint32_t nPeers = nRanksPerIpcDomain - 1;
 
   assert((uintptr_t)buff % sizeof(int4) == 0);
   assert((uintptr_t)resultBuff % sizeof(int4) == 0);
 
   constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T);
-  uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 *
-                           nelemsPerInt4 * ipcDomainNranks;
-  uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks;
+  uint32_t alignedNelems = ((nelems + nRanksPerIpcDomain - 1) / nRanksPerIpcDomain + nelemsPerInt4 - 1) /
+                           nelemsPerInt4 * nelemsPerInt4 * nRanksPerIpcDomain;
+  uint32_t nelemsPerRank = alignedNelems / nRanksPerIpcDomain;
   uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;
   uint32_t lastInt4Index = nelems / nelemsPerInt4;
   uint32_t remainder = nelems % nelemsPerInt4;
@@ -59,7 +59,7 @@ __global__ void __launch_bounds__(1024, 1)
     nInt4PerBlock += remainderForBlock;
   }
   if (nInt4PerBlock == 0) return;
-  uint32_t nInt4ForCopy = nInt4PerBlock * ipcDomainNranks;
+  uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerIpcDomain;
 
   for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) {
     int rankIdx = idx / nInt4PerBlock;
@@ -84,13 +84,13 @@ __global__ void __launch_bounds__(1024, 1)
     if (offset > lastInt4Index) continue;
     int4 tmp = scratch4[offset];
     for (uint32_t i = 0; i < nPeers; i++) {
-      int rankIdx = (rank + i + 1) % ipcDomainNranks;
+      int rankIdx = (rank + i + 1) % nRanksPerIpcDomain;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       int4 data = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
       tmp = calVector<T, OpType>(data, tmp);
     }
     for (uint32_t i = 0; i < nPeers; i++) {
-      int rankIdx = (rank + i + 1) % ipcDomainNranks;
+      int rankIdx = (rank + i + 1) % nRanksPerIpcDomain;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       mscclpp::write<int4>(((void**)remoteMemories)[peerIdx], offset, tmp);
     }
@@ -127,8 +127,8 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream,
-                          void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
+                          size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
+                          cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
     if (nBlocks == 0 || nThreadsPerBlock == 0) {
@@ -137,7 +137,7 @@ struct AllreduceRsAgAdapter {
     }
     allreduceRsAg<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        ipcDomainNranks, worldSize, nelems);
+        nRanksPerIpcDomain, worldSize, nelems);
     return cudaGetLastError();
   }
 };
@@ -185,7 +185,7 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr<void> ctx, c
   }
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(),
                                 this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank,
-                                algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, nullptr, 0, 0,
+                                algoCtx->nRanksPerIpcDomain, algoCtx->workSize, inputSize, stream, nullptr, 0, 0,
                                 numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
@@ -203,7 +203,7 @@ std::shared_ptr<void> AllreduceRsAg::initAllreduceContext(std::shared_ptr<Commun
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
index bedf15c50..e9d543eaa 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
@@ -86,15 +86,15 @@ template <ReduceOp OpType, typename T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
                           DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank,
-                          int ipcDomainNranks, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut,
-                          uint32_t nblocksForReduce, uint32_t nblocksForRecv) {
+                          int nRanksPerIpcDomain, int worldSize, size_t nelems, size_t scratchSize,
+                          uint32_t nblocksForPut, uint32_t nblocksForReduce, uint32_t nblocksForRecv) {
   uint32_t bid = blockIdx.x;
   constexpr uint32_t nStepsPerIter = 4;
   uint32_t nInt4 = (nelems * sizeof(T) + sizeof(int4) - 1) / sizeof(int4);
   uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter;
   const uint32_t chunkSize = nInt4PerIter * worldSize;
   uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize;
-  uint32_t nPeers = ipcDomainNranks - 1;
+  uint32_t nPeers = nRanksPerIpcDomain - 1;
   int4* scratch4 = reinterpret_cast<int4*>((char*)scratch);
   const uint32_t scratchIterStride = 2 * chunkSize;  // one for AS, one for AG
   const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride;
@@ -111,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1)
       __syncthreads();
       uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x;
       for (uint32_t peer = 0; peer < nPeers; peer++) {
-        int remoteRankId = (rank + peer + 1) % ipcDomainNranks;
+        int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain;
         int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1;
         // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot)
         uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter;
@@ -164,7 +164,7 @@ __global__ void __launch_bounds__(1024, 1)
         int4 tmp = loadVec(buff, myChunkOffset, nelems);
         // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer])
         for (uint32_t peer = 0; peer < nPeers; peer++) {
-          int remoteRankId = (rank + peer + 1) % ipcDomainNranks;
+          int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain;
           uint32_t peerSlotOffset =
               baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut;
           int4 data = scratch4[peerSlotOffset];
@@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024, 1)
         uint32_t dstOffset =
             baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut;
         for (uint32_t i = 0; i < nPeers; i++) {
-          int peerIdx = (rank + i + 1) % ipcDomainNranks;
+          int peerIdx = (rank + i + 1) % nRanksPerIpcDomain;
           int index = peerIdx < rank ? peerIdx : peerIdx - 1;
           mscclpp::write<int4>(((void**)remoteMemories)[index], dstOffset, tmp);
         }
@@ -203,7 +203,7 @@ __global__ void __launch_bounds__(1024, 1)
       __syncthreads();
       // Copy other ranks' reduced chunks from scratch to result
       for (uint32_t peer = 0; peer < nPeers; peer++) {
-        int remoteRankId = (rank + peer + 1) % ipcDomainNranks;
+        int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain;
         for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) {
           uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv +
                             step * blockDim.x * nblocksForRecv;
@@ -224,7 +224,7 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t scratchSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize,
+                          size_t scratchSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
                           cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
@@ -248,7 +248,7 @@ struct AllreduceRsAgPipelineAdapter {
     }
     allreduceRsAgPipeline<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank,
-        ipcDomainNranks, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv);
+        nRanksPerIpcDomain, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv);
     return cudaGetLastError();
   }
 };
@@ -288,8 +288,8 @@ CommResult AllreduceRsAgPipeline::allreduceKernelFunc(
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
   cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(),
                                 this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_,
-                                algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, nullptr,
-                                0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
+                                algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->workSize, inputSize, stream,
+                                nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
@@ -306,7 +306,7 @@ std::shared_ptr<void> AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->scratchSemaphores_;
   ctx->registeredMemories = this->remoteScratchMemories_;
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index 10d3a35c2..753ad7999 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -116,8 +116,8 @@ template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgZeroCopyAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
-                          size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream,
-                          void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
+                          size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize,
+                          cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
     using ChannelType = DeviceHandle<BaseMemoryChannel>;
     size_t nelems = inputSize / sizeof(T);
     if (nBlocks == 0 || nThreadsPerBlock == 0) {
@@ -127,16 +127,16 @@ struct AllreduceRsAgZeroCopyAdapter {
         nBlocks = 128;
       }
     }
-    if (ipcDomainNranks == 4) {
+    if (nRanksPerIpcDomain == 4) {
       allreduceRsAgZeroCopy<4, OpType, T, AccumT>
           <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
                                                      switchChannel, remoteMemories, rank, worldSize, nelems);
-    } else if (ipcDomainNranks == 8) {
+    } else if (nRanksPerIpcDomain == 8) {
       allreduceRsAgZeroCopy<8, OpType, T, AccumT>
           <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
                                                      switchChannel, remoteMemories, rank, worldSize, nelems);
     } else {
-      WARN(ALGO, "AllreduceRsAgZeroCopy only supports ipcDomainNranks of 4 or 8, got: ", ipcDomainNranks);
+      WARN(ALGO, "AllreduceRsAgZeroCopy only supports nRanksPerIpcDomain of 4 or 8, got: ", nRanksPerIpcDomain);
       return cudaErrorInvalidValue;
     }
     return cudaGetLastError();
@@ -172,9 +172,13 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void
   }
   cudaError_t error =
       allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(),
-                nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize,
+                nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->workSize, inputSize,
                 stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
+    if (error == cudaErrorInvalidValue) {
+      WARN(ALGO, "AllreduceRsAgZeroCopy received invalid launch arguments: ", cudaGetErrorString(error));
+      return CommResult::CommInvalidArgument;
+    }
     WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
   }
@@ -200,7 +204,7 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
   auto ctx = std::make_shared<AlgorithmCtx>();
   ctx->rank = comm->bootstrap()->getRank();
   ctx->workSize = comm->bootstrap()->getNranks();
-  ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain();
+  ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
 
   ctx->memorySemaphores = this->semaphores_;
 
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
index 9a1742db1..5662d1163 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
@@ -29,7 +29,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   uint32_t nSwitchChannels_;
-  int ipcDomainNranks_ = 0;
+  int nRanksPerIpcDomain_ = 0;
   int nBaseChannels_ = 0;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
index e2aa8c873..f347c871f 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
@@ -29,7 +29,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   uint32_t nSwitchChannels_;
-  int ipcDomainNranks_ = 0;
+  int nRanksPerIpcDomain_ = 0;
   int nBaseChannels_ = 0;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;
diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp
index 22513ace5..5d593449c 100644
--- a/src/ext/collectives/include/allreduce/common.hpp
+++ b/src/ext/collectives/include/allreduce/common.hpp
@@ -39,34 +39,43 @@ MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() {
 template <typename T, typename AccumT = T>
 MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t srcOffset, size_t dstOffset, size_t size,
                                                       int tid, int nThreads) {
-  // nvls can only handle 4 bytes alignment
-  MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned");
-  constexpr size_t nElem = calcVectorSize<T>();
-  // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations
-  constexpr size_t vecSize = (std::is_same_v<T, int> || std::is_same_v<T, int32_t> || std::is_same_v<T, unsigned int> ||
-                              std::is_same_v<T, uint32_t>)
-                                 ? 1
-                                 : nElem;
-  using vectorType = mscclpp::VectorType<T, vecSize>;
-  const size_t nVec = size / sizeof(vectorType);
-  const size_t srcOffset4 = srcOffset / sizeof(vectorType);
-  const size_t dstOffset4 = dstOffset / sizeof(vectorType);
-  vectorType* src4 = (vectorType*)src;
-  vectorType* dst4 = (vectorType*)dst;
-  for (size_t idx = tid; idx < nVec; idx += nThreads) {
-    auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<vectorType, AccumT>(src4 + srcOffset4 + idx);
-    mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx);
-  }
-  // handle rest of data
-  size_t processed = nVec * sizeof(vectorType);
-  constexpr size_t nRestElem = 4 / sizeof(T);
-  using restVectorType = mscclpp::VectorType<T, nRestElem>;
-  const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType);
-  const size_t endIdx = (srcOffset + size) / sizeof(restVectorType);
-  for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) {
-    auto val =
-        mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<restVectorType, AccumT>((restVectorType*)src + idx);
-    mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx);
+#if defined(__FP8_TYPES_EXIST__) && \
+    (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000))
+  if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
+    assert(false && "FP8 NVLS multimem requires sm_100a or newer");
+    return;
+  } else
+#endif
+  {
+    // nvls can only handle 4 bytes alignment
+    MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned");
+    constexpr size_t nElem = calcVectorSize<T>();
+    // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations
+    constexpr size_t vecSize = (std::is_same_v<T, int> || std::is_same_v<T, int32_t> ||
+                                std::is_same_v<T, unsigned int> || std::is_same_v<T, uint32_t>)
+                                   ? 1
+                                   : nElem;
+    using vectorType = mscclpp::VectorType<T, vecSize>;
+    const size_t nVec = size / sizeof(vectorType);
+    const size_t srcOffset4 = srcOffset / sizeof(vectorType);
+    const size_t dstOffset4 = dstOffset / sizeof(vectorType);
+    vectorType* src4 = (vectorType*)src;
+    vectorType* dst4 = (vectorType*)dst;
+    for (size_t idx = tid; idx < nVec; idx += nThreads) {
+      auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<vectorType, AccumT>(src4 + srcOffset4 + idx);
+      mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx);
+    }
+    // handle rest of data
+    size_t processed = nVec * sizeof(vectorType);
+    constexpr size_t nRestElem = 4 / sizeof(T);
+    using restVectorType = mscclpp::VectorType<T, nRestElem>;
+    const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType);
+    const size_t endIdx = (srcOffset + size) / sizeof(restVectorType);
+    for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) {
+      auto val =
+          mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce<restVectorType, AccumT>((restVectorType*)src + idx);
+      mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx);
+    }
   }
 }
 #endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index c1cad4121..2e61b9379 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -27,8 +27,8 @@ namespace mscclpp {
 namespace collective {
 constexpr int NUM_NVLS_CONNECTION = 8;
 // Sized to cover MAX_IPC_DOMAIN_NRANKS-scale allreduce algos whose device-side
-// semaphore indices grow as O(ipcDomainNranks) (e.g. nvls_block_pipeline uses
-// up to ~5 * ipcDomainNranks entries).
+// semaphore indices grow as O(nRanksPerIpcDomain) (e.g. nvls_block_pipeline uses
+// up to ~5 * nRanksPerIpcDomain entries).
 constexpr int NUM_SEMAPHORES = 512;
 
 // Upper bound on the number of NVLink-reachable ranks that participate in a
@@ -37,7 +37,7 @@ constexpr int NUM_SEMAPHORES = 512;
 // of shared-memory channel arrays in the allreduce/allgather kernels.
 constexpr int MAX_IPC_DOMAIN_NRANKS = 72;
 
-constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
+constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // Two 70 MiB buffers for double-buffered packet scratch space.
 
 std::vector<RegisteredMemory> setupRemoteMemories(std::shared_ptr<Communicator> comm, int rank,
                                                   RegisteredMemory localMemory);
@@ -79,7 +79,7 @@ class AlgorithmCtx {
  public:
   int rank;
   int workSize;
-  int ipcDomainNranks;
+  int nRanksPerIpcDomain;
 
   std::vector<RegisteredMemory> registeredMemories;
   std::vector<MemoryChannel> memoryChannels;

From 18d37379d264f5b08e409ca3b99b3fd4c24f67cc Mon Sep 17 00:00:00 2001
From: Qinghua Zhou <qinghuazhou@microsoft.com>
Date: Sat, 16 May 2026 23:23:30 +0000
Subject: [PATCH 39/44] Tighten NVML IPC domain hash lookup

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/core/utils_internal.cc | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc
index 2e620b660..adbf8e5b7 100644
--- a/src/core/utils_internal.cc
+++ b/src/core/utils_internal.cc
@@ -208,30 +208,18 @@ uint64_t getFabricHash(const nvmlGpuFabricInfo_t& fabricInfo) {
 bool tryGetNvmlIpcDomainHash(uint64_t& ipcDomainHash) {
   // Use the current CUDA device; callers must set the rank's device before querying.
   int deviceId;
-  if (cudaGetDevice(&deviceId) != cudaSuccess) {
-    return false;
-  }
-
   char pciBusId[] = "00000000:00:00.0";
-  if (cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) {
+  if (cudaGetDevice(&deviceId) != cudaSuccess ||
+      cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) {
     return false;
   }
 
   static NvmlState nvml;
-  if (!nvml.isInitialized()) {
-    return false;
-  }
-
   nvmlDevice_t nvmlDevice;
-  if (nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS) {
-    return false;
-  }
-
   nvmlGpuFabricInfo_t fabricInfo = {};
-  if (nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS) {
-    return false;
-  }
-  if (fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) {
+  if (!nvml.isInitialized() || nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS ||
+      nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS ||
+      fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) {
     return false;
   }
 

From 4db71b93b74fc137a5482f6a840b9161c8759e2d Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 18 May 2026 20:50:01 +0000
Subject: [PATCH 40/44] Move barrier into setupNvlsChannels and clean up NVLS
 pipeline state

- setupNvlsChannels now takes the Communicator and barriers internally
  after binding all switch channels, replacing the explicit
  bootstrap()->barrier() previously done only in AllreduceNvlsPacket.
- Demote nRanksPerIpcDomain_ / nBaseChannels_ to locals in
  AllreduceNvlsBlockPipeline and AllreduceNvlsWarpPipeline; they were
  never read outside initialize().
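- Drive-by: pick up in-tree edits: executor.cc now calls
  bootstrap()->barrier() after binding the NVLS switch channels (a
  behavioral change); switch_channel_device.hpp trims the
  multimemLoadReduce doc comment; communicator.hpp gains a blank line
  only; allreduce_rsag.cu has a one-line change.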

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 include/mscclpp/switch_channel_device.hpp             |  3 +--
 src/core/executor/executor.cc                         |  1 +
 src/core/include/communicator.hpp                     |  1 +
 .../allreduce/allreduce_nvls_block_pipeline.cu        | 11 +++++------
 .../collectives/allreduce/allreduce_nvls_packet.cu    |  3 +--
 .../allreduce/allreduce_nvls_warp_pipeline.cu         | 11 +++++------
 .../collectives/allreduce/allreduce_nvls_zero_copy.cu |  6 +++---
 src/ext/collectives/allreduce/allreduce_rsag.cu       |  2 +-
 src/ext/collectives/collective_utils.cc               |  8 ++++----
 .../allreduce/allreduce_nvls_block_pipeline.hpp       |  2 --
 .../allreduce/allreduce_nvls_warp_pipeline.hpp        |  2 --
 src/ext/collectives/include/collective_utils.hpp      |  3 ++-
 12 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp
index df22bd3aa..fcdd7fddb 100644
--- a/include/mscclpp/switch_channel_device.hpp
+++ b/include/mscclpp/switch_channel_device.hpp
@@ -39,8 +39,7 @@ struct SwitchChannelDeviceHandle {
 
   /// Vectorized multimem load+reduce. The optional `AccumT` template parameter selects the
   /// accumulator: when `AccumT == __half` and `VectorType` is an FP8 vector type, the
-  /// `.acc::f16` variant of the instruction is used (faster but lower precision than the
-  /// default FP32 accumulator). For all other types `AccumT` is ignored.
+  /// `.acc::f16` variant of the instruction is used. For all other types `AccumT` is ignored.
   template <typename VectorType, typename AccumT = void>
   MSCCLPP_DEVICE_INLINE static VectorType multimemLoadReduce(VectorType* ptr) {
     VectorType val;
diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index fcecc4ddf..15c6af4e6 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -389,6 +389,7 @@ struct Executor::Impl {
           nvlsConnection->bindAllocatedMemory((CUdeviceptr)bufferInfo.first, bufferInfo.second);
       context.nvlsChannels.push_back(switchChannel);
     }
+    this->comm->bootstrap()->barrier();
   }
 
   void setupSemaphores(ExecutionContext& context, const ExecutionPlan& plan) {
diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp
index 333cc9823..f15e20f74 100644
--- a/src/core/include/communicator.hpp
+++ b/src/core/include/communicator.hpp
@@ -60,6 +60,7 @@ struct Communicator::Impl {
   std::shared_ptr<Bootstrap> bootstrap_;
   std::shared_ptr<Context> context_;
   std::unordered_map<const BaseConnection*, ConnectionInfo> connectionInfos_;
+
   // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair.
   // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready.
   std::unordered_map<std::pair<int, int>, std::shared_ptr<BaseRecvItem>, PairHash> lastRecvItems_;
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index 347ce8b41..04c7f8c99 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include <algorithm>
 #include <mscclpp/algorithm.hpp>
 
 #include "allreduce/allreduce_nvls_block_pipeline.hpp"
@@ -177,15 +176,15 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
-  nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain();
+  int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   // Per-peer channel allocation must hold up to 4 * nRanksPerIpcDomain entries (see kernel).
-  nBaseChannels_ = std::max(64, 4 * nRanksPerIpcDomain_);
+  int nBaseChannels = std::max(64, 4 * nRanksPerIpcDomain);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
-      setupMemorySemaphores(comm, this->conns_, nBaseChannels_);
+      setupMemorySemaphores(comm, this->conns_, nBaseChannels);
   // setup base memory channels
-  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_);
+  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels);
   this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
 }
@@ -228,7 +227,7 @@ std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar
 
   // setup channels
   ctx->switchChannels =
-      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
+      setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
   ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
   return ctx;
 }
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index f16e8b05f..1918eef19 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -82,8 +82,7 @@ void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator> comm) {
   int nSwitchChannels = 1;
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
   this->switchChannels_ =
-      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
-  comm->bootstrap()->barrier();
+      setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
 }
 
 AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index ba447d32a..d5bbb2e71 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include <algorithm>
 #include <mscclpp/algorithm.hpp>
 
 #include "allreduce/allreduce_nvls_warp_pipeline.hpp"
@@ -141,15 +140,15 @@ struct NvlsWarpPipelineAdapter {
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
-  nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain();
+  int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * nRanksPerIpcDomain.
-  nBaseChannels_ = std::max(64, 8 * nRanksPerIpcDomain_);
+  int nBaseChannels = std::max(64, 8 * nRanksPerIpcDomain);
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores =
-      setupMemorySemaphores(comm, this->conns_, nBaseChannels_);
+      setupMemorySemaphores(comm, this->conns_, nBaseChannels);
   // setup base memory channels
-  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_);
+  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels);
   this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
 }
@@ -192,7 +191,7 @@ std::shared_ptr<void> AllreduceNvlsWarpPipeline::initAllreduceContext(std::share
 
   // setup channels
   ctx->switchChannels =
-      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
+      setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
   ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
   return ctx;
 }
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index 32fc61423..481e8ad85 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.
 
 #include <mscclpp/core.hpp>
-#include <mscclpp/errors.hpp>
 
 #include "allreduce/allreduce_nvls_zero_copy.hpp"
 #include "allreduce/common.hpp"
@@ -195,11 +194,12 @@ std::shared_ptr<void> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclp
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
 
   // setup channels
-  ctx->switchChannels = setupNvlsChannels(this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_);
+  ctx->switchChannels =
+      setupNvlsChannels(comm, this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_);
   if (input != output) {
     auto nvlsOutConnections = this->nvlsOutConnections_;
     std::vector<mscclpp::SwitchChannel> outChannels =
-        setupNvlsChannels(this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_);
+        setupNvlsChannels(comm, this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_);
     ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end());
   }
 
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index f07e0e2c8..6fffc4dac 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -144,7 +144,7 @@ struct AllreduceRsAgAdapter {
 
 void AllreduceRsAg::initialize(std::shared_ptr<Communicator> comm) {
   this->conns_ = setupConnections(comm);
-  nChannelsPerConnection_ = 128;
+  nChannelsPerConnection_ = 64;
   comm_ = comm;
   // setup semaphores
   this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc
index c3856a88e..5d038afae 100644
--- a/src/ext/collectives/collective_utils.cc
+++ b/src/ext/collectives/collective_utils.cc
@@ -6,12 +6,9 @@
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
 #include <mscclpp/core.hpp>
-#include <mscclpp/errors.hpp>
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
 
-#include "logger.hpp"
-
 namespace mscclpp {
 namespace collective {
 std::vector<mscclpp::RegisteredMemory> setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
@@ -101,7 +98,8 @@ std::vector<std::shared_ptr<mscclpp::NvlsConnection>> setupNvlsConnections(std::
   return nvlsConnections;
 }
 
-std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::vector<std::shared_ptr<mscclpp::NvlsConnection>> conns,
+std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::shared_ptr<mscclpp::Communicator> comm,
+                                                      std::vector<std::shared_ptr<mscclpp::NvlsConnection>> conns,
                                                       void* buffer, size_t bufferSize, int nSwitchChannels) {
   std::vector<mscclpp::SwitchChannel> channels;
 
@@ -110,6 +108,8 @@ std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::vector<std::shared_pt
     mscclpp::SwitchChannel switchChannel = nvlsConnection->bindAllocatedMemory((CUdeviceptr)buffer, bufferSize);
     channels.push_back(switchChannel);
   }
+  // Synchronize to make sure all ranks have their NVLS channels set up before any rank starts using them.
+  comm->bootstrap()->barrier();
   return channels;
 }
 
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
index 5662d1163..81b74add4 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
@@ -29,8 +29,6 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   uint32_t nSwitchChannels_;
-  int nRanksPerIpcDomain_ = 0;
-  int nBaseChannels_ = 0;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
index f347c871f..8f02a8738 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
@@ -29,8 +29,6 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder {
   void* scratchBuffer_;
   size_t scratchBufferSize_;
   uint32_t nSwitchChannels_;
-  int nRanksPerIpcDomain_ = 0;
-  int nBaseChannels_ = 0;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index 2e61b9379..95ce7f5a4 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -57,7 +57,8 @@ std::shared_ptr<DeviceHandle<MemoryChannel>> setupMemoryChannelDeviceHandles(
 std::vector<std::shared_ptr<NvlsConnection>> setupNvlsConnections(std::shared_ptr<Communicator> comm, size_t size,
                                                                   int numConnections);
 
-std::vector<SwitchChannel> setupNvlsChannels(std::vector<std::shared_ptr<NvlsConnection>> conns, void* buffer,
+std::vector<SwitchChannel> setupNvlsChannels(std::shared_ptr<Communicator> comm,
+                                             std::vector<std::shared_ptr<NvlsConnection>> conns, void* buffer,
                                              size_t bufferSize, int nSwitchChannels);
 
 std::shared_ptr<DeviceHandle<SwitchChannel>> setupNvlsChannelDeviceHandles(

From ac44e98d962e6d629372b1ab5e61b6f45449a766 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 20 May 2026 20:33:27 +0000
Subject: [PATCH 41/44] Add nRanksPerNode field to AlgorithmCtx

---
 src/ext/collectives/include/collective_utils.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
index be18477a3..c2bcd87e3 100644
--- a/src/ext/collectives/include/collective_utils.hpp
+++ b/src/ext/collectives/include/collective_utils.hpp
@@ -80,6 +80,7 @@ class AlgorithmCtx {
  public:
   int rank;
   int worldSize;
+  int nRanksPerNode;
   int nRanksPerIpcDomain;
 
   std::vector<RegisteredMemory> registeredMemories;

From 42ece408b993a4be71012b169eed5b28453db796 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sun, 24 May 2026 05:56:11 +0000
Subject: [PATCH 42/44] Fix memory leak

---
 .../customized_comm_with_tuning.py            |  4 ++--
 .../allreduce/allreduce_fullmesh.cu           | 24 ++++++++++++++-----
 .../allreduce/allreduce_rsag_zero_copy.cu     |  2 --
 .../include/allreduce/allreduce_fullmesh.hpp  |  2 --
 .../allreduce/allreduce_rsag_zero_copy.hpp    |  2 --
 5 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index cb3661675..cf475cdfc 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -58,8 +58,8 @@ class CustomizedComm:
     """Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
 
     _TUNE_N_WARMUP = 5
-    _TUNE_N_GRAPH_LAUNCHES = 10
-    _TUNE_N_OPS_PER_GRAPH = 100
+    _TUNE_N_GRAPH_LAUNCHES = 5
+    _TUNE_N_OPS_PER_GRAPH = 20
     _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 56, 64, 128]
     _CANDIDATE_NTHREADS = [512, 768, 1024]
     _NBLOCKS_LIMIT = {
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index f547ab4fd..eb8726245 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -9,6 +9,17 @@
 namespace mscclpp {
 namespace collective {
 
+namespace {
+// Per-context cache of input-side MemoryChannels keyed by input pointer.
+// Lifetime is tied to AlgorithmCtx, so entries are released when the ctx is
+// evicted from the framework's context cache (avoids unbounded growth across
+// allreduce calls that pass different input buffers).
+using InputChannelsCache =
+    std::unordered_map<const void*,
+                       std::pair<std::vector<MemoryChannel>, std::shared_ptr<DeviceHandle<MemoryChannel>>>>;
+constexpr const char* kInputChannelsExtraKey = "inputChannels";
+}  // namespace
+
 template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(512, 1)
     allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
@@ -195,17 +206,17 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(
     MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
     channelOutOffset = (char*)output - (char*)recvBasePtr;
   }
-  std::shared_ptr<DeviceHandle<MemoryChannel>> inputChannelHandles;
-  if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) {
-    inputChannelHandles = this->memoryChannelsMap_[input].second;
-  } else {
+  auto& inputChannelsCache = *static_cast<InputChannelsCache*>(ctx->extras.at(kInputChannelsExtraKey).get());
+  auto it = inputChannelsCache.find(input);
+  if (it == inputChannelsCache.end()) {
     RegisteredMemory localMemory = comm_->registerMemory(const_cast<void*>(input), inputSize, Transport::CudaIpc);
     std::vector<MemoryChannel> channels =
         setupMemoryChannels(this->conns_, this->inputScratchSemaphores_, this->remoteScratchMemories_, localMemory,
                             nChannelsPerConnection_);
-    this->memoryChannelsMap_[input] = std::make_pair(channels, setupMemoryChannelDeviceHandles(channels));
+    auto handles = setupMemoryChannelDeviceHandles(channels);
+    it = inputChannelsCache.emplace(input, std::make_pair(std::move(channels), std::move(handles))).first;
   }
-  inputChannelHandles = this->memoryChannelsMap_[input].second;
+  std::shared_ptr<DeviceHandle<MemoryChannel>> inputChannelHandles = it->second.second;
 
   AllreduceFunc allreduce = dispatch<AllreduceAllconnectAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
@@ -267,6 +278,7 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
   ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
                                             nChannelsPerConnection_);
   ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
+  ctx->extras.insert({kInputChannelsExtraKey, std::make_shared<InputChannelsCache>()});
   return ctx;
 }
 
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index 877a722a3..e7ed0cabe 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -211,8 +211,6 @@ std::shared_ptr<void> AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt
   // register input and output memories
   RegisteredMemory inputMemory = comm->registerMemory((void*)input, size, Transport::CudaIpc);
   RegisteredMemory outputMemory = comm->registerMemory(output, size, Transport::CudaIpc);
-  this->inputMemories_.push_back(inputMemory);
-  this->outputMemories_.push_back(outputMemory);
 
   auto remoteInputMemories = setupRemoteMemories(comm, ctx->rank, inputMemory);
   auto remoteOutputMemories = setupRemoteMemories(comm, ctx->rank, outputMemory);
diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
index a54352b3f..e0c63a3d3 100644
--- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
@@ -30,8 +30,6 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder {
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> inputScratchSemaphores_;
   std::vector<RegisteredMemory> remoteScratchMemories_;
   RegisteredMemory localScratchMemory_;
-  std::unordered_map<const void*, std::pair<std::vector<MemoryChannel>, std::shared_ptr<DeviceHandle<MemoryChannel>>>>
-      memoryChannelsMap_;
   bool symmetricMemory_ = false;
 };
 }  // namespace collective
diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
index 05bf2ef3c..528d9708b 100644
--- a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
@@ -27,8 +27,6 @@ class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder {
   int nChannelsPerConnection_;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> semaphores_;
-  std::vector<RegisteredMemory> inputMemories_;
-  std::vector<RegisteredMemory> outputMemories_;
 
   std::vector<BaseMemoryChannel> baseChannels_;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> baseMemoryChannelHandles_;

From 641420de6dd89f2cdd7645a253b8d65917edadd1 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 26 May 2026 22:05:22 +0000
Subject: [PATCH 43/44] increase nvls memory size to 64 GB

---
 .../collectives/include/allreduce/allreduce_nvls_zero_copy.hpp  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
index c40bd2cda..a28bcae37 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
@@ -28,7 +28,7 @@ class AllreduceNvls : public AlgorithmBuilder {
   // Large buffer size because cuMemMap requires offset=0 for multicast handles, so the entire
   // user allocation must be mapped. This only reserves virtual address space; no physical memory
   // is consumed beyond what is actually bound.
-  const size_t nvlsBufferSize_ = (1UL << 34);
+  const size_t nvlsBufferSize_ = (1UL << 36);
   uint32_t nSwitchChannels_;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;

From ea73a1e1b7d666802705d69027afd201303e9574 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 26 May 2026 22:34:01 +0000
Subject: [PATCH 44/44] WIP: use minimum multicast granularity in GpuBuffer; revert NVLS buffer size to 16 GB

---
 include/mscclpp/gpu_utils.hpp                                  | 3 ++-
 .../collectives/include/allreduce/allreduce_nvls_zero_copy.hpp | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index b079e0fd9..ed5f9f63b 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -342,7 +342,8 @@ class GpuBuffer {
     MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
 #if (CUDA_NVLS_API_AVAILABLE)
     if (isNvlsSupported()) {
-      size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
+      // TODO: pass granularity from the caller instead of using the minimum granularity.
+      size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_MINIMUM);
       bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T);
       memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran);
       return;
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
index a28bcae37..c40bd2cda 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
@@ -28,7 +28,7 @@ class AllreduceNvls : public AlgorithmBuilder {
   // Large buffer size because cuMemMap requires offset=0 for multicast handles, so the entire
   // user allocation must be mapped. This only reserves virtual address space; no physical memory
   // is consumed beyond what is actually bound.
-  const size_t nvlsBufferSize_ = (1UL << 36);
+  const size_t nvlsBufferSize_ = (1UL << 34);
   uint32_t nSwitchChannels_;
   std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
   std::vector<BaseMemoryChannel> baseChannels_;