From dd8b301a658ce12688e743d996c33ec53544694a Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 27 Apr 2026 20:36:58 +0000 Subject: [PATCH 01/44] Scale native allreduce/allgather algos for MNNVL/MNNVLS Bump MAX_NRANKS_PER_NODE from 8 to 72 to cover Multi-Node NVLink (MNNVL) domains up to GB200 NVL72, and bump NUM_SEMAPHORES from 64 to 512 to accommodate semaphore indexing that grows as O(nRanksPerNode). Convert allreduce_rsag_zero_copy from a compile-time-templated kernel ({4,8} ranks) to a runtime nRanksPerNode kernel; fuse load+reduce inside the peer loop to avoid O(NPeers) register pressure that would otherwise spill at NVL72 scale. Bump AllreduceAllpairPacket::maxBlockNum_ from 28 to 72 so the adapter can launch >= nPeers blocks at MNNVL scale. Fix a shared-memory channel-cache bug across five kernels: nvls_zero_copy, nvls_warp_pipeline, packet, allreduce_fullmesh, and allgather_fullmesh. The original 'if (lid < nPeers) channels[lid] = ...' load only populated the first WARP_SIZE entries, but threads from multiple warps later read channels[threadIdx.x] up to nPeers-1. Replace with a per-warp strided loop so every warp loads all entries before __syncwarp(); the same-value cross-warp writes are benign. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../allgather/allgather_fullmesh.cu | 7 ++- .../allreduce/allreduce_fullmesh.cu | 9 ++- .../allreduce/allreduce_nvls_warp_pipeline.cu | 7 ++- .../allreduce/allreduce_nvls_zero_copy.cu | 7 ++- .../collectives/allreduce/allreduce_packet.cu | 7 ++- .../allreduce/allreduce_rsag_zero_copy.cu | 55 ++++++++----------- .../allreduce/allreduce_allpair_packet.hpp | 4 +- .../collectives/include/collective_utils.hpp | 13 ++++- 8 files changed, 61 insertions(+), 48 deletions(-) diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index fb51a3425..17054869e 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -30,8 +30,11 @@ __global__ void __launch_bounds__(1024, 1) __shared__ DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nPeer) { - channels[lid] = memoryChans[lid]; + // Each warp redundantly loads all entries (same value, benign race) so that + // every warp has the data its threads will read after __syncwarp(). Required + // when nPeer > WARP_SIZE (MNNVL/NVL72 scale). 
+ for (int i = lid; i < nPeer; i += WARP_SIZE) { + channels[i] = memoryChans[i]; } __syncwarp(); const int tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index 24d2a31c2..9d144c621 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -52,9 +52,12 @@ __global__ void __launch_bounds__(512, 1) __shared__ DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; __shared__ DeviceHandle outChannels[MAX_NRANKS_PER_NODE - 1]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nPeer) { - channels[lid] = memoryChans[lid]; - outChannels[lid] = memoryOutChans[lid]; + // Each warp redundantly loads all entries (same value, benign race) so that + // every warp has the data its threads will read after __syncwarp(). Required + // when nPeer > WARP_SIZE (MNNVL/NVL72 scale). + for (int i = lid; i < nPeer; i += WARP_SIZE) { + channels[i] = memoryChans[i]; + outChannels[i] = memoryOutChans[i]; } __syncwarp(); diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 3bb054dae..9be621f08 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -57,8 +57,11 @@ __global__ void __launch_bounds__(1024, 1) auto memoryChans = memoryChannels + chanOffset; __shared__ DeviceHandle channels[(MAX_NRANKS_PER_NODE - 1) * 2]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nPeers * 2) { - channels[lid] = memoryChans[lid]; + // Each warp redundantly loads all entries (same value, benign race) so that + // every warp has the data its threads will read after __syncwarp(). Required + // when nPeers*2 > WARP_SIZE (MNNVL scale). 
+ for (int i = lid; i < nPeers * 2; i += WARP_SIZE) { + channels[i] = memoryChans[i]; } __syncwarp(); for (int it = 0; it < nIter; it++) { diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index e7f2028fa..735deb0a1 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -44,8 +44,11 @@ __global__ void __launch_bounds__(1024, 1) auto memoryChans = memoryChannels + chanOffset; __shared__ mscclpp::DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; const int lid = threadIdx.x % WARP_SIZE; - if (lid < nRanksPerNode - 1) { - channels[lid] = memoryChans[lid]; + // Each warp redundantly loads all entries (same value, benign race) so that + // every warp has the data its threads will read after __syncwarp(). Required + // when nPeers > WARP_SIZE (MNNVL/NVL72 → 71 peers). + for (int i = lid; i < nRanksPerNode - 1; i += WARP_SIZE) { + channels[i] = memoryChans[i]; } __syncwarp(); if (threadIdx.x < nPeers) { diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index e2d8ef732..d39da408e 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -83,8 +83,11 @@ __global__ void __launch_bounds__(1024, 1) // Put channels into shared memory, read channel info from global memory is unexpectable slow. __shared__ mscclpp::DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; const int lid = tid % WARP_SIZE; - if (lid < nPeers) { - channels[lid] = memoryChannels[lid]; + // Each warp redundantly loads all entries (same value, benign race) so that + // every warp has the data its threads will read after __syncwarp(). Required + // when nPeers > WARP_SIZE (MNNVL/NVL72 scale). 
+ for (int i = lid; i < nPeers; i += WARP_SIZE) { + channels[i] = memoryChannels[i]; } __syncwarp(); // step 1: write to scratch buffer diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index f95ba7e33..42d86fc89 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -35,25 +35,26 @@ __device__ mscclpp::DeviceSyncer globalSyncer; // // This approach requires registering both input and output buffers as remote // memories (2 * nPeers handles), but avoids scratch buffer allocation and -// the extra copy steps of the standard RSAG. The NRanksPerNode template -// parameter enables compile-time unrolling of peer loops (supports 4 or 8). +// the extra copy steps of the standard RSAG. nRanksPerNode is accepted at +// runtime, which allows the same kernel to handle any NVLink-domain size +// (including Multi-Node NVLink fabrics up to NVL72). 
-template +template __global__ void __launch_bounds__(1024, 1) allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, - size_t nelems) { + DeviceHandle* switchChannels, void* remoteMemories, int rank, + int nRanksPerNode, int worldSize, size_t nelems) { int blockId = blockIdx.x; assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); - constexpr int NPeers = NRanksPerNode - 1; + const int NPeers = nRanksPerNode - 1; constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); - const uint32_t outputRemoteBufferOffset = NRanksPerNode - 1; - uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * NRanksPerNode; - uint32_t nelemsPerRank = alignedNelems / NRanksPerNode; + const uint32_t outputRemoteBufferOffset = NPeers; + uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * nRanksPerNode; + uint32_t nelemsPerRank = alignedNelems / nRanksPerNode; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4; @@ -69,12 +70,11 @@ __global__ void __launch_bounds__(1024, 1) } if (nInt4PerBlock == 0) return; - if (threadIdx.x < NPeers) { + if ((int)threadIdx.x < NPeers) { memoryChannelsLocal[threadIdx.x].relaxedSignal(); memoryChannelsLocal[threadIdx.x].relaxedWait(); } __syncthreads(); - int4 data[NPeers]; // AccumInt4: when AccumT != T, use a wider accumulator type. // For AccumT == T, this is just int4 (no-op conversion). 
constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); @@ -84,20 +84,17 @@ __global__ void __launch_bounds__(1024, 1) uint32_t offset = idx + offset4 + rank * nInt4PerRank; if (offset >= nInt4Total) continue; int4 tmp_raw = buff4[offset]; -#pragma unroll - for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % NRanksPerNode; - int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; - data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); - } + int4 data; AccumVec acc = mscclpp::upcastVector(tmp_raw); for (int i = 0; i < NPeers; i++) { - acc = mscclpp::calVectorAccum(acc, data[i]); + int rankIdx = (rank + i + 1) % nRanksPerNode; + int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; + data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); + acc = mscclpp::calVectorAccum(acc, data); } int4 tmp = mscclpp::downcastVector(acc); -#pragma unroll for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % NRanksPerNode; + int rankIdx = (rank + i + 1) % nRanksPerNode; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp); } @@ -105,7 +102,7 @@ __global__ void __launch_bounds__(1024, 1) } // Use device barrier gives better performance here. 
globalSyncer.sync(gridDim.x); - if (blockIdx.x == 0 && threadIdx.x < NPeers) { + if (blockIdx.x == 0 && (int)threadIdx.x < NPeers) { memoryChannelsLocal[threadIdx.x].signal(); memoryChannelsLocal[threadIdx.x].wait(); } @@ -126,17 +123,9 @@ struct AllreduceRsAgZeroCopyAdapter { nBlocks = 128; } } - if (nRanksPerNode == 4) { - allreduceRsAgZeroCopy<4, OpType, T, AccumT> - <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, - switchChannel, remoteMemories, rank, worldSize, nelems); - } else if (nRanksPerNode == 8) { - allreduceRsAgZeroCopy<8, OpType, T, AccumT> - <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, - switchChannel, remoteMemories, rank, worldSize, nelems); - } else { - THROW(ALGO, Error, ErrorCode::InvalidUsage, "Unsupported number of ranks per node: ", nRanksPerNode); - } + allreduceRsAgZeroCopy<<>>( + (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, + nRanksPerNode, worldSize, nelems); return cudaGetLastError(); } }; diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index 362308b2e..fe96f7622 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -29,7 +29,9 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; - const int maxBlockNum_ = 28; + // Must be at least MAX_NRANKS_PER_NODE-1 so the adapter can launch one + // block per peer at MNNVL scale. 
+ const int maxBlockNum_ = 72; std::vector conns_; std::vector> memorySemaphores_; std::vector registeredMemories_; diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index f705a9d1d..638214dd5 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -26,9 +26,16 @@ namespace mscclpp { namespace collective { constexpr int NUM_NVLS_CONNECTION = 8; -constexpr int NUM_SEMAPHORES = 64; - -constexpr int MAX_NRANKS_PER_NODE = 8; +// Sized to cover MAX_NRANKS_PER_NODE-scale allreduce algos whose device-side +// semaphore indices grow as O(nRanksPerNode) (e.g. nvls_block_pipeline uses +// up to ~5 * nRanksPerNode entries). +constexpr int NUM_SEMAPHORES = 512; + +// Upper bound on the number of NVLink-reachable ranks that participate in a +// single collective. Sized to cover Multi-Node NVLink (MNNVL) domains up to +// GB200 NVL72 (72 GPUs sharing one NVLink fabric). Drives compile-time sizing +// of shared-memory channel arrays in the allreduce/allgather kernels. +constexpr int MAX_NRANKS_PER_NODE = 72; constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB From 893a08e69c036311bc3e8e74bf4e5973d9e0c317 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 28 Apr 2026 05:38:59 +0000 Subject: [PATCH 02/44] Enable MNNVL allreduce tuning Add an MNNVL rank-domain override so MSCCL++ collectives can treat multi-host NVLink fabrics as a single CUDA IPC/NVLS peer group. Update packet, RSAG, and NVLS allreduce paths to use the collective domain size and teach the torch integration tuning example to select MNNVL-capable allreduce algorithms. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 176 +++++++++++++++--- include/mscclpp/env.hpp | 6 + src/core/bootstrap/bootstrap.cc | 5 + src/core/env.cpp | 4 +- .../allreduce/allreduce_allpair_packet.cu | 7 +- .../allreduce/allreduce_nvls_packet.cu | 4 +- .../allreduce/allreduce_nvls_zero_copy.cu | 2 +- .../collectives/allreduce/allreduce_packet.cu | 7 +- .../collectives/allreduce/allreduce_rsag.cu | 2 +- .../allreduce/allreduce_rsag_zero_copy.cu | 2 +- src/ext/collectives/collective_utils.cc | 21 ++- .../collectives/include/collective_utils.hpp | 4 +- 12 files changed, 199 insertions(+), 41 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 060a0097d..035c1dbbb 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -2,11 +2,34 @@ # Licensed under the MIT License. 
# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py +# mpirun -np 2 --hostfile python3 examples/torch-integration/customized_comm_with_tuning.py -import os +import gc +import fcntl import ipaddress +import os +import socket +import struct +import sys +import traceback + +def _get_bootstrap_world_size(): + for name in ("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS"): + value = os.environ.get(name) + if value is not None: + return int(value) + return None + + +_bootstrap_world_size = _get_bootstrap_world_size() +if ( + _bootstrap_world_size + and _bootstrap_world_size > 1 + and "MSCCLPP_MNNVL_NRANKS_PER_NODE" not in os.environ + and os.environ.get("MSCCLPP_ENABLE_MNNVL", "1") != "0" +): + os.environ["MSCCLPP_MNNVL_NRANKS_PER_NODE"] = str(_bootstrap_world_size) -import netifaces as ni import torch import mscclpp import mscclpp.ext @@ -37,15 +60,44 @@ def _load_algorithms(scratch: torch.Tensor, rank: int): def _interfaces_for_ip(ip: str): target = ipaddress.ip_address(ip) - for iface in ni.interfaces(): - addrs = ni.ifaddresses(iface) - if ni.AF_INET in addrs: - for link in addrs[ni.AF_INET]: - if "addr" in link and ipaddress.ip_address(link["addr"]) == target: - return iface + for iface in os.listdir("/sys/class/net"): + try: + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock: + req = struct.pack("256s", iface.encode()[:15]) + addr = socket.inet_ntoa(fcntl.ioctl(sock.fileno(), 0x8915, req)[20:24]) + except OSError: + continue + if ipaddress.ip_address(addr) == target: + return iface return None +def _resolve_interface(master_addr: str): + for env_name in ("MSCCLPP_INTERFACE", "MSCCLPP_SOCKET_IFNAME", "NCCL_SOCKET_IFNAME"): + value = os.environ.get(env_name) + if value: + iface = value.split(",")[0].strip() + if iface in os.listdir("/sys/class/net"): + return iface + raise ValueError(f"Interface {iface} from {env_name} does not exist") + return _interfaces_for_ip(master_addr) + + +def 
_get_env_int(*names: str, default=None): + for name in names: + value = os.environ.get(name) + if value is not None: + return int(value) + return default + + +def _running_under_mpi() -> bool: + return any( + name in os.environ + for name in ("OMPI_COMM_WORLD_RANK", "PMI_RANK", "PMIX_RANK", "MPI_LOCALRANKID", "SLURM_PROCID") + ) + + def _to_mscclpp_op(op) -> mscclpp.ReduceOp: if op == torch.distributed.ReduceOp.SUM: return mscclpp.ReduceOp.SUM @@ -76,6 +128,7 @@ class CustomizedComm: "default_allreduce_nvls_packet": 16, "default_allreduce_packet": 56, "default_allreduce_allpair_packet": 56, + "default_allreduce_rsag": 64, "default_allreduce_fullmesh": 64, "default_allgather_fullmesh2": 32, } @@ -84,6 +137,12 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self.comm = comm self.rank = comm.my_rank self.world_size = comm.nranks + self.nranks_per_node = comm.nranks_per_node + self.mnnvl_domain = self.world_size > 1 and os.environ.get("MSCCLPP_MNNVL_NRANKS_PER_NODE") == str( + self.world_size + ) + self.multi_node = self.world_size > self.nranks_per_node and not self.mnnvl_domain + self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > 1 self.symmetric_memory = symmetric_memory self._nvls = mscclpp.is_nvls_supported() @@ -106,6 +165,10 @@ def _default_ar_config(self): pkt = self._algo("allreduce", "default_allreduce_nvls_packet") if self._nvls and pkt: return (pkt, 0, 0) + if self.multi_node or self.multi_host_mnnvl: + rsag = self._algo("allreduce", "default_allreduce_rsag") + if rsag: + return (rsag, 0, 0) return (self._algo("allreduce", "default_allreduce_packet"), 0, 0) # -- low-level execute -- @@ -166,23 +229,48 @@ def _ensure_tune_bufs(self): def _ar_candidates(self, size: int): out = [] - if size <= 4 << 20: + if self.multi_host_mnnvl: + if size <= 4 << 20: + a = self._algo("allreduce", "default_allreduce_packet") + if a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_nvls_packet") + if self._nvls and a: 
+ out.append(a) + if size >= 512 << 10: + a = self._algo("allreduce", "default_allreduce_rsag") + if a: + out.append(a) + return out + if self.multi_node: a = self._algo("allreduce", "default_allreduce_nvls_packet") if self._nvls and a: out.append(a) a = self._algo("allreduce", "default_allreduce_packet") + if a: + out.append(a) + if size >= 512 << 10: + a = self._algo("allreduce", "default_allreduce_rsag") + if a: + out.append(a) + return out + if size <= 4 << 20: + a = self._algo("allreduce", "default_allreduce_packet") if a: out.append(a) a = self._algo("allreduce", "default_allreduce_allpair_packet") if a: out.append(a) - if size >= 512 << 10: - a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") - if self._nvls and self.symmetric_memory and a: + a = self._algo("allreduce", "default_allreduce_nvls_packet") + if self._nvls and a: out.append(a) + if size >= 512 << 10: a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") if a: out.append(a) + a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") + if self._nvls and self.symmetric_memory and a: + out.append(a) if torch.version.hip is not None: a = self._algo("allreduce", "default_allreduce_fullmesh") if a: @@ -190,6 +278,8 @@ def _ar_candidates(self, size: int): return out def _ag_candidates(self): + if self.multi_node or self.multi_host_mnnvl: + return [] a = self._algo("allgather", "default_allgather_fullmesh2") return [a] if a else [] @@ -314,6 +404,8 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, acc ) def all_gather(self, output_tensor, input_tensor, stream=None): + if self.multi_node or self.multi_host_mnnvl: + raise RuntimeError("all_gather in this example currently supports only single-node runs") sz = _round_pow2(input_tensor.nbytes) if sz not in self._tune_cache["allgather"]: self._tune_size("allgather", sz) @@ -332,7 +424,11 @@ def destroy(self): # -- Benchmarks (standalone) -------------------------------------------------- -def 
_bench_sizes(low=5 * 1024, high=80 << 20): +def _bench_sizes(low=None, high=None): + if low is None: + low = _get_env_int("MSCCLPP_BENCH_LOW_SIZE", default=5 * 1024) + if high is None: + high = _get_env_int("MSCCLPP_BENCH_HIGH_SIZE", default=80 << 20) sizes, c = [], low while c <= high: sizes.append(c) @@ -433,13 +529,21 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, def init_dist() -> mscclpp.CommGroup: addr = os.environ.get("MSCCLPP_MASTER_ADDR") - if addr: - rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"]) - port = os.environ["MSCCLPP_MASTER_PORT"] - iface = _interfaces_for_ip(addr) + rank = _get_env_int("RANK", "OMPI_COMM_WORLD_RANK", "PMI_RANK", "SLURM_PROCID") + world = _get_env_int("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS") + if addr and rank is not None and world is not None: + port = os.environ.get("MSCCLPP_MASTER_PORT", "29500") + iface = _resolve_interface(addr) if not iface: raise ValueError(f"No interface for {addr}") return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world) + if _running_under_mpi(): + try: + from mpi4py import MPI + except ModuleNotFoundError as exc: + raise RuntimeError("mpi4py is required to launch this example with mpirun") from exc + + return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD) import torch.distributed as dist dist.init_process_group(backend="gloo") @@ -447,7 +551,7 @@ def init_dist() -> mscclpp.CommGroup: def main(): - local = int(os.environ["LOCAL_RANK"]) + local = _get_env_int("LOCAL_RANK", "OMPI_COMM_WORLD_LOCAL_RANK", "MPI_LOCALRANKID", "SLURM_LOCALID", default=0) torch.cuda.set_device(local) dtype_str = os.environ.get("DTYPE", "float16") @@ -455,22 +559,48 @@ def main(): accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16} accum_str = os.environ.get("ACCUM_DTYPE") accum_dtype = accum_map.get(accum_str) if accum_str else None + n_warmup = 
_get_env_int("MSCCLPP_BENCH_WARMUP", default=10) + n_graph_launches = _get_env_int("MSCCLPP_BENCH_GRAPH_LAUNCHES", default=10) + n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100) comm_group = init_dist() cc = CustomizedComm(comm_group) print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...") - benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype) + benchmark_allreduce( + cc, + dtype=dtype, + accum_dtype=accum_dtype, + n_warmup=n_warmup, + n_graph_launches=n_graph_launches, + n_iter=n_iter, + ) cc.barrier() torch.cuda.synchronize() - benchmark_allgather(cc, dtype=dtype) - cc.barrier() - torch.cuda.synchronize() + if cc.multi_node or cc.multi_host_mnnvl: + if cc.rank == 0: + print("Skipping allgather benchmark on multi-node: this example's allgather path is single-node only.") + else: + benchmark_allgather(cc, dtype=dtype, n_warmup=n_warmup, n_graph_launches=n_graph_launches, n_iter=n_iter) + cc.barrier() + torch.cuda.synchronize() cc.destroy() + del cc + del comm_group + gc.collect() print(f"rank {local} completed successfully.") if __name__ == "__main__": - main() + exit_code = 0 + try: + main() + except Exception: + exit_code = 1 + traceback.print_exc() + finally: + sys.stdout.flush() + sys.stderr.flush() + os._exit(exit_code) diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index a6dd306b6..09d364c3b 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -119,6 +119,12 @@ class Env { /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified). const int ibGidIndex; + /// Env name: `MSCCLPP_MNNVL_NRANKS_PER_NODE`. Overrides the NVLink-domain size reported by the bootstrap. + /// This is intended for Multi-Node NVLink (MNNVL) deployments where a single CUDA IPC / NVLS domain spans + /// multiple hosts and should be treated as one collective peer group. + /// If unset or non-positive, the bootstrap falls back to physical-host-based detection. 
+ const int mnnvlNranksPerNode; + private: Env(); diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc index b3032e502..c84ef4c0f 100644 --- a/src/core/bootstrap/bootstrap.cc +++ b/src/core/bootstrap/bootstrap.cc @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -433,6 +434,10 @@ void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) { int TcpBootstrap::Impl::getNranksPerNode() { if (nRanksPerNode_ > 0) return nRanksPerNode_; + if (env()->mnnvlNranksPerNode > 0) { + nRanksPerNode_ = env()->mnnvlNranksPerNode; + return nRanksPerNode_; + } int nRanksPerNode = 0; bool useIpv4 = peerCommAddresses_[rank_].sa.sa_family == AF_INET; for (int i = 0; i < nRanks_; i++) { diff --git a/src/core/env.cpp b/src/core/env.cpp index 7a42471bf..b46670d79 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -67,7 +67,8 @@ Env::Env() ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)), forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)), forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)), - ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)) {} + ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)), + mnnvlNranksPerNode(readEnv("MSCCLPP_MNNVL_NRANKS_PER_NODE", 0)) {} std::shared_ptr env() { static std::shared_ptr globalEnv = std::shared_ptr(new Env()); @@ -97,6 +98,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr); logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex); + logEnv("MSCCLPP_MNNVL_NRANKS_PER_NODE", globalEnv->mnnvlNranksPerNode); } return globalEnv; } diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 17bcfc338..9516ad786 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -17,9 +17,6 @@ __global__ void 
allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, uint32_t flagSize) { - // This version of allreduce only works for single nodes - if (worldSize != nRanksPerNode) return; - if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); const int nPeers = nRanksPerNode - 1; @@ -143,7 +140,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context @@ -189,4 +186,4 @@ std::shared_ptr AllreduceAllpairPacket::build() { }); } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index a616485e1..21f710283 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -94,7 +94,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm); // setup channels ctx->switchChannels = this->switchChannels_; @@ -154,4 +154,4 @@ std::shared_ptr AllreduceNvlsPacket::build() { }); } } // namespace collective -} // 
namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 735deb0a1..25077004b 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -183,7 +183,7 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm); size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index d39da408e..c195aefa3 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -23,9 +23,6 @@ __global__ void __launch_bounds__(1024, 1) #else ) { #endif - // This version of allreduce only works for single nodes - if (worldSize != nRanksPerNode) return; - #if defined(ENABLE_NPKIT) extern __shared__ int4 NpkitSharedMem[]; NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem); @@ -267,7 +264,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptrrank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context @@ -313,4 +310,4 @@ std::shared_ptr AllreducePacket::build() { } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git 
a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index db471b932..f964b87e9 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -199,7 +199,7 @@ std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index 42d86fc89..c4dea321c 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -183,7 +183,7 @@ std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); ctx->memorySemaphores = this->semaphores_; diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index 016c4a5cc..4d46c53bc 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -69,6 +69,25 @@ std::vector> setupMemoryS return memorySemaphores; } +int getCollectiveDomainNranksPerNode(std::shared_ptr comm, + const std::vector& connections) { + const int worldSize = comm->bootstrap()->getNranks(); + const int nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + if (worldSize <= nRanksPerNode) { + return nRanksPerNode; + } + const bool allPeersUseCudaIpc = + 
std::all_of(connections.begin(), connections.end(), + [](const auto& connection) { return connection.transport() == mscclpp::Transport::CudaIpc; }); + return allPeersUseCudaIpc ? worldSize : nRanksPerNode; +} + +int getCollectiveDomainNranksPerNode(std::shared_ptr comm) { + const int worldSize = comm->bootstrap()->getNranks(); + const int nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + return worldSize > nRanksPerNode ? worldSize : nRanksPerNode; +} + std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels) { std::vector> memoryChannelDeviceHandles; @@ -153,4 +172,4 @@ std::shared_ptr> setupBaseMemo } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 638214dd5..38362a659 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -50,6 +50,8 @@ std::vector setupMemoryChannels( std::vector setupConnections(std::shared_ptr comm); std::vector> setupMemorySemaphores( std::shared_ptr comm, const std::vector& connections, int nChannelsPerConnection); +int getCollectiveDomainNranksPerNode(std::shared_ptr comm, const std::vector& connections); +int getCollectiveDomainNranksPerNode(std::shared_ptr comm); std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels); @@ -96,4 +98,4 @@ class AlgorithmCtx { } // namespace collective } // namespace mscclpp -#endif // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_ \ No newline at end of file +#endif // MSCCLPP_EXT_COLLECTIVE_UTILS_HPP_ From dded5e0e3933573080acfad30b30681e9a4b19b7 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 28 Apr 2026 06:41:17 +0000 Subject: [PATCH 03/44] Improve MNNVL allreduce tuning performance Add the allpair packet algorithm to the MNNVL small-message candidate set and enable zero-copy NVLS/RSAG candidates for larger 
symmetric-memory allreduce benchmarks. Run the standalone tuning example with symmetric memory so RawGpuBuffer-backed tensors can use the zero-copy paths. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 035c1dbbb..75a2c9608 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -126,9 +126,11 @@ class CustomizedComm: _CANDIDATE_NTHREADS = [512, 768, 1024] _NBLOCKS_LIMIT = { "default_allreduce_nvls_packet": 16, + "default_allreduce_nvls_zero_copy": 32, "default_allreduce_packet": 56, "default_allreduce_allpair_packet": 56, "default_allreduce_rsag": 64, + "default_allreduce_rsag_zero_copy": 64, "default_allreduce_fullmesh": 64, "default_allgather_fullmesh2": 32, } @@ -232,12 +234,21 @@ def _ar_candidates(self, size: int): if self.multi_host_mnnvl: if size <= 4 << 20: a = self._algo("allreduce", "default_allreduce_packet") + if a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_allpair_packet") if a: out.append(a) a = self._algo("allreduce", "default_allreduce_nvls_packet") if self._nvls and a: out.append(a) if size >= 512 << 10: + a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") + if self.symmetric_memory and a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") + if self._nvls and self.symmetric_memory and a: + out.append(a) a = self._algo("allreduce", "default_allreduce_rsag") if a: out.append(a) @@ -564,7 +575,7 @@ def main(): n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100) comm_group = init_dist() - cc = CustomizedComm(comm_group) + cc = CustomizedComm(comm_group, symmetric_memory=True) print(f"rank {local} starting benchmarks with dtype={dtype} 
accum_dtype={accum_dtype}...") benchmark_allreduce( From 865c2bc795d5cf3f4e45c5c480eec41b36d5b96e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 28 Apr 2026 07:55:52 +0000 Subject: [PATCH 04/44] Optimize MNNVL allreduce without symmetric memory Run the tuning example with symmetric memory disabled, make allreduce tuning use the same symmetric-memory mode as execution, and narrow the MNNVL small-message candidate set to avoid slower packet/NVLS choices. Increase packet and RSAG channel parallelism so non-symmetric CUDA-IPC paths can use 112-block packet and 128-block RSAG configs. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 37 +++++++++++-------- .../collectives/allreduce/allreduce_rsag.cu | 8 +++- .../include/allreduce/allreduce_packet.hpp | 4 +- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 75a2c9608..4190d562e 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -122,14 +122,14 @@ class CustomizedComm: _TUNE_N_WARMUP = 5 _TUNE_N_GRAPH_LAUNCHES = 10 _TUNE_N_OPS_PER_GRAPH = 100 - _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128] + _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 112, 128] _CANDIDATE_NTHREADS = [512, 768, 1024] _NBLOCKS_LIMIT = { "default_allreduce_nvls_packet": 16, "default_allreduce_nvls_zero_copy": 32, - "default_allreduce_packet": 56, + "default_allreduce_packet": 112, "default_allreduce_allpair_packet": 56, - "default_allreduce_rsag": 64, + "default_allreduce_rsag": 128, "default_allreduce_rsag_zero_copy": 64, "default_allreduce_fullmesh": 64, "default_allgather_fullmesh2": 32, @@ -162,6 +162,11 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): def _algo(self, collective: str, name: str): return 
self._algos.get((collective, name)) + def _nblocks_limit(self, algo_name: str, size: int) -> int: + if algo_name == "default_allreduce_packet" and size < (1 << 20): + return 56 + return self._NBLOCKS_LIMIT.get(algo_name, 128) + def _default_ar_config(self): """Fallback allreduce config for barrier / timing sync.""" pkt = self._algo("allreduce", "default_allreduce_nvls_packet") @@ -218,7 +223,7 @@ def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None): def _barrier_internal(self): a, nb, nt = self._default_ar_config() - self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True) + self._exec_ar(self._barrier_tensor, a, nb, nt, sym=self.symmetric_memory) # -- lazy tuning -- @@ -233,15 +238,17 @@ def _ar_candidates(self, size: int): out = [] if self.multi_host_mnnvl: if size <= 4 << 20: - a = self._algo("allreduce", "default_allreduce_packet") - if a: - out.append(a) a = self._algo("allreduce", "default_allreduce_allpair_packet") if a: out.append(a) - a = self._algo("allreduce", "default_allreduce_nvls_packet") - if self._nvls and a: - out.append(a) + if size <= 64 << 10: + a = self._algo("allreduce", "default_allreduce_nvls_packet") + if self._nvls and a: + out.append(a) + if size > 128 << 10: + a = self._algo("allreduce", "default_allreduce_packet") + if a: + out.append(a) if size >= 512 << 10: a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") if self.symmetric_memory and a: @@ -308,7 +315,7 @@ def _run_tune(self, collective, algo, buf, size, nb, nt): stream=torch.cuda.current_stream().cuda_stream, nblocks=nb, nthreads_per_block=nt, - symmetric_memory=True, + symmetric_memory=self.symmetric_memory, ) else: total = size * self.world_size @@ -337,7 +344,7 @@ def _tune_size(self, collective: str, target_size: int): run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt) for algo in cands: - nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128) + nb_limit = self._nblocks_limit(algo.name, target_size) for nb in 
self._CANDIDATE_NBLOCKS: if nb > nb_limit: continue @@ -346,7 +353,7 @@ def _tune_size(self, collective: str, target_size: int): ret = run(algo, nb, nt) torch.cuda.synchronize() self._time_buf[0] = float(ret) - self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True) + self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=self.symmetric_memory) if self._time_buf[0].item() != 0: continue used.add(algo) @@ -375,7 +382,7 @@ def _tune_size(self, collective: str, target_size: int): # Cross-rank timing sync self._time_buf.fill_(elapsed) torch.cuda.current_stream().wait_stream(cs) - self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True) + self._exec_ar(self._time_buf, *self._default_ar_config(), sym=self.symmetric_memory) avg = self._time_buf[self.rank].item() / self.world_size if avg < best_time: @@ -575,7 +582,7 @@ def main(): n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100) comm_group = init_dist() - cc = CustomizedComm(comm_group, symmetric_memory=True) + cc = CustomizedComm(comm_group, symmetric_memory=False) print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...") benchmark_allreduce( diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index f964b87e9..7f9e6bfd6 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -133,7 +133,7 @@ struct AllreduceRsAgAdapter { size_t nelems = inputSize / sizeof(T); if (nBlocks == 0 || nThreadsPerBlock == 0) { nThreadsPerBlock = 1024; - nBlocks = 64; + nBlocks = 128; } allreduceRsAg<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, @@ -144,7 +144,7 @@ struct AllreduceRsAgAdapter { void AllreduceRsAg::initialize(std::shared_ptr comm) { this->conns_ = setupConnections(comm); - nChannelsPerConnection_ = 64; + nChannelsPerConnection_ = 128; comm_ = comm; // setup semaphores 
this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); @@ -179,6 +179,10 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, c return CommResult::CommInvalidArgument; } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + if (numBlocksAndThreads.first > nChannelsPerConnection_) { + WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_); + return CommResult::CommInvalidArgument; + } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index de7ca4719..771126c96 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -29,7 +29,7 @@ class AllreducePacket : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; - const int maxBlockNum_ = 56; + const int maxBlockNum_ = 112; std::vector conns_; uintptr_t flagBuffer_; size_t flagBufferSize_; @@ -37,4 +37,4 @@ class AllreducePacket : public AlgorithmBuilder { std::vector registeredMemories_; }; } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp From 3bc00cb7f0ab309b7a274db29de839730116098c Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 28 Apr 2026 08:24:49 +0000 Subject: [PATCH 05/44] Enable NVLS zero-copy without symmetric memory flag Allow default_allreduce_nvls_zero_copy to run when the public symmetric_memory flag is false; the algorithm already binds the concrete input and output allocations in its context. 
Include that fast path in MNNVL tuning and bound allpair/NVLS packet candidates to small sizes so large-message no-symmetric tuning avoids slow or unsafe packet variants. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../torch-integration/customized_comm_with_tuning.py | 9 +++++---- .../collectives/allreduce/allreduce_nvls_zero_copy.cu | 7 +------ .../include/allreduce/allreduce_nvls_zero_copy.hpp | 3 +-- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 4190d562e..0736cb68e 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -238,9 +238,10 @@ def _ar_candidates(self, size: int): out = [] if self.multi_host_mnnvl: if size <= 4 << 20: - a = self._algo("allreduce", "default_allreduce_allpair_packet") - if a: - out.append(a) + if size <= 128 << 10: + a = self._algo("allreduce", "default_allreduce_allpair_packet") + if a: + out.append(a) if size <= 64 << 10: a = self._algo("allreduce", "default_allreduce_nvls_packet") if self._nvls and a: @@ -254,7 +255,7 @@ def _ar_candidates(self, size: int): if self.symmetric_memory and a: out.append(a) a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") - if self._nvls and self.symmetric_memory and a: + if self._nvls and a: out.append(a) a = self._algo("allreduce", "default_allreduce_rsag") if a: diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 25077004b..8c360f962 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -122,10 +122,6 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo cudaStream_t stream, int nBlocks, int nThreadsPerBlock, [[maybe_unused]] const std::unordered_map& 
extras, mscclpp::DataType accumDtype) { - if (!symmetricMemory_) { - WARN("AllreduceNvls requires symmetric memory for now."); - return CommResult::CommInvalidArgument; - } auto ctx = std::static_pointer_cast(ctx_void); AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { @@ -169,8 +165,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t, - mscclpp::DataType, bool symmetricMemory) { - symmetricMemory_ = symmetricMemory; + mscclpp::DataType, bool) { size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index d53ea180b..396152800 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -15,7 +15,6 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr build() override; private: - bool symmetricMemory_ = false; void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, @@ -41,4 +40,4 @@ class AllreduceNvls : public AlgorithmBuilder { } // namespace collective } // namespace mscclpp -#endif // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ \ No newline at end of file +#endif // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ From 533f329971e003e2ca67803c19959d13bf7140ea Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 28 Apr 2026 16:23:23 +0000 Subject: [PATCH 06/44] Tune no-sym MNNVL with RSAG zero-copy Disable NVLS zero-copy when symmetric memory is not enabled, and allow the RSAG zero-copy path to 
participate in MNNVL tuning for non-symmetric memory. Cache RSAG zero-copy contexts by the concrete buffer pointers so CUDA graph capture does not create a new registration for every execute call, and cap requested blocks at the channel count. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../torch-integration/customized_comm_with_tuning.py | 6 +++--- .../collectives/allreduce/allreduce_nvls_zero_copy.cu | 7 ++++++- .../collectives/allreduce/allreduce_rsag_zero_copy.cu | 9 ++++++--- .../include/allreduce/allreduce_nvls_zero_copy.hpp | 1 + 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 0736cb68e..6f8f097d0 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -130,7 +130,7 @@ class CustomizedComm: "default_allreduce_packet": 112, "default_allreduce_allpair_packet": 56, "default_allreduce_rsag": 128, - "default_allreduce_rsag_zero_copy": 64, + "default_allreduce_rsag_zero_copy": 128, "default_allreduce_fullmesh": 64, "default_allgather_fullmesh2": 32, } @@ -252,10 +252,10 @@ def _ar_candidates(self, size: int): out.append(a) if size >= 512 << 10: a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") - if self.symmetric_memory and a: + if a: out.append(a) a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") - if self._nvls and a: + if self._nvls and self.symmetric_memory and a: out.append(a) a = self._algo("allreduce", "default_allreduce_rsag") if a: diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 8c360f962..25077004b 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -122,6 +122,10 @@ CommResult 
AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo cudaStream_t stream, int nBlocks, int nThreadsPerBlock, [[maybe_unused]] const std::unordered_map& extras, mscclpp::DataType accumDtype) { + if (!symmetricMemory_) { + WARN("AllreduceNvls requires symmetric memory for now."); + return CommResult::CommInvalidArgument; + } auto ctx = std::static_pointer_cast(ctx_void); AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { @@ -165,7 +169,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t, - mscclpp::DataType, bool) { + mscclpp::DataType, bool symmetricMemory) { + symmetricMemory_ = symmetricMemory; size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index c4dea321c..a11da0f89 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -153,6 +153,10 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + if (numBlocksAndThreads.first > nChannelsPerConnection_) { + WARN(ALGO, "Block number ", numBlocksAndThreads.first, " exceeds the maximum limit ", nChannelsPerConnection_); + return CommResult::CommInvalidArgument; + } cudaError_t error = allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, @@ -165,9 +169,8 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_ptr comm, const void* 
input, diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index 396152800..c40bd2cda 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -15,6 +15,7 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr build() override; private: + bool symmetricMemory_ = false; void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, From 45a651b2c81ec61bc846d823db1691772c33d280 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 1 May 2026 18:27:17 +0000 Subject: [PATCH 07/44] Decouple IPC-domain hint from bootstrap nRanksPerNode Replace MSCCLPP_MNNVL_NRANKS_PER_NODE (which overrode TcpBootstrap and silently changed getNranksPerNode() for every consumer) with a single algorithm-level helper getIpcDomainNranks(comm) backed by a new MSCCLPP_IPC_DOMAIN_NRANKS env. The neutral IPC name covers both NVLink/ MNNVL on NV and XGMI on AMD. Bootstrap is unchanged and continues to report physical-host detection. Collapse the two getCollectiveDomainNranksPerNode overloads into one canonical helper and route all six allreduce algos (packet, allpair_packet, nvls_packet, nvls_zero_copy, rsag, rsag_zero_copy) through it. Update the standalone tuning example to use the new env name; drop the undeclared MSCCLPP_ENABLE_MNNVL gate; fix multi_host_mnnvl detection now that nranks_per_node is no longer overridden by the bootstrap. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 17 +++++--------- include/mscclpp/env.hpp | 10 ++++----- src/core/bootstrap/bootstrap.cc | 5 ----- src/core/env.cpp | 4 ++-- .../allreduce/allreduce_allpair_packet.cu | 2 +- .../allreduce/allreduce_nvls_packet.cu | 2 +- .../allreduce/allreduce_nvls_zero_copy.cu | 2 +- .../collectives/allreduce/allreduce_packet.cu | 2 +- .../collectives/allreduce/allreduce_rsag.cu | 2 +- .../allreduce/allreduce_rsag_zero_copy.cu | 4 ++-- src/ext/collectives/collective_utils.cc | 22 +++++-------------- .../collectives/include/collective_utils.hpp | 9 ++++++-- 12 files changed, 33 insertions(+), 48 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 6f8f097d0..1d54cfa77 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -13,6 +13,7 @@ import sys import traceback + def _get_bootstrap_world_size(): for name in ("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS"): value = os.environ.get(name) @@ -22,13 +23,8 @@ def _get_bootstrap_world_size(): _bootstrap_world_size = _get_bootstrap_world_size() -if ( - _bootstrap_world_size - and _bootstrap_world_size > 1 - and "MSCCLPP_MNNVL_NRANKS_PER_NODE" not in os.environ - and os.environ.get("MSCCLPP_ENABLE_MNNVL", "1") != "0" -): - os.environ["MSCCLPP_MNNVL_NRANKS_PER_NODE"] = str(_bootstrap_world_size) +if _bootstrap_world_size and _bootstrap_world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ: + os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_bootstrap_world_size) import torch import mscclpp @@ -140,11 +136,10 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self.rank = comm.my_rank self.world_size = comm.nranks self.nranks_per_node = comm.nranks_per_node - self.mnnvl_domain = self.world_size > 1 and 
os.environ.get("MSCCLPP_MNNVL_NRANKS_PER_NODE") == str( - self.world_size - ) + nvlink_domain_nranks = int(os.environ.get("MSCCLPP_IPC_DOMAIN_NRANKS", "0")) + self.mnnvl_domain = self.world_size > 1 and nvlink_domain_nranks >= self.world_size self.multi_node = self.world_size > self.nranks_per_node and not self.mnnvl_domain - self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > 1 + self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > self.nranks_per_node self.symmetric_memory = symmetric_memory self._nvls = mscclpp.is_nvls_supported() diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 09d364c3b..0dd63ed74 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -119,11 +119,11 @@ class Env { /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified). const int ibGidIndex; - /// Env name: `MSCCLPP_MNNVL_NRANKS_PER_NODE`. Overrides the NVLink-domain size reported by the bootstrap. - /// This is intended for Multi-Node NVLink (MNNVL) deployments where a single CUDA IPC / NVLS domain spans - /// multiple hosts and should be treated as one collective peer group. - /// If unset or non-positive, the bootstrap falls back to physical-host-based detection. - const int mnnvlNranksPerNode; + /// Env name: `MSCCLPP_IPC_DOMAIN_NRANKS`. Number of ranks that share a single GPU-IPC-reachable peer + /// group (e.g. a Multi-Node NVLink fabric such as GB200 NVL72, or an AMD XGMI domain). This hint is + /// consumed only by the collective algorithms; it does not affect `Bootstrap::getNranksPerNode()` or + /// any other layer. If unset or non-positive, algorithms fall back to `bootstrap->getNranksPerNode()`. 
+ const int ipcDomainNranks; private: Env(); diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc index c84ef4c0f..b3032e502 100644 --- a/src/core/bootstrap/bootstrap.cc +++ b/src/core/bootstrap/bootstrap.cc @@ -5,7 +5,6 @@ #include #include -#include #include #include #include @@ -434,10 +433,6 @@ void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) { int TcpBootstrap::Impl::getNranksPerNode() { if (nRanksPerNode_ > 0) return nRanksPerNode_; - if (env()->mnnvlNranksPerNode > 0) { - nRanksPerNode_ = env()->mnnvlNranksPerNode; - return nRanksPerNode_; - } int nRanksPerNode = 0; bool useIpv4 = peerCommAddresses_[rank_].sa.sa_family == AF_INET; for (int i = 0; i < nRanks_; i++) { diff --git a/src/core/env.cpp b/src/core/env.cpp index b46670d79..18d548b02 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -68,7 +68,7 @@ Env::Env() forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)), forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)), ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)), - mnnvlNranksPerNode(readEnv("MSCCLPP_MNNVL_NRANKS_PER_NODE", 0)) {} + ipcDomainNranks(readEnv("MSCCLPP_IPC_DOMAIN_NRANKS", 0)) {} std::shared_ptr env() { static std::shared_ptr globalEnv = std::shared_ptr(new Env()); @@ -98,7 +98,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr); logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex); - logEnv("MSCCLPP_MNNVL_NRANKS_PER_NODE", globalEnv->mnnvlNranksPerNode); + logEnv("MSCCLPP_IPC_DOMAIN_NRANKS", globalEnv->ipcDomainNranks); } return globalEnv; } diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 9516ad786..690d0eb4e 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -140,7 +140,7 @@ std::shared_ptr 
AllreduceAllpairPacket::initAllreduceContext(std::shared_p const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); + ctx->nRanksPerNode = getIpcDomainNranks(comm); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index 21f710283..d331cc672 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -94,7 +94,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm); + ctx->nRanksPerNode = getIpcDomainNranks(comm); // setup channels ctx->switchChannels = this->switchChannels_; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 25077004b..36fcf860b 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -183,7 +183,7 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm); + ctx->nRanksPerNode = getIpcDomainNranks(comm); size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index c195aefa3..d631c35a8 100644 --- 
a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -264,7 +264,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptrrank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); + ctx->nRanksPerNode = getIpcDomainNranks(comm); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index 7f9e6bfd6..4c46bf9b3 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -203,7 +203,7 @@ std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); + ctx->nRanksPerNode = getIpcDomainNranks(comm); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index a11da0f89..67eed6d31 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -169,7 +169,7 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getCollectiveDomainNranksPerNode(comm, this->conns_); + ctx->nRanksPerNode = getIpcDomainNranks(comm); ctx->memorySemaphores = this->semaphores_; 
diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index 4d46c53bc..de33009c6 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -69,23 +70,12 @@ std::vector> setupMemoryS return memorySemaphores; } -int getCollectiveDomainNranksPerNode(std::shared_ptr comm, - const std::vector& connections) { - const int worldSize = comm->bootstrap()->getNranks(); - const int nRanksPerNode = comm->bootstrap()->getNranksPerNode(); - if (worldSize <= nRanksPerNode) { - return nRanksPerNode; +int getIpcDomainNranks(std::shared_ptr comm) { + const int envValue = mscclpp::env()->ipcDomainNranks; + if (envValue > 0) { + return envValue; } - const bool allPeersUseCudaIpc = - std::all_of(connections.begin(), connections.end(), - [](const auto& connection) { return connection.transport() == mscclpp::Transport::CudaIpc; }); - return allPeersUseCudaIpc ? worldSize : nRanksPerNode; -} - -int getCollectiveDomainNranksPerNode(std::shared_ptr comm) { - const int worldSize = comm->bootstrap()->getNranks(); - const int nRanksPerNode = comm->bootstrap()->getNranksPerNode(); - return worldSize > nRanksPerNode ? 
worldSize : nRanksPerNode; + return comm->bootstrap()->getNranksPerNode(); } std::shared_ptr> setupMemoryChannelDeviceHandles( diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 38362a659..44a214020 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -50,8 +50,13 @@ std::vector setupMemoryChannels( std::vector setupConnections(std::shared_ptr comm); std::vector> setupMemorySemaphores( std::shared_ptr comm, const std::vector& connections, int nChannelsPerConnection); -int getCollectiveDomainNranksPerNode(std::shared_ptr comm, const std::vector& connections); -int getCollectiveDomainNranksPerNode(std::shared_ptr comm); + +/// Number of ranks that participate in the same GPU-IPC-reachable peer group (e.g. a single host or +/// a Multi-Node NVLink fabric, or an AMD XGMI domain). Returns the value of `MSCCLPP_IPC_DOMAIN_NRANKS` +/// if set to a positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. This is +/// intentionally independent of `nRanksPerNode` so that algorithms can opt in to MNNVL-like behavior +/// without changing the meaning of bootstrap-level APIs. +int getIpcDomainNranks(std::shared_ptr comm); std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels); From 2a2fca8a587a658888fe5a21f5b42cd07bf6cec2 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 1 May 2026 19:06:07 +0000 Subject: [PATCH 08/44] Rename collective ctx/kernel param nRanksPerNode to ipcDomainNranks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AlgorithmCtx field and the kernel/host parameters that hold the collective's IPC peer-group size were named nRanksPerNode, which is misleading on Multi-Node NVLink (where the value spans multiple hosts) and on AMD (where the relevant fabric is XGMI, not NVLink). 
Rename to ipcDomainNranks throughout the collective algorithms to match the neutral naming introduced for the env helper. Scope intentionally limited to src/ext/collectives/. The following are left untouched on purpose: - Bootstrap::getNranksPerNode() — physical-host detection, semantics unchanged. - Algorithm::Constraint::nRanksPerNode (public API in include/mscclpp/algorithm.hpp) and the DSL plan config in algorithm_collection_builder.cc — these describe a plan's required physical topology. - NCCL adapter (src/ext/nccl/) — preserves NCCL ABI compatibility. - MAX_NRANKS_PER_NODE — sizing constant for shared-memory arrays. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../allgather/allgather_fullmesh.cu | 10 +++---- .../allgather/allgather_fullmesh_2.cu | 10 +++---- .../allreduce/allreduce_allpair_packet.cu | 14 ++++----- .../allreduce/allreduce_fullmesh.cu | 12 ++++---- .../allreduce_nvls_block_pipeline.cu | 30 +++++++++---------- .../allreduce/allreduce_nvls_packet.cu | 4 +-- .../allreduce/allreduce_nvls_warp_pipeline.cu | 28 ++++++++--------- .../allreduce/allreduce_nvls_zero_copy.cu | 20 ++++++------- .../collectives/allreduce/allreduce_packet.cu | 20 ++++++------- .../collectives/allreduce/allreduce_rsag.cu | 24 +++++++-------- .../allreduce/allreduce_rsag_pipeline.cu | 22 +++++++------- .../allreduce/allreduce_rsag_zero_copy.cu | 26 ++++++++-------- .../collectives/include/collective_utils.hpp | 10 +++---- 13 files changed, 115 insertions(+), 115 deletions(-) diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 17054869e..cbe199bcb 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -11,8 +11,8 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) allgatherFullmesh(void* buff, void* scratch, void* resultBuff, DeviceHandle* memoryChannels, - int rank, int 
nRanksPerNode, [[maybe_unused]] int worldSize, size_t nelems) { - const int nPeer = nRanksPerNode - 1; + int rank, int ipcDomainNranks, [[maybe_unused]] int worldSize, size_t nelems) { + const int nPeer = ipcDomainNranks - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by 16 const size_t nInt4 = nelems * sizeof(int) / sizeof(int4); @@ -129,11 +129,11 @@ CommResult AllgatherFullmesh::allgatherKernelFunc(const std::shared_ptr ct if ((char*)input == (char*)output + rank * inputSize) { allgatherFullmesh<<>>( (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank, - ctx->nRanksPerNode, ctx->workSize, nElem); + ctx->ipcDomainNranks, ctx->workSize, nElem); } else { allgatherFullmesh<<>>( (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank, - ctx->nRanksPerNode, ctx->workSize, nElem); + ctx->ipcDomainNranks, ctx->workSize, nElem); } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -150,7 +150,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); // setup semaphores ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection); diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 9d169d689..6e69f81ca 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -12,7 +12,7 @@ __device__ DeviceSyncer deviceSyncer; template __global__ void __launch_bounds__(1024, 1) allgatherFullmesh2(void* sendbuff, mscclpp::DeviceHandle* memoryChannels, - size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t 
nRanksPerNode, + size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t ipcDomainNranks, size_t nelemsPerGPU) { const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; const size_t lid = tid % WARP_SIZE; @@ -20,7 +20,7 @@ __global__ void __launch_bounds__(1024, 1) const size_t nThread = blockDim.x * gridDim.x; const size_t nWarp = nThread / WARP_SIZE; - const size_t nPeer = nRanksPerNode - 1; + const size_t nPeer = ipcDomainNranks - 1; const size_t chanOffset = nPeer * blockIdx.x; auto memChans = memoryChannels + chanOffset; @@ -140,11 +140,11 @@ CommResult AllgatherFullmesh2::allgatherKernelFunc(const std::shared_ptr c if ((char*)input == (char*)output + rank * inputSize) { allgatherFullmesh2<<>>( (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize, - ctx->nRanksPerNode, nElem); + ctx->ipcDomainNranks, nElem); } else { allgatherFullmesh2<<>>( (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize, - ctx->nRanksPerNode, nElem); + ctx->ipcDomainNranks, nElem); } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -159,7 +159,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); // setup semaphores ctx->memorySemaphores = this->memorySemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 690d0eb4e..5be2f3360 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -14,11 +14,11 @@ namespace collective { template __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - size_t 
channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, + size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, uint32_t flagSize) { if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); - const int nPeers = nRanksPerNode - 1; + const int nPeers = ipcDomainNranks - 1; uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0; @@ -72,7 +72,7 @@ template struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; @@ -84,7 +84,7 @@ struct AllpairAdapter { } allreduceAllPairs<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize); + ipcDomainNranks, worldSize, nelems, numScratchBuff, flags, flagSize); return cudaGetLastError(); } }; @@ -108,7 +108,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptrworkSize); } // nBlocks must be at least nPeers for allpair — each block maps to one peer. 
- const int nPeers = algoCtx->nRanksPerNode - 1; + const int nPeers = algoCtx->ipcDomainNranks - 1; if (nPeers > 0 && blockAndThreadNum.first < nPeers) { return CommResult::CommInvalidArgument; } @@ -124,7 +124,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr, - nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode, + nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { @@ -140,7 +140,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index 9d144c621..b95dcb284 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -13,8 +13,8 @@ template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, int rank, - int nRanksPerNode, int worldSize, size_t nelems) { - const int nPeer = nRanksPerNode - 1; + int ipcDomainNranks, int worldSize, size_t nelems) { + const int nPeer = ipcDomainNranks - 1; const size_t chanOffset = 
nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); @@ -159,7 +159,7 @@ template struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, - size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize, + size_t channelOutDataOffset, size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; @@ -168,7 +168,7 @@ struct AllreduceAllconnectAdapter { if (nThreadsPerBlock == 0) nThreadsPerBlock = 512; allreduceFullmesh<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels, - channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems); + channelOutDataOffset, rank, ipcDomainNranks, worldSize, nelems); return cudaGetLastError(); } }; @@ -225,7 +225,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc( } cudaError_t error = allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(), - nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, + nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error)); @@ -252,7 +252,7 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); // setup semaphores 
ctx->memorySemaphores = this->outputSemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 2d71cd638..3ecb361fc 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -20,15 +20,15 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] DeviceHandle* memoryChannels, [[maybe_unused]] DeviceHandle* switchChannels, [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, - [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) { + [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; - int nPeers = nRanksPerNode - 1; - int nBlocksForCopy = nRanksPerNode * 2; - int nBlocksForReduce = nRanksPerNode; + int nPeers = ipcDomainNranks - 1; + int nBlocksForCopy = ipcDomainNranks * 2; + int nBlocksForReduce = ipcDomainNranks; int copyReduceRatio = nBlocksForCopy / nBlocksForReduce; - size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode; - size_t sizePerRank = size / nRanksPerNode; + size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks; + size_t sizePerRank = size / ipcDomainNranks; assert(sizePerRank % alignment == 0); uint32_t sizePerBlock = ((sizePerRank + (nBlocksForCopy - 1)) / nBlocksForCopy + alignment - 1) / alignment * alignment; @@ -68,7 +68,7 @@ __global__ void __launch_bounds__(1024, 1) deviceSemaphore[bid + 2 * nBlocksForCopy].acquire(); } __syncthreads(); - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < ipcDomainNranks; i++) { size_t blockOffset = it * unitSize + bid * sizePerBlock + i * sizePerRank; uint32_t scratchOffset = scratchIt * unitSize + bid * scratchSizePerBlock + i * scratchSizePerRank; char* srcData = (char*)src + blockOffset; @@ -125,7 +125,7 @@ __global__ void __launch_bounds__(1024, 1) 
channels->wait(); } __syncthreads(); - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < ipcDomainNranks; i++) { size_t blockOffset = it * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * sizePerBlock + i * sizePerRank; uint32_t scratchOffset = scratchIt * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * scratchSizePerBlock + @@ -150,7 +150,7 @@ template struct NvlsBlockPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, + size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -166,9 +166,9 @@ struct NvlsBlockPipelineAdapter { #endif { using ChannelType = DeviceHandle; - allreduceNvlsBlockPipeline - <<>>(input, scratch, output, (ChannelType*)memoryChannels, - nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + allreduceNvlsBlockPipeline<<>>( + input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank, + ipcDomainNranks); return cudaGetLastError(); } } @@ -200,11 +200,11 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->nRanksPerNode * 5, 1024}; + blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, 
nullptr, 0, 0, + ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error)); @@ -222,7 +222,7 @@ std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index d331cc672..2ef0516e3 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -94,7 +94,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); // setup channels ctx->switchChannels = this->switchChannels_; @@ -123,7 +123,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr } cudaError_t error = allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, - 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, + 0, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error)); diff --git 
a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 9be621f08..1bdac9ada 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -18,15 +18,15 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] DeviceHandle* memoryChannels, [[maybe_unused]] DeviceHandle* multicast, [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, - [[maybe_unused]] int nRanksPerNode) { + [[maybe_unused]] int ipcDomainNranks) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; - int nPeers = nRanksPerNode - 1; + int nPeers = ipcDomainNranks - 1; int nBlocks = gridDim.x; int nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION; int bid = blockIdx.x; - size_t sizePerRank = size / nRanksPerNode; - size_t scratchSizePerRank = scratchBufferSize / nRanksPerNode; + size_t sizePerRank = size / ipcDomainNranks; + size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks; const size_t maxSizePerBlock = ((sizePerRank + nBlocks - 1) / nBlocks + alignment - 1) / alignment * alignment; size_t start = bid * maxSizePerBlock; size_t end = min(start + maxSizePerBlock, sizePerRank); @@ -53,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1) lastIterSize = sizePerBlock % copyPerIter; } - const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x * 2; + const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x * 2; auto memoryChans = memoryChannels + chanOffset; __shared__ DeviceHandle channels[(MAX_NRANKS_PER_NODE - 1) * 2]; const int lid = threadIdx.x % WARP_SIZE; @@ -68,7 +68,7 @@ __global__ void __launch_bounds__(1024, 1) const size_t iterSize = (it == nIter - 1) ? 
lastIterSize : copyPerIter; if (warpId < endCopyWid) { int tidInCopy = threadIdx.x; - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < ipcDomainNranks; i++) { size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter; size_t offsetScratch = i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock; @@ -99,7 +99,7 @@ __global__ void __launch_bounds__(1024, 1) channels[tidInRecvCopy + nPeers].wait(); } asm volatile("bar.sync %0, %1;" ::"r"(3), "r"((NRECV_COPY_WARPS)*WARP_SIZE) : "memory"); - for (int i = 0; i < nRanksPerNode; i++) { + for (int i = 0; i < ipcDomainNranks; i++) { size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter; size_t offsetScratch = i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock; @@ -116,7 +116,7 @@ template struct NvlsWarpPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, + size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -132,9 +132,9 @@ struct NvlsWarpPipelineAdapter { #endif { using ChannelType = DeviceHandle; - allreduceNvlsWarpPipeline - <<>>(input, scratch, output, (ChannelType*)memoryChannels, - nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + allreduceNvlsWarpPipeline<<>>( + input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank, + ipcDomainNranks); return cudaGetLastError(); } } @@ -165,11 +165,11 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( } std::pair blockAndThreadNum = {nBlocks, 
nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->nRanksPerNode * 4, 1024}; + blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, + ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error)); @@ -187,7 +187,7 @@ std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::share auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 36fcf860b..a9d46d4f5 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -19,12 +19,12 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] mscclpp::DeviceHandle* multicast, [[maybe_unused]] mscclpp::DeviceHandle* multicastOut, [[maybe_unused]] size_t channelInOffset, [[maybe_unused]] size_t channelOutOffset, - [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) { + [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - int nPeers = nRanksPerNode - 1; + int nPeers = ipcDomainNranks - 1; int nBlocks = gridDim.x; int 
bid = blockIdx.x; - size_t sizePerRank = size / nRanksPerNode; + size_t sizePerRank = size / ipcDomainNranks; const size_t minAlign = 16; // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks; @@ -40,14 +40,14 @@ __global__ void __launch_bounds__(1024, 1) mscclpp::DeviceHandle* multicastPtr = multicast + bid; mscclpp::DeviceHandle* multicastOutPtr = multicastOut + bid; - const size_t chanOffset = (nRanksPerNode - 1) * blockIdx.x; + const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x; auto memoryChans = memoryChannels + chanOffset; __shared__ mscclpp::DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; const int lid = threadIdx.x % WARP_SIZE; // Each warp redundantly loads all entries (same value, benign race) so that // every warp has the data its threads will read after __syncwarp(). Required // when nPeers > WARP_SIZE (MNNVL/NVL72 → 71 peers). - for (int i = lid; i < nRanksPerNode - 1; i += WARP_SIZE) { + for (int i = lid; i < ipcDomainNranks - 1; i += WARP_SIZE) { channels[i] = memoryChans[i]; } __syncwarp(); @@ -75,7 +75,7 @@ struct NvlsAdapter { static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*, mscclpp::DeviceHandle* nvlsChannels, mscclpp::DeviceHandle* nvlsOutChannels, size_t channelInOffset, - size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize, + size_t channelOutOffset, size_t, int rank, int ipcDomainNranks, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -93,7 +93,7 @@ struct NvlsAdapter { using ChannelType = DeviceHandle; allreduceNvls<<>>((ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, - inputSize, rank, nRanksPerNode); + inputSize, rank, ipcDomainNranks); return 
cudaGetLastError(); } } @@ -145,7 +145,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { - numBlocksAndThreads = {::min(ctx->nRanksPerNode, MAX_NBLOCKS), 1024}; + numBlocksAndThreads = {::min(ctx->ipcDomainNranks, MAX_NBLOCKS), 1024}; // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS]. @@ -159,7 +159,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels, - nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, + nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error)); @@ -183,7 +183,7 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index d631c35a8..ebb2f618a 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -15,7 +15,7 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* 
buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, - size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, + size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff #if defined(ENABLE_NPKIT) , @@ -53,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1) else nelems = nelems / (sizeof(int) / sizeof(T)); - const int nPeers = nRanksPerNode - 1; + const int nPeers = ipcDomainNranks - 1; const size_t nPkts = nelems / 2; uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; @@ -156,7 +156,7 @@ template struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, - size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; @@ -167,20 +167,20 @@ struct PacketAdapter { size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), + ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); + ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff); 
#endif return cudaGetLastError(); } }; -inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerNode, int worldSize, +inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int ipcDomainNranks, int worldSize, [[maybe_unused]] DataType dtype) { - int nBlocks = (nRanksPerNode - 1) * 4; + int nBlocks = (ipcDomainNranks - 1) * 4; int nThreadsPerBlock = 1024; if (inputSize >= 32768) { nBlocks = (worldSize - 1) * 8; @@ -232,7 +232,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize, ctx->nRanksPerNode, dtype); + blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize, ctx->ipcDomainNranks, dtype); } size_t sendBytes; @@ -248,7 +248,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ } cudaError_t error = allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr, - channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, + channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { @@ -264,7 +264,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptrrank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local 
memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index 4c46bf9b3..93e2d0c46 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -31,18 +31,18 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - DeviceHandle* switchChannels, void* remoteMemories, int rank, int nRanksPerNode, + DeviceHandle* switchChannels, void* remoteMemories, int rank, int ipcDomainNranks, int worldSize, size_t nelems) { int blockId = blockIdx.x; - uint32_t nPeers = nRanksPerNode - 1; + uint32_t nPeers = ipcDomainNranks - 1; assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); - uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * nRanksPerNode; - uint32_t nelemsPerRank = alignedNelems / nRanksPerNode; + uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * ipcDomainNranks; + uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t lastInt4Index = nelems / nelemsPerInt4; uint32_t remainder = nelems % nelemsPerInt4; @@ -59,7 +59,7 @@ __global__ void __launch_bounds__(1024, 1) nInt4PerBlock += remainderForBlock; } if (nInt4PerBlock == 0) return; - uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerNode; + uint32_t nInt4ForCopy = nInt4PerBlock * ipcDomainNranks; for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) { int rankIdx = idx / nInt4PerBlock; @@ -84,13 +84,13 @@ __global__ void __launch_bounds__(1024, 1) if (offset > lastInt4Index) continue; int4 tmp = scratch4[offset]; for (uint32_t i = 0; i < 
nPeers; i++) { - int rankIdx = (rank + i + 1) % nRanksPerNode; + int rankIdx = (rank + i + 1) % ipcDomainNranks; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; int4 data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); tmp = calVector(data, tmp); } for (uint32_t i = 0; i < nPeers; i++) { - int rankIdx = (rank + i + 1) % nRanksPerNode; + int rankIdx = (rank + i + 1) % ipcDomainNranks; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[peerIdx], offset, tmp); } @@ -127,7 +127,7 @@ template struct AllreduceRsAgAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, + size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); @@ -137,7 +137,7 @@ struct AllreduceRsAgAdapter { } allreduceRsAg<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - nRanksPerNode, worldSize, nelems); + ipcDomainNranks, worldSize, nelems); return cudaGetLastError(); } }; @@ -185,7 +185,7 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, c } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank, - algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, + algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); @@ -203,7 +203,7 @@ 
std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu index eabe3dc53..9f63e5905 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -86,7 +86,7 @@ template __global__ void __launch_bounds__(1024, 1) allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, - int nRanksPerNode, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut, + int ipcDomainNranks, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut, uint32_t nblocksForReduce, uint32_t nblocksForRecv) { uint32_t bid = blockIdx.x; constexpr uint32_t nStepsPerIter = 4; @@ -94,7 +94,7 @@ __global__ void __launch_bounds__(1024, 1) uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter; const uint32_t chunkSize = nInt4PerIter * worldSize; uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize; - uint32_t nPeers = nRanksPerNode - 1; + uint32_t nPeers = ipcDomainNranks - 1; int4* scratch4 = reinterpret_cast((char*)scratch); const uint32_t scratchIterStride = 2 * chunkSize; // one for AS, one for AG const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride; @@ -111,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x; for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int remoteRankId = (rank + peer + 1) % 
ipcDomainNranks; int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1; // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot) uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter; @@ -164,7 +164,7 @@ __global__ void __launch_bounds__(1024, 1) int4 tmp = loadVec(buff, myChunkOffset, nelems); // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer]) for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int remoteRankId = (rank + peer + 1) % ipcDomainNranks; uint32_t peerSlotOffset = baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; int4 data = scratch4[peerSlotOffset]; @@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024, 1) uint32_t dstOffset = baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; for (uint32_t i = 0; i < nPeers; i++) { - int peerIdx = (rank + i + 1) % nRanksPerNode; + int peerIdx = (rank + i + 1) % ipcDomainNranks; int index = peerIdx < rank ? 
peerIdx : peerIdx - 1; mscclpp::write(((void**)remoteMemories)[index], dstOffset, tmp); } @@ -203,7 +203,7 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); // Copy other ranks' reduced chunks from scratch to result for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int remoteRankId = (rank + peer + 1) % ipcDomainNranks; for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv + step * blockDim.x * nblocksForRecv; @@ -224,7 +224,7 @@ template struct AllreduceRsAgPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t scratchSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + size_t scratchSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); @@ -248,7 +248,7 @@ struct AllreduceRsAgPipelineAdapter { } allreduceRsAgPipeline<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - nRanksPerNode, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); + ipcDomainNranks, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); return cudaGetLastError(); } }; @@ -288,8 +288,8 @@ CommResult AllreduceRsAgPipeline::allreduceKernelFunc( std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_, - algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, 
nullptr, 0, - 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, nullptr, + 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -306,7 +306,7 @@ std::shared_ptr AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index 67eed6d31..ea6643255 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -35,7 +35,7 @@ __device__ mscclpp::DeviceSyncer globalSyncer; // // This approach requires registering both input and output buffers as remote // memories (2 * nPeers handles), but avoids scratch buffer allocation and -// the extra copy steps of the standard RSAG. nRanksPerNode is accepted at +// the extra copy steps of the standard RSAG. ipcDomainNranks is accepted at // runtime, which allows the same kernel to handle any NVLink-domain size // (including Multi-Node NVLink fabrics up to NVL72). 
@@ -43,18 +43,18 @@ template __global__ void __launch_bounds__(1024, 1) allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, - int nRanksPerNode, int worldSize, size_t nelems) { + int ipcDomainNranks, int worldSize, size_t nelems) { int blockId = blockIdx.x; assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); - const int NPeers = nRanksPerNode - 1; + const int NPeers = ipcDomainNranks - 1; constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); const uint32_t outputRemoteBufferOffset = NPeers; - uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * nRanksPerNode; - uint32_t nelemsPerRank = alignedNelems / nRanksPerNode; + uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * ipcDomainNranks; + uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4; @@ -87,14 +87,14 @@ __global__ void __launch_bounds__(1024, 1) int4 data; AccumVec acc = mscclpp::upcastVector(tmp_raw); for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % nRanksPerNode; + int rankIdx = (rank + i + 1) % ipcDomainNranks; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); acc = mscclpp::calVectorAccum(acc, data); } int4 tmp = mscclpp::downcastVector(acc); for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % nRanksPerNode; + int rankIdx = (rank + i + 1) % ipcDomainNranks; int peerIdx = rankIdx < rank ? 
rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp); } @@ -112,7 +112,7 @@ template struct AllreduceRsAgZeroCopyAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, + size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); @@ -125,7 +125,7 @@ struct AllreduceRsAgZeroCopyAdapter { } allreduceRsAgZeroCopy<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - nRanksPerNode, worldSize, nelems); + ipcDomainNranks, worldSize, nelems); return cudaGetLastError(); } }; @@ -159,8 +159,8 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptrbaseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(), - nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, - nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, + stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -186,7 +186,7 @@ std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->nRanksPerNode = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); 
ctx->memorySemaphores = this->semaphores_; diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 44a214020..7fa6a81ea 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -27,8 +27,8 @@ namespace mscclpp { namespace collective { constexpr int NUM_NVLS_CONNECTION = 8; // Sized to cover MAX_NRANKS_PER_NODE-scale allreduce algos whose device-side -// semaphore indices grow as O(nRanksPerNode) (e.g. nvls_block_pipeline uses -// up to ~5 * nRanksPerNode entries). +// semaphore indices grow as O(ipcDomainNranks) (e.g. nvls_block_pipeline uses +// up to ~5 * ipcDomainNranks entries). constexpr int NUM_SEMAPHORES = 512; // Upper bound on the number of NVLink-reachable ranks that participate in a @@ -54,8 +54,8 @@ std::vector> setupMemorySemaphores /// Number of ranks that participate in the same GPU-IPC-reachable peer group (e.g. a single host or /// a Multi-Node NVLink fabric, or an AMD XGMI domain). Returns the value of `MSCCLPP_IPC_DOMAIN_NRANKS` /// if set to a positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. This is -/// intentionally independent of `nRanksPerNode` so that algorithms can opt in to MNNVL-like behavior -/// without changing the meaning of bootstrap-level APIs. +/// intentionally independent of `Bootstrap::getNranksPerNode()` so that algorithms can opt in to +/// MNNVL-like behavior without changing the meaning of bootstrap-level APIs. 
int getIpcDomainNranks(std::shared_ptr comm); std::shared_ptr> setupMemoryChannelDeviceHandles( @@ -86,7 +86,7 @@ class AlgorithmCtx { public: int rank; int workSize; - int nRanksPerNode; + int ipcDomainNranks; std::vector registeredMemories; std::vector memoryChannels; From 2efda4d81964f87092d7dc75d5d89c1f2c9166ee Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 1 May 2026 23:09:22 +0000 Subject: [PATCH 09/44] Restore compile-time templated NRanksPerNode for rsag_zero_copy Recovers the per-thread int4 register array + #pragma unroll for the {4, 8} rank cases. All NPeers remote reads are issued in parallel so their latency overlaps instead of being serialized by the runtime fused load+reduce loop. The runtime-domain (NVL72) fallback is removed; the algo now returns cudaErrorInvalidValue for unsupported ipcDomainNranks, and rsag_zero_copy is dropped from the MNNVL candidate list in the tuning example. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 3 - .../allreduce/allreduce_rsag_zero_copy.cu | 60 ++++++++++++------- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 1d54cfa77..9ad7f22a5 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -246,9 +246,6 @@ def _ar_candidates(self, size: int): if a: out.append(a) if size >= 512 << 10: - a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") - if a: - out.append(a) a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") if self._nvls and self.symmetric_memory and a: out.append(a) diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index ea6643255..09fa2fe70 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ 
b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -35,26 +35,32 @@ __device__ mscclpp::DeviceSyncer globalSyncer; // // This approach requires registering both input and output buffers as remote // memories (2 * nPeers handles), but avoids scratch buffer allocation and -// the extra copy steps of the standard RSAG. ipcDomainNranks is accepted at -// runtime, which allows the same kernel to handle any NVLink-domain size -// (including Multi-Node NVLink fabrics up to NVL72). +// the extra copy steps of the standard RSAG. +// +// The kernel is templated on NRanksPerNode so the compiler can keep an int4 +// register array of NPeers elements, #pragma unroll the peer loops, and turn +// the per-iteration modulo into a single AND. This issues all NPeers remote +// reads in parallel so their latency is overlapped instead of serialized. +// Only small fixed sizes ({4, 8}) are instantiated; larger MNNVL domains +// (where the int4 array would spill out of registers) must use a different +// algorithm. 
-template +template __global__ void __launch_bounds__(1024, 1) allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - DeviceHandle* switchChannels, void* remoteMemories, int rank, - int ipcDomainNranks, int worldSize, size_t nelems) { + DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, + size_t nelems) { int blockId = blockIdx.x; assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); - const int NPeers = ipcDomainNranks - 1; + constexpr int NPeers = NRanksPerNode - 1; constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); - const uint32_t outputRemoteBufferOffset = NPeers; - uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * ipcDomainNranks; - uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks; + constexpr uint32_t outputRemoteBufferOffset = NPeers; + uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * NRanksPerNode; + uint32_t nelemsPerRank = alignedNelems / NRanksPerNode; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4; @@ -75,6 +81,7 @@ __global__ void __launch_bounds__(1024, 1) memoryChannelsLocal[threadIdx.x].relaxedWait(); } __syncthreads(); + int4 data[NPeers]; // AccumInt4: when AccumT != T, use a wider accumulator type. // For AccumT == T, this is just int4 (no-op conversion). 
constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); @@ -84,17 +91,21 @@ __global__ void __launch_bounds__(1024, 1) uint32_t offset = idx + offset4 + rank * nInt4PerRank; if (offset >= nInt4Total) continue; int4 tmp_raw = buff4[offset]; - int4 data; - AccumVec acc = mscclpp::upcastVector(tmp_raw); +#pragma unroll for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % ipcDomainNranks; + int rankIdx = (rank + i + 1) % NRanksPerNode; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; - data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); - acc = mscclpp::calVectorAccum(acc, data); + data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); + } + AccumVec acc = mscclpp::upcastVector(tmp_raw); +#pragma unroll + for (int i = 0; i < NPeers; i++) { + acc = mscclpp::calVectorAccum(acc, data[i]); } int4 tmp = mscclpp::downcastVector(acc); +#pragma unroll for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % ipcDomainNranks; + int rankIdx = (rank + i + 1) % NRanksPerNode; int peerIdx = rankIdx < rank ? 
rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp); } @@ -123,9 +134,18 @@ struct AllreduceRsAgZeroCopyAdapter { nBlocks = 128; } } - allreduceRsAgZeroCopy<<>>( - (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - ipcDomainNranks, worldSize, nelems); + if (ipcDomainNranks == 4) { + allreduceRsAgZeroCopy<4, OpType, T, AccumT> + <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, + switchChannel, remoteMemories, rank, worldSize, nelems); + } else if (ipcDomainNranks == 8) { + allreduceRsAgZeroCopy<8, OpType, T, AccumT> + <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, + switchChannel, remoteMemories, rank, worldSize, nelems); + } else { + WARN(ALGO, "AllreduceRsAgZeroCopy only supports ipcDomainNranks of 4 or 8, got: ", ipcDomainNranks); + return cudaErrorInvalidValue; + } return cudaGetLastError(); } }; From 1c298175661003f9d80192349cebbb3575f8d0d3 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 1 May 2026 23:40:11 +0000 Subject: [PATCH 10/44] Revert AllreduceRsAgZeroCopy non-symmetric ctx key tag back to ++tag Commit 533f3299 dropped the static tag counter from generateAllreduceContextKey, causing every non-symmetric call to return the same key (zero) and reuse a stale context. Restore the pre-MNNVL behavior of returning a unique key per non-symmetric call so the context cache rebuilds when buffers change. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index 09fa2fe70..a20756aee 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -191,6 +191,7 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_ptr comm, const void* input, From 7bc5e0406b4b45eaae5d4cb42ca95ba69b0a2d56 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 2 May 2026 03:19:31 +0000 Subject: [PATCH 11/44] Reset GPU tokens before reuse Clear recycled TokenPool entries before handing them out so device-to-device semaphores start from a clean counter value. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/gpu_utils.hpp | 3 +++ src/core/gpu_utils.cc | 7 +++++++ src/core/utils_internal.cc | 6 ++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index ecd13c478..f7ec67d05 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr); void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault); void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault); +void gpuMemset(void* ptr, int value, size_t bytes); /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is /// destroyed. 
@@ -300,6 +301,8 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind); } +inline void memset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); } + /// Check if NVLink SHARP (NVLS) is supported. /// /// @return True if NVLink SHARP (NVLS) is supported, false otherwise. diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 09d5025d6..1ce61322c 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) { MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); } +void gpuMemset(void* ptr, int value, size_t bytes) { + AvoidCudaGraphCaptureGuard cgcGuard; + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); +} + } // namespace detail bool isNvlsSupported() { diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc index 9504a52cf..ea867ffff 100644 --- a/src/core/utils_internal.cc +++ b/src/core/utils_internal.cc @@ -263,8 +263,10 @@ std::shared_ptr TokenPool::getToken() { for (int bit = 0; bit < UINT64_WIDTH; bit++) { if (holes & (1UL << bit)) { allocationMap_[i].set(bit); - INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", baseAddr_ + i * UINT64_WIDTH + bit); - return std::shared_ptr(baseAddr_ + i * UINT64_WIDTH + bit, deleter); + uint64_t* token = baseAddr_ + i * UINT64_WIDTH + bit; + mscclpp::memset(token, 0, sizeof(uint64_t)); + INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", token); + return std::shared_ptr(token, deleter); } } } From 9a368843691be73d6f40cbb0d1277f6e20d56013 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 2 May 2026 03:32:18 +0000 Subject: [PATCH 12/44] Rename gpuMemset wrapper and zero TokenPool slots in deleter Two follow-ups to commit 7bc5e040: * Rename mscclpp::memset to 
mscclpp::gpuMemset for symmetry with gpuMemcpy / gpuMemcpyAsync, and avoid shadowing std::memset for callers that pull the namespace in. Also add the missing doc comment. * Move the per-slot zeroing from getToken() into the deleter so the cost is paid on release rather than acquire. This is safe because gpuCallocPhysical already zeros the underlying buffer at TokenPool construction, so first-time tokens are clean and recycled tokens are scrubbed on release. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/gpu_utils.hpp | 6 +++++- src/core/utils_internal.cc | 9 +++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index f7ec67d05..b079e0fd9 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -301,7 +301,11 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind); } -inline void memset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); } +/// Sets `bytes` of memory at `ptr` to `value` synchronously. +/// @param ptr Destination address. +/// @param value Value to set (interpreted as unsigned char per CUDA semantics). +/// @param bytes Number of bytes to set. +inline void gpuMemset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); } /// Check if NVLink SHARP (NVLS) is supported. /// diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc index ea867ffff..8cc554301 100644 --- a/src/core/utils_internal.cc +++ b/src/core/utils_internal.cc @@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) { std::shared_ptr TokenPool::getToken() { auto deleter = [self = shared_from_this()](uint64_t* token) { + // Zero the slot on release so the next allocator hands out a clean + // semaphore counter (matches a freshly-allocated slot). 
+ mscclpp::gpuMemset(token, 0, sizeof(uint64_t)); size_t index = (token - self->baseAddr_) / UINT64_WIDTH; size_t bit = (token - self->baseAddr_) % UINT64_WIDTH; uint64_t mask = 1UL << bit; @@ -263,10 +266,8 @@ std::shared_ptr TokenPool::getToken() { for (int bit = 0; bit < UINT64_WIDTH; bit++) { if (holes & (1UL << bit)) { allocationMap_[i].set(bit); - uint64_t* token = baseAddr_ + i * UINT64_WIDTH + bit; - mscclpp::memset(token, 0, sizeof(uint64_t)); - INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", token); - return std::shared_ptr(token, deleter); + INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", baseAddr_ + i * UINT64_WIDTH + bit); + return std::shared_ptr(baseAddr_ + i * UINT64_WIDTH + bit, deleter); } } } From 6296803d87a451e96f3ae5b62c2b616740861d8a Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 5 May 2026 04:41:14 +0000 Subject: [PATCH 13/44] Make NVLS non-zero-copy allreduce algorithms MNNVL-ready Both default_allreduce_nvls_warp_pipeline and default_allreduce_nvls_block_pipeline were only partially MNNVL-aware: their kernels had been updated to use ipcDomainNranks (with shared-memory channel arrays sized for the global NVLink-domain bound), but the host-side context init still hard-coded ctx->ipcDomainNranks = bootstrap->getNranksPerNode(). On a fully populated MNNVL fabric (e.g. NVL72 where world == ipcDomainNranks but the per-physical-host nranksPerNode is much smaller), this mismatched the multicast group span and produced wrong/missing data plus out-of-bounds scratch indexing. Changes: - Rename MAX_NRANKS_PER_NODE -> MAX_IPC_DOMAIN_NRANKS to match the rest of the IPC-domain naming (getIpcDomainNranks, ipcDomainNranks, MSCCLPP_IPC_DOMAIN_NRANKS env var). Pure rename, no semantic change. 
- Add validateIpcDomainSpansWorld(comm, algName) helper in collective_utils that wraps getIpcDomainNranks() and asserts the IPC-domain == whole-comm invariant required by NVLS algorithms (worldSize == ipcDomainNranks, rank < ipcDomainNranks, ipcDomainNranks in [2, MAX_IPC_DOMAIN_NRANKS]), throwing Error(InvalidUsage) on violation and returning the validated value. - nvls_zero_copy / nvls_block_pipeline / nvls_warp_pipeline initialize() each now call the helper instead of repeating the same ~20-line check inline. - initAllreduceContext() in both pipelines now uses getIpcDomainNranks(comm) instead of bootstrap->getNranksPerNode(). - Per-peer base channel allocation (nBaseChannels_) is sized in initialize() as max(64, 4*ipc) for block pipeline and max(64, 8*ipc) for warp pipeline so the kernel's per-block channel addressing remains in-bounds at NVL72 scale. - Block pipeline initialize() also asserts 6*ipcDomainNranks <= NUM_SEMAPHORES. - allreduceKernelFunc() in both pipelines now validates launch shape and the user-supplied scratch buffer size before launching, returning CommInvalidArgument with a clear WARN on mismatch: - Block: nBlocks must equal 5*ipcDomainNranks (structurally required by the kernel's three-phase block partition), nThreads == 1024, inputSize aligned to (ipc * 16) bytes, scratchSizePerBlock >= unitSize. - Warp: nBlocks >= NUM_NVLS_CONNECTION and a multiple of it (kernel does nBlocks / NUM_NVLS_CONNECTION partitioning of the multicast handles), 2*nBlocks <= nBaseChannels_, nThreads == 1024 (32 warps hard-coded in the bar.sync member counts), inputSize divisible by ipcDomainNranks, scratchSizePerBlock >= copyPerIter. - Default nBlocks for warp pipeline is rounded up to a multiple of NUM_NVLS_CONNECTION so the structural constraint holds for any ipcDomainNranks. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../allgather/allgather_fullmesh.cu | 2 +- .../allreduce/allreduce_fullmesh.cu | 4 +- .../allreduce_nvls_block_pipeline.cu | 66 ++++++++++++++--- .../allreduce/allreduce_nvls_warp_pipeline.cu | 72 ++++++++++++++++--- .../allreduce/allreduce_nvls_zero_copy.cu | 4 +- .../collectives/allreduce/allreduce_packet.cu | 2 +- src/ext/collectives/collective_utils.cc | 23 ++++++ .../allreduce/allreduce_allpair_packet.hpp | 2 +- .../allreduce_nvls_block_pipeline.hpp | 2 + .../allreduce_nvls_warp_pipeline.hpp | 2 + .../collectives/include/collective_utils.hpp | 14 +++- 11 files changed, 166 insertions(+), 27 deletions(-) diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index cbe199bcb..8ce77fca1 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -28,7 +28,7 @@ __global__ void __launch_bounds__(1024, 1) const size_t restNInt4 = nInt4 % nInt4PerChunk; const size_t scratchChunkRankOffset = nInt4PerChunk * rank; - __shared__ DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; + __shared__ DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; // Each warp redundantly loads all entries (same value, benign race) so that // every warp has the data its threads will read after __syncwarp(). 
Required diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index b95dcb284..f1d815604 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -49,8 +49,8 @@ __global__ void __launch_bounds__(512, 1) const size_t blockOffset = nInt4PerChunk * blockIdx.x; const size_t scratchChunkRankOffset = chunkSizePerRank * rank; - __shared__ DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; - __shared__ DeviceHandle outChannels[MAX_NRANKS_PER_NODE - 1]; + __shared__ DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; + __shared__ DeviceHandle outChannels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; // Each warp redundantly loads all entries (same value, benign race) so that // every warp has the data its threads will read after __syncwarp(). Required diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 3ecb361fc..4eeb03355 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +#include #include +#include #include "allreduce/allreduce_nvls_block_pipeline.hpp" #include "allreduce/common.hpp" @@ -176,31 +178,73 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - int nBaseChannels = 64; + ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsBlockPipeline"); + // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel). 
+ if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) { + throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) + + " exceeds NUM_SEMAPHORES capacity (" + std::to_string(NUM_SEMAPHORES) + ")", + ErrorCode::InvalidUsage); + } + // The kernel addresses up to `2 * nBlocksForCopy = 4 * ipcDomainNranks` distinct entries + // per peer in `memoryChannels`. Scale the per-connection allocation to match. + nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = - setupMemorySemaphores(comm, this->conns_, nBaseChannels); + setupMemorySemaphores(comm, this->conns_, nBaseChannels_); // setup base memory channels - this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras, - DataType accumDtype) { +CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; } + const int requiredBlocks = 
ctx->ipcDomainNranks * 5; std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; - if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024}; + if (blockAndThreadNum.first == 0) blockAndThreadNum.first = requiredBlocks; + if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024; + if (blockAndThreadNum.first != requiredBlocks) { + WARN("AllreduceNvlsBlockPipeline requires nBlocks == 5 * ipcDomainNranks (got %d, expected %d)", + blockAndThreadNum.first, requiredBlocks); + return CommResult::CommInvalidArgument; + } + if (blockAndThreadNum.second != 1024) { + WARN("AllreduceNvlsBlockPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second); + return CommResult::CommInvalidArgument; + } + // Validate input alignment/divisibility expectations of the kernel. + constexpr size_t kKernelAlignment = 16; + const size_t perRankBytes = inputSize / ctx->ipcDomainNranks; + if (perRankBytes * static_cast(ctx->ipcDomainNranks) != inputSize || perRankBytes % kKernelAlignment != 0) { + WARN( + "AllreduceNvlsBlockPipeline requires inputSize %% (ipcDomainNranks * %zu) == 0 (got inputSize=%zu, " + "ipcDomainNranks=%d)", + kKernelAlignment, inputSize, ctx->ipcDomainNranks); + return CommResult::CommInvalidArgument; + } + // Validate scratch is large enough for at least one pipeline iteration. The kernel + // computes scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) aligned down + // to unitSize; if this is 0, maxItersForScratch is 0 and the kernel deadlocks. + const size_t unitSize = (inputSize <= static_cast(1024) * 1024 * 128) ? 
(1ULL << 16) : (1ULL << 17); + const size_t scratchSizePerRank = this->scratchBufferSize_ / ctx->ipcDomainNranks; + const size_t nBlocksForCopy = static_cast(ctx->ipcDomainNranks) * 2; + const size_t scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) / unitSize * unitSize; + if (scratchSizePerBlock < unitSize) { + WARN( + "AllreduceNvlsBlockPipeline scratch buffer too small for ipcDomainNranks=%d and inputSize=%zu " + "(scratchBufferSize=%zu, need at least ~%zu bytes)", + ctx->ipcDomainNranks, inputSize, this->scratchBufferSize_, + static_cast(ctx->ipcDomainNranks) * nBlocksForCopy * unitSize); + return CommResult::CommInvalidArgument; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, @@ -222,7 +266,7 @@ std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 1bdac9ada..05e4f747f 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+#include #include +#include #include "allreduce/allreduce_nvls_warp_pipeline.hpp" #include "allreduce/common.hpp" @@ -55,7 +57,7 @@ __global__ void __launch_bounds__(1024, 1) const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x * 2; auto memoryChans = memoryChannels + chanOffset; - __shared__ DeviceHandle channels[(MAX_NRANKS_PER_NODE - 1) * 2]; + __shared__ DeviceHandle channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2]; const int lid = threadIdx.x % WARP_SIZE; // Each warp redundantly loads all entries (same value, benign race) so that // every warp has the data its threads will read after __syncwarp(). Required @@ -141,14 +143,18 @@ struct NvlsWarpPipelineAdapter { }; void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { - nSwitchChannels_ = 8; - int nBaseChannels = 64; + nSwitchChannels_ = NUM_NVLS_CONNECTION; + ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsWarpPipeline"); + // The warp-pipeline kernel addresses 2 * nPeers entries per block in `memoryChannels`, + // so per-peer base channel allocation must be at least `2 * nBlocks`. Default + // nBlocks = 4 * ipcDomainNranks (see allreduceKernelFunc), so size accordingly. 
+ nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = - setupMemorySemaphores(comm, this->conns_, nBaseChannels); + setupMemorySemaphores(comm, this->conns_, nBaseChannels_); // setup base memory channels - this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } @@ -164,8 +170,58 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( return CommResult::CommInvalidArgument; } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; - if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024}; + if (blockAndThreadNum.first == 0) { + // Default to 4 * ipcDomainNranks blocks, rounded up to a multiple of NUM_NVLS_CONNECTION + // so that nBlocks / NUM_NVLS_CONNECTION partitioning in the kernel is well-defined. + int defaultBlocks = ctx->ipcDomainNranks * 4; + defaultBlocks = ((defaultBlocks + NUM_NVLS_CONNECTION - 1) / NUM_NVLS_CONNECTION) * NUM_NVLS_CONNECTION; + blockAndThreadNum.first = std::max(defaultBlocks, NUM_NVLS_CONNECTION); + } + if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024; + // The kernel computes nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION and indexes the + // multicast handle array with bid / nBlocksPerNvlsConn; both must be safe. 
+ if (blockAndThreadNum.first < NUM_NVLS_CONNECTION || blockAndThreadNum.first % NUM_NVLS_CONNECTION != 0) { + WARN("AllreduceNvlsWarpPipeline requires nBlocks to be a positive multiple of %d (got %d)", NUM_NVLS_CONNECTION, + blockAndThreadNum.first); + return CommResult::CommInvalidArgument; + } + // Each block uses 2 * nPeers consecutive entries in `memoryChannels`, so the per-peer + // base-channel allocation must support 2 * nBlocks distinct entries. + if (2 * blockAndThreadNum.first > this->nBaseChannels_) { + WARN( + "AllreduceNvlsWarpPipeline: nBlocks %d exceeds channel allocation (nBaseChannels=%d, " + "ipcDomainNranks=%d). Increase MSCCLPP_IPC_DOMAIN_NRANKS-aware sizing or reduce nBlocks.", + blockAndThreadNum.first, this->nBaseChannels_, ctx->ipcDomainNranks); + return CommResult::CommInvalidArgument; + } + // The kernel hard-codes 14 + 4 + 14 = 32 warps per block and bar.sync member counts + // computed from these constants; deviating from 1024 threads breaks those barriers. + if (blockAndThreadNum.second != 1024) { + WARN("AllreduceNvlsWarpPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second); + return CommResult::CommInvalidArgument; + } + // Validate input divisibility by ipcDomainNranks (kernel computes size / ipcDomainNranks). + if (inputSize % static_cast(ctx->ipcDomainNranks) != 0) { + WARN("AllreduceNvlsWarpPipeline requires inputSize %% ipcDomainNranks == 0 (got inputSize=%zu, ipcDomainNranks=%d)", + inputSize, ctx->ipcDomainNranks); + return CommResult::CommInvalidArgument; + } + // Validate scratch is large enough for at least one pipeline iteration. The kernel + // computes scratchSizePerBlock = (scratchSizePerRank / nBlocks) aligned down to copyPerIter; + // if this is 0 the modulo offset arithmetic divides by zero. 
+ const size_t sizePerRank = inputSize / static_cast(ctx->ipcDomainNranks); + const size_t maxSizePerBlock = ((sizePerRank + blockAndThreadNum.first - 1) / blockAndThreadNum.first + 15) / 16 * 16; + const size_t copyPerIter = (maxSizePerBlock >= 1024 * 64) ? (1024 * 32) : (1024 * 16); + const size_t scratchSizePerRank = this->scratchBufferSize_ / static_cast(ctx->ipcDomainNranks); + const size_t scratchSizePerBlock = + (scratchSizePerRank / static_cast(blockAndThreadNum.first)) / copyPerIter * copyPerIter; + if (scratchSizePerBlock < copyPerIter) { + WARN( + "AllreduceNvlsWarpPipeline scratch buffer too small for ipcDomainNranks=%d, nBlocks=%d, inputSize=%zu " + "(scratchBufferSize=%zu, need at least ~%zu bytes)", + ctx->ipcDomainNranks, blockAndThreadNum.first, inputSize, this->scratchBufferSize_, + static_cast(ctx->ipcDomainNranks) * static_cast(blockAndThreadNum.first) * copyPerIter); + return CommResult::CommInvalidArgument; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, @@ -187,7 +243,7 @@ std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::share auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = getIpcDomainNranks(comm); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index a9d46d4f5..5d6fc4d37 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
#include +#include #include "allreduce/allreduce_nvls_zero_copy.hpp" #include "allreduce/common.hpp" @@ -42,7 +43,7 @@ __global__ void __launch_bounds__(1024, 1) const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x; auto memoryChans = memoryChannels + chanOffset; - __shared__ mscclpp::DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; // Each warp redundantly loads all entries (same value, benign race) so that // every warp has the data its threads will read after __syncwarp(). Required @@ -106,6 +107,7 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); computeCapabilityMajor_ = deviceProp.major; nSwitchChannels_ = 32; + validateIpcDomainSpansWorld(comm, "AllreduceNvls"); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index 84e182eb7..cc91370ca 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -78,7 +78,7 @@ __global__ void __launch_bounds__(1024, 1) uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // Put channels into shared memory, read channel info from global memory is unexpectable slow. - __shared__ mscclpp::DeviceHandle channels[MAX_NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = tid % WARP_SIZE; // Each warp redundantly loads all entries (same value, benign race) so that // every warp has the data its threads will read after __syncwarp(). 
Required diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index de33009c6..33b6ef779 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -78,6 +79,28 @@ int getIpcDomainNranks(std::shared_ptr comm) { return comm->bootstrap()->getNranksPerNode(); } +int validateIpcDomainSpansWorld(std::shared_ptr comm, const char* algName) { + const int ipcDomainNranks = getIpcDomainNranks(comm); + const int worldSize = comm->bootstrap()->getNranks(); + const int rank = comm->bootstrap()->getRank(); + if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) { + throw mscclpp::Error(std::string(algName) + ": ipcDomainNranks " + std::to_string(ipcDomainNranks) + + " is out of supported range [2, " + std::to_string(MAX_IPC_DOMAIN_NRANKS) + "]", + mscclpp::ErrorCode::InvalidUsage); + } + if (worldSize != ipcDomainNranks) { + throw mscclpp::Error(std::string(algName) + " requires worldSize == ipcDomainNranks (got worldSize=" + + std::to_string(worldSize) + ", ipcDomainNranks=" + std::to_string(ipcDomainNranks) + ")", + mscclpp::ErrorCode::InvalidUsage); + } + if (rank < 0 || rank >= ipcDomainNranks) { + throw mscclpp::Error(std::string(algName) + ": rank " + std::to_string(rank) + " out of [0, " + + std::to_string(ipcDomainNranks) + ")", + mscclpp::ErrorCode::InvalidUsage); + } + return ipcDomainNranks; +} + std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels) { std::vector> memoryChannelDeviceHandles; diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index fe96f7622..79c211b39 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -29,7 +29,7 @@ class AllreduceAllpairPacket : public 
AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; - // Must be at least MAX_NRANKS_PER_NODE-1 so the adapter can launch one + // Must be at least MAX_IPC_DOMAIN_NRANKS-1 so the adapter can launch one // block per peer at MNNVL scale. const int maxBlockNum_ = 72; std::vector conns_; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 81b74add4..9a1742db1 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -29,6 +29,8 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; uint32_t nSwitchChannels_; + int ipcDomainNranks_ = 0; + int nBaseChannels_ = 0; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index 8f02a8738..e2aa8c873 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -29,6 +29,8 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; uint32_t nSwitchChannels_; + int ipcDomainNranks_ = 0; + int nBaseChannels_ = 0; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 7fa6a81ea..892df3b11 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -26,7 +26,7 @@ namespace mscclpp { namespace collective { constexpr int NUM_NVLS_CONNECTION = 8; 
-// Sized to cover MAX_NRANKS_PER_NODE-scale allreduce algos whose device-side +// Sized to cover MAX_IPC_DOMAIN_NRANKS-scale allreduce algos whose device-side // semaphore indices grow as O(ipcDomainNranks) (e.g. nvls_block_pipeline uses // up to ~5 * ipcDomainNranks entries). constexpr int NUM_SEMAPHORES = 512; @@ -35,7 +35,7 @@ constexpr int NUM_SEMAPHORES = 512; // single collective. Sized to cover Multi-Node NVLink (MNNVL) domains up to // GB200 NVL72 (72 GPUs sharing one NVLink fabric). Drives compile-time sizing // of shared-memory channel arrays in the allreduce/allgather kernels. -constexpr int MAX_NRANKS_PER_NODE = 72; +constexpr int MAX_IPC_DOMAIN_NRANKS = 72; constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB @@ -58,6 +58,16 @@ std::vector> setupMemorySemaphores /// MNNVL-like behavior without changing the meaning of bootstrap-level APIs. int getIpcDomainNranks(std::shared_ptr comm); +/// Validates that the IPC domain spans the whole communicator and that the local rank fits within +/// the supported `[2, MAX_IPC_DOMAIN_NRANKS]` range. Used by NVLS allreduce algorithms whose +/// multicast group spans the whole communicator (see `setupNvlsConnections`) and whose kernels +/// use the global rank to compute per-rank offsets while sizing per-rank work by +/// `ipcDomainNranks`. These assumptions only hold when the IPC-reachable peer group is exactly +/// the whole communicator (e.g. a fully populated MNNVL fabric like NVL72). Returns the validated +/// `ipcDomainNranks`; throws `Error(InvalidUsage)` on violation. `algName` is used as a prefix +/// in error messages. 
+int validateIpcDomainSpansWorld(std::shared_ptr comm, const char* algName); + std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels); From 9aeeaf0f127768fdd3974a4cf5b66200654d3414 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 18:51:29 +0000 Subject: [PATCH 14/44] Simplify torch-integration tuning example for MPI-only multi-node testing Use mpi4py for bootstrap and local-rank discovery; drop the torchrun / gloo / manual MSCCLPP_MASTER_ADDR paths and the netifaces dependency. Add MNNVL/multi-node algorithm selection (rsag, rsag_zero_copy, nvls_zero_copy) and route barrier / timing-sync allreduces through the configured symmetric_memory flag so they work across hosts. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 132 ++---------------- 1 file changed, 14 insertions(+), 118 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 9ad7f22a5..8d1efd533 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -1,30 +1,16 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py -# mpirun -np 2 --hostfile python3 examples/torch-integration/customized_comm_with_tuning.py +# mpirun -np 8 python3 examples/torch-integration/customized_comm_with_tuning.py +# mpirun -np 16 --hostfile python3 examples/torch-integration/customized_comm_with_tuning.py -import gc -import fcntl -import ipaddress import os -import socket -import struct -import sys -import traceback +from mpi4py import MPI -def _get_bootstrap_world_size(): - for name in ("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS"): - value = os.environ.get(name) - if value is not None: - return int(value) - return None - - -_bootstrap_world_size = _get_bootstrap_world_size() -if _bootstrap_world_size and _bootstrap_world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ: - os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_bootstrap_world_size) +_world_size = MPI.COMM_WORLD.Get_size() +if _world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ: + os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_world_size) import torch import mscclpp @@ -54,46 +40,6 @@ def _load_algorithms(scratch: torch.Tensor, rank: int): ) -def _interfaces_for_ip(ip: str): - target = ipaddress.ip_address(ip) - for iface in os.listdir("/sys/class/net"): - try: - with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock: - req = struct.pack("256s", iface.encode()[:15]) - addr = socket.inet_ntoa(fcntl.ioctl(sock.fileno(), 0x8915, req)[20:24]) - except OSError: - continue - if ipaddress.ip_address(addr) == target: - return iface - return None - - -def _resolve_interface(master_addr: str): - for env_name in ("MSCCLPP_INTERFACE", "MSCCLPP_SOCKET_IFNAME", "NCCL_SOCKET_IFNAME"): - value = os.environ.get(env_name) - if value: - iface = value.split(",")[0].strip() - if iface in os.listdir("/sys/class/net"): - return iface - raise ValueError(f"Interface {iface} from {env_name} does not exist") - return 
_interfaces_for_ip(master_addr) - - -def _get_env_int(*names: str, default=None): - for name in names: - value = os.environ.get(name) - if value is not None: - return int(value) - return default - - -def _running_under_mpi() -> bool: - return any( - name in os.environ - for name in ("OMPI_COMM_WORLD_RANK", "PMI_RANK", "PMIX_RANK", "MPI_LOCALRANKID", "SLURM_PROCID") - ) - - def _to_mscclpp_op(op) -> mscclpp.ReduceOp: if op == torch.distributed.ReduceOp.SUM: return mscclpp.ReduceOp.SUM @@ -157,11 +103,6 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): def _algo(self, collective: str, name: str): return self._algos.get((collective, name)) - def _nblocks_limit(self, algo_name: str, size: int) -> int: - if algo_name == "default_allreduce_packet" and size < (1 << 20): - return 56 - return self._NBLOCKS_LIMIT.get(algo_name, 128) - def _default_ar_config(self): """Fallback allreduce config for barrier / timing sync.""" pkt = self._algo("allreduce", "default_allreduce_nvls_packet") @@ -337,7 +278,7 @@ def _tune_size(self, collective: str, target_size: int): run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt) for algo in cands: - nb_limit = self._nblocks_limit(algo.name, target_size) + nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128) for nb in self._CANDIDATE_NBLOCKS: if nb > nb_limit: continue @@ -435,11 +376,7 @@ def destroy(self): # -- Benchmarks (standalone) -------------------------------------------------- -def _bench_sizes(low=None, high=None): - if low is None: - low = _get_env_int("MSCCLPP_BENCH_LOW_SIZE", default=5 * 1024) - if high is None: - high = _get_env_int("MSCCLPP_BENCH_HIGH_SIZE", default=80 << 20) +def _bench_sizes(low=5 * 1024, high=80 << 20): sizes, c = [], low while c <= high: sizes.append(c) @@ -539,30 +476,11 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, def init_dist() -> mscclpp.CommGroup: - addr = os.environ.get("MSCCLPP_MASTER_ADDR") - rank = 
_get_env_int("RANK", "OMPI_COMM_WORLD_RANK", "PMI_RANK", "SLURM_PROCID") - world = _get_env_int("WORLD_SIZE", "OMPI_COMM_WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS") - if addr and rank is not None and world is not None: - port = os.environ.get("MSCCLPP_MASTER_PORT", "29500") - iface = _resolve_interface(addr) - if not iface: - raise ValueError(f"No interface for {addr}") - return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world) - if _running_under_mpi(): - try: - from mpi4py import MPI - except ModuleNotFoundError as exc: - raise RuntimeError("mpi4py is required to launch this example with mpirun") from exc - - return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD) - import torch.distributed as dist - - dist.init_process_group(backend="gloo") - return mscclpp.CommGroup(torch_group=dist.group.WORLD) + return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD) def main(): - local = _get_env_int("LOCAL_RANK", "OMPI_COMM_WORLD_LOCAL_RANK", "MPI_LOCALRANKID", "SLURM_LOCALID", default=0) + local = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED).Get_rank() torch.cuda.set_device(local) dtype_str = os.environ.get("DTYPE", "float16") @@ -570,22 +488,12 @@ def main(): accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16} accum_str = os.environ.get("ACCUM_DTYPE") accum_dtype = accum_map.get(accum_str) if accum_str else None - n_warmup = _get_env_int("MSCCLPP_BENCH_WARMUP", default=10) - n_graph_launches = _get_env_int("MSCCLPP_BENCH_GRAPH_LAUNCHES", default=10) - n_iter = _get_env_int("MSCCLPP_BENCH_ITERS", default=100) comm_group = init_dist() - cc = CustomizedComm(comm_group, symmetric_memory=False) + cc = CustomizedComm(comm_group) print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...") - benchmark_allreduce( - cc, - dtype=dtype, - accum_dtype=accum_dtype, - n_warmup=n_warmup, - n_graph_launches=n_graph_launches, - n_iter=n_iter, - ) + benchmark_allreduce(cc, dtype=dtype, 
accum_dtype=accum_dtype) cc.barrier() torch.cuda.synchronize() @@ -593,25 +501,13 @@ def main(): if cc.rank == 0: print("Skipping allgather benchmark on multi-node: this example's allgather path is single-node only.") else: - benchmark_allgather(cc, dtype=dtype, n_warmup=n_warmup, n_graph_launches=n_graph_launches, n_iter=n_iter) + benchmark_allgather(cc, dtype=dtype) cc.barrier() torch.cuda.synchronize() cc.destroy() - del cc - del comm_group - gc.collect() print(f"rank {local} completed successfully.") if __name__ == "__main__": - exit_code = 0 - try: - main() - except Exception: - exit_code = 1 - traceback.print_exc() - finally: - sys.stdout.flush() - sys.stderr.flush() - os._exit(exit_code) + main() From 905b23d9a8d34071140fcd17f89b413001c44f58 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 19:00:22 +0000 Subject: [PATCH 15/44] Drop non-MNNVL multi_node regime from torch-integration example The example is now MNNVL-only: a run is either single-host (everything fits in one node) or multi-host MNNVL (one cross-host NVLink domain). Plain multi-node-without-MNNVL had its own algorithm branch that this example will never exercise, so remove the multi_node flag and the intermediate mnnvl_domain variable. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 24 ++++--------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 8d1efd533..6da9d7134 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -83,9 +83,7 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self.world_size = comm.nranks self.nranks_per_node = comm.nranks_per_node nvlink_domain_nranks = int(os.environ.get("MSCCLPP_IPC_DOMAIN_NRANKS", "0")) - self.mnnvl_domain = self.world_size > 1 and nvlink_domain_nranks >= self.world_size - self.multi_node = self.world_size > self.nranks_per_node and not self.mnnvl_domain - self.multi_host_mnnvl = self.mnnvl_domain and self.world_size > self.nranks_per_node + self.multi_host_mnnvl = nvlink_domain_nranks >= self.world_size and self.world_size > self.nranks_per_node self.symmetric_memory = symmetric_memory self._nvls = mscclpp.is_nvls_supported() @@ -108,7 +106,7 @@ def _default_ar_config(self): pkt = self._algo("allreduce", "default_allreduce_nvls_packet") if self._nvls and pkt: return (pkt, 0, 0) - if self.multi_node or self.multi_host_mnnvl: + if self.multi_host_mnnvl: rsag = self._algo("allreduce", "default_allreduce_rsag") if rsag: return (rsag, 0, 0) @@ -194,18 +192,6 @@ def _ar_candidates(self, size: int): if a: out.append(a) return out - if self.multi_node: - a = self._algo("allreduce", "default_allreduce_nvls_packet") - if self._nvls and a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_packet") - if a: - out.append(a) - if size >= 512 << 10: - a = self._algo("allreduce", "default_allreduce_rsag") - if a: - out.append(a) - return out if size <= 4 << 20: a = self._algo("allreduce", "default_allreduce_packet") if a: @@ -230,7 +216,7 @@ 
def _ar_candidates(self, size: int): return out def _ag_candidates(self): - if self.multi_node or self.multi_host_mnnvl: + if self.multi_host_mnnvl: return [] a = self._algo("allgather", "default_allgather_fullmesh2") return [a] if a else [] @@ -356,7 +342,7 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, acc ) def all_gather(self, output_tensor, input_tensor, stream=None): - if self.multi_node or self.multi_host_mnnvl: + if self.multi_host_mnnvl: raise RuntimeError("all_gather in this example currently supports only single-node runs") sz = _round_pow2(input_tensor.nbytes) if sz not in self._tune_cache["allgather"]: @@ -497,7 +483,7 @@ def main(): cc.barrier() torch.cuda.synchronize() - if cc.multi_node or cc.multi_host_mnnvl: + if cc.multi_host_mnnvl: if cc.rank == 0: print("Skipping allgather benchmark on multi-node: this example's allgather path is single-node only.") else: From 4a0d5b29d509b00268a64f6e0a5b4db602e8cb46 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 21:14:36 +0000 Subject: [PATCH 16/44] Simplify torch-integration tuning example - Drop the multi_host_mnnvl-specific rsag fallback in _default_ar_config; fall through to default_allreduce_packet when NVLS is unavailable. - Add SYMMETRIC_MEMORY env var so the tuning sweep can include the zero-copy NVLS / RSAG candidates without editing the source. - Make _algo() raise on miss (direct dict lookup) and drop the defensive 'if a:' guards in _ar_candidates / _ag_candidates / _default_ar_config; merge existence checks into the platform conditions (self._nvls, self.symmetric_memory). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 73 +++++++------------ 1 file changed, 26 insertions(+), 47 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 6da9d7134..18fdd6f14 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -99,17 +99,12 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self._time_buf = None def _algo(self, collective: str, name: str): - return self._algos.get((collective, name)) + return self._algos[(collective, name)] def _default_ar_config(self): """Fallback allreduce config for barrier / timing sync.""" - pkt = self._algo("allreduce", "default_allreduce_nvls_packet") - if self._nvls and pkt: - return (pkt, 0, 0) - if self.multi_host_mnnvl: - rsag = self._algo("allreduce", "default_allreduce_rsag") - if rsag: - return (rsag, 0, 0) + if self._nvls: + return (self._algo("allreduce", "default_allreduce_nvls_packet"), 0, 0) return (self._algo("allreduce", "default_allreduce_packet"), 0, 0) # -- low-level execute -- @@ -157,7 +152,7 @@ def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None): def _barrier_internal(self): a, nb, nt = self._default_ar_config() - self._exec_ar(self._barrier_tensor, a, nb, nt, sym=self.symmetric_memory) + self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True) # -- lazy tuning -- @@ -173,53 +168,33 @@ def _ar_candidates(self, size: int): if self.multi_host_mnnvl: if size <= 4 << 20: if size <= 128 << 10: - a = self._algo("allreduce", "default_allreduce_allpair_packet") - if a: - out.append(a) - if size <= 64 << 10: - a = self._algo("allreduce", "default_allreduce_nvls_packet") - if self._nvls and a: - out.append(a) + out.append(self._algo("allreduce", "default_allreduce_allpair_packet")) + if size <= 64 << 10 and self._nvls: + 
out.append(self._algo("allreduce", "default_allreduce_nvls_packet")) if size > 128 << 10: - a = self._algo("allreduce", "default_allreduce_packet") - if a: - out.append(a) + out.append(self._algo("allreduce", "default_allreduce_packet")) if size >= 512 << 10: - a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") - if self._nvls and self.symmetric_memory and a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_rsag") - if a: - out.append(a) + if self._nvls and self.symmetric_memory: + out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy")) + out.append(self._algo("allreduce", "default_allreduce_rsag")) return out if size <= 4 << 20: - a = self._algo("allreduce", "default_allreduce_packet") - if a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_allpair_packet") - if a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_nvls_packet") - if self._nvls and a: - out.append(a) + out.append(self._algo("allreduce", "default_allreduce_packet")) + out.append(self._algo("allreduce", "default_allreduce_allpair_packet")) + if self._nvls: + out.append(self._algo("allreduce", "default_allreduce_nvls_packet")) if size >= 512 << 10: - a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") - if a: - out.append(a) - a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") - if self._nvls and self.symmetric_memory and a: - out.append(a) + out.append(self._algo("allreduce", "default_allreduce_rsag_zero_copy")) + if self._nvls and self.symmetric_memory: + out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy")) if torch.version.hip is not None: - a = self._algo("allreduce", "default_allreduce_fullmesh") - if a: - out.append(a) + out.append(self._algo("allreduce", "default_allreduce_fullmesh")) return out def _ag_candidates(self): if self.multi_host_mnnvl: return [] - a = self._algo("allgather", "default_allgather_fullmesh2") - return [a] if a else [] + return [self._algo("allgather", 
"default_allgather_fullmesh2")] def _run_tune(self, collective, algo, buf, size, nb, nt): """Single tune invocation for either collective.""" @@ -474,11 +449,15 @@ def main(): accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16} accum_str = os.environ.get("ACCUM_DTYPE") accum_dtype = accum_map.get(accum_str) if accum_str else None + symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "0") == "1" comm_group = init_dist() - cc = CustomizedComm(comm_group) + cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory) - print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...") + print( + f"rank {local} starting benchmarks with dtype={dtype} " + f"accum_dtype={accum_dtype} symmetric_memory={symmetric_memory}..." + ) benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype) cc.barrier() torch.cuda.synchronize() From 307a4718884a59dd2acead9ca899a1667598b470 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 21:37:09 +0000 Subject: [PATCH 17/44] Shorten verbose comments and use THROW in validateIpcDomainSpansWorld - Collapse the duplicated 3-line warp-strided-load comment in 5 kernels (allgather_fullmesh, allreduce_fullmesh, allreduce_packet, allreduce_nvls_zero_copy, allreduce_nvls_warp_pipeline) into a single one-line 'Peer count may exceed WARP_SIZE on MNNVL.' note. - Drop the algName parameter from validateIpcDomainSpansWorld; switch its 3 throws to use the THROW logger macro (LogSubsys::ALGO), which already captures file/line/function. Update the 3 callsites (nvls_block_pipeline, nvls_warp_pipeline, nvls_zero_copy) and trim the Doxygen comment accordingly. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../allgather/allgather_fullmesh.cu | 4 +--- .../allreduce/allreduce_fullmesh.cu | 4 +--- .../allreduce_nvls_block_pipeline.cu | 2 +- .../allreduce/allreduce_nvls_warp_pipeline.cu | 6 ++---- .../allreduce/allreduce_nvls_zero_copy.cu | 6 ++---- .../collectives/allreduce/allreduce_packet.cu | 4 +--- src/ext/collectives/collective_utils.cc | 20 +++++++++---------- .../collectives/include/collective_utils.hpp | 10 +++------- 8 files changed, 21 insertions(+), 35 deletions(-) diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 8ce77fca1..a4196c6cd 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -30,9 +30,7 @@ __global__ void __launch_bounds__(1024, 1) __shared__ DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; - // Each warp redundantly loads all entries (same value, benign race) so that - // every warp has the data its threads will read after __syncwarp(). Required - // when nPeer > WARP_SIZE (MNNVL/NVL72 scale). + // Peer count may exceed WARP_SIZE on MNNVL. for (int i = lid; i < nPeer; i += WARP_SIZE) { channels[i] = memoryChans[i]; } diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index f1d815604..ef7ecf74d 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -52,9 +52,7 @@ __global__ void __launch_bounds__(512, 1) __shared__ DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; __shared__ DeviceHandle outChannels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; - // Each warp redundantly loads all entries (same value, benign race) so that - // every warp has the data its threads will read after __syncwarp(). 
Required - // when nPeer > WARP_SIZE (MNNVL/NVL72 scale). + // Peer count may exceed WARP_SIZE on MNNVL. for (int i = lid; i < nPeer; i += WARP_SIZE) { channels[i] = memoryChans[i]; outChannels[i] = memoryOutChans[i]; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 4eeb03355..8c4a1e236 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -178,7 +178,7 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsBlockPipeline"); + ipcDomainNranks_ = validateIpcDomainSpansWorld(comm); // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel). if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) { throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) + diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 05e4f747f..950c287bf 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -59,9 +59,7 @@ __global__ void __launch_bounds__(1024, 1) auto memoryChans = memoryChannels + chanOffset; __shared__ DeviceHandle channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2]; const int lid = threadIdx.x % WARP_SIZE; - // Each warp redundantly loads all entries (same value, benign race) so that - // every warp has the data its threads will read after __syncwarp(). Required - // when nPeers*2 > WARP_SIZE (MNNVL scale). + // Peer count may exceed WARP_SIZE on MNNVL. 
for (int i = lid; i < nPeers * 2; i += WARP_SIZE) { channels[i] = memoryChans[i]; } @@ -144,7 +142,7 @@ struct NvlsWarpPipelineAdapter { void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = NUM_NVLS_CONNECTION; - ipcDomainNranks_ = validateIpcDomainSpansWorld(comm, "AllreduceNvlsWarpPipeline"); + ipcDomainNranks_ = validateIpcDomainSpansWorld(comm); // The warp-pipeline kernel addresses 2 * nPeers entries per block in `memoryChannels`, // so per-peer base channel allocation must be at least `2 * nBlocks`. Default // nBlocks = 4 * ipcDomainNranks (see allreduceKernelFunc), so size accordingly. diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 5d6fc4d37..6ab0cd639 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -45,9 +45,7 @@ __global__ void __launch_bounds__(1024, 1) auto memoryChans = memoryChannels + chanOffset; __shared__ mscclpp::DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = threadIdx.x % WARP_SIZE; - // Each warp redundantly loads all entries (same value, benign race) so that - // every warp has the data its threads will read after __syncwarp(). Required - // when nPeers > WARP_SIZE (MNNVL/NVL72 → 71 peers). + // Peer count may exceed WARP_SIZE on MNNVL. 
for (int i = lid; i < ipcDomainNranks - 1; i += WARP_SIZE) { channels[i] = memoryChans[i]; } @@ -107,7 +105,7 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); computeCapabilityMajor_ = deviceProp.major; nSwitchChannels_ = 32; - validateIpcDomainSpansWorld(comm, "AllreduceNvls"); + validateIpcDomainSpansWorld(comm); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index cc91370ca..7bc9a85f1 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -80,9 +80,7 @@ __global__ void __launch_bounds__(1024, 1) // Put channels into shared memory, read channel info from global memory is unexpectable slow. __shared__ mscclpp::DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int lid = tid % WARP_SIZE; - // Each warp redundantly loads all entries (same value, benign race) so that - // every warp has the data its threads will read after __syncwarp(). Required - // when nPeers > WARP_SIZE (MNNVL/NVL72 scale). + // Peer count may exceed WARP_SIZE on MNNVL. 
for (int i = lid; i < nPeers; i += WARP_SIZE) { channels[i] = memoryChannels[i]; } diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index 33b6ef779..e4eb7142c 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -11,6 +11,8 @@ #include #include +#include "logger.hpp" + namespace mscclpp { namespace collective { std::vector setupRemoteMemories(std::shared_ptr comm, int rank, @@ -79,24 +81,22 @@ int getIpcDomainNranks(std::shared_ptr comm) { return comm->bootstrap()->getNranksPerNode(); } -int validateIpcDomainSpansWorld(std::shared_ptr comm, const char* algName) { +int validateIpcDomainSpansWorld(std::shared_ptr comm) { const int ipcDomainNranks = getIpcDomainNranks(comm); const int worldSize = comm->bootstrap()->getNranks(); const int rank = comm->bootstrap()->getRank(); if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) { - throw mscclpp::Error(std::string(algName) + ": ipcDomainNranks " + std::to_string(ipcDomainNranks) + - " is out of supported range [2, " + std::to_string(MAX_IPC_DOMAIN_NRANKS) + "]", - mscclpp::ErrorCode::InvalidUsage); + THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "ipcDomainNranks ", + ipcDomainNranks, " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]"); } if (worldSize != ipcDomainNranks) { - throw mscclpp::Error(std::string(algName) + " requires worldSize == ipcDomainNranks (got worldSize=" + - std::to_string(worldSize) + ", ipcDomainNranks=" + std::to_string(ipcDomainNranks) + ")", - mscclpp::ErrorCode::InvalidUsage); + THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, + "requires worldSize == ipcDomainNranks (got worldSize=", worldSize, ", ipcDomainNranks=", ipcDomainNranks, + ")"); } if (rank < 0 || rank >= ipcDomainNranks) { - throw mscclpp::Error(std::string(algName) + ": rank " + std::to_string(rank) + " out of [0, " + - 
std::to_string(ipcDomainNranks) + ")", - mscclpp::ErrorCode::InvalidUsage); + THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ", + ipcDomainNranks, ")"); } return ipcDomainNranks; } diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 892df3b11..6b0c6ab48 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -60,13 +60,9 @@ int getIpcDomainNranks(std::shared_ptr comm); /// Validates that the IPC domain spans the whole communicator and that the local rank fits within /// the supported `[2, MAX_IPC_DOMAIN_NRANKS]` range. Used by NVLS allreduce algorithms whose -/// multicast group spans the whole communicator (see `setupNvlsConnections`) and whose kernels -/// use the global rank to compute per-rank offsets while sizing per-rank work by -/// `ipcDomainNranks`. These assumptions only hold when the IPC-reachable peer group is exactly -/// the whole communicator (e.g. a fully populated MNNVL fabric like NVL72). Returns the validated -/// `ipcDomainNranks`; throws `Error(InvalidUsage)` on violation. `algName` is used as a prefix -/// in error messages. -int validateIpcDomainSpansWorld(std::shared_ptr comm, const char* algName); +/// multicast group spans the whole communicator. Returns the validated `ipcDomainNranks`; throws +/// `Error(InvalidUsage)` on violation. +int validateIpcDomainSpansWorld(std::shared_ptr comm); std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels); From f0c6ac081f23425e3a91c1493a1f4c7f40909600 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 21:49:48 +0000 Subject: [PATCH 18/44] Fold validateIpcDomainSpansWorld into getIpcDomainNranks getIpcDomainNranks now performs the range / world-size / rank checks itself and throws on violation, so the separate validateIpcDomainSpansWorld helper is unnecessary. 
Update the 3 NVLS callsites (block_pipeline, warp_pipeline, nvls_zero_copy) to call getIpcDomainNranks directly. The non-NVLS callers also pick up the strict validation, which is fine because they are only invoked in single-host or multi-host MNNVL scenarios where worldSize == ipcDomainNranks (the NCCL adapter's multi-node path returns nullptr, falling back to NCCL/RCCL). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../allreduce/allreduce_nvls_block_pipeline.cu | 2 +- .../allreduce/allreduce_nvls_warp_pipeline.cu | 2 +- .../allreduce/allreduce_nvls_zero_copy.cu | 2 +- src/ext/collectives/collective_utils.cc | 9 +-------- src/ext/collectives/include/collective_utils.hpp | 15 ++++----------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 8c4a1e236..f5c0d2f85 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -178,7 +178,7 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - ipcDomainNranks_ = validateIpcDomainSpansWorld(comm); + ipcDomainNranks_ = getIpcDomainNranks(comm); // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel). 
if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) { throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) + diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 950c287bf..02b899aa2 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -142,7 +142,7 @@ struct NvlsWarpPipelineAdapter { void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = NUM_NVLS_CONNECTION; - ipcDomainNranks_ = validateIpcDomainSpansWorld(comm); + ipcDomainNranks_ = getIpcDomainNranks(comm); // The warp-pipeline kernel addresses 2 * nPeers entries per block in `memoryChannels`, // so per-peer base channel allocation must be at least `2 * nBlocks`. Default // nBlocks = 4 * ipcDomainNranks (see allreduceKernelFunc), so size accordingly. diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 6ab0cd639..115a229ae 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -105,7 +105,7 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); computeCapabilityMajor_ = deviceProp.major; nSwitchChannels_ = 32; - validateIpcDomainSpansWorld(comm); + getIpcDomainNranks(comm); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index e4eb7142c..6acfd7ce0 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -75,14 +75,7 @@ std::vector> setupMemoryS int getIpcDomainNranks(std::shared_ptr comm) { const int envValue = mscclpp::env()->ipcDomainNranks; - if (envValue 
> 0) { - return envValue; - } - return comm->bootstrap()->getNranksPerNode(); -} - -int validateIpcDomainSpansWorld(std::shared_ptr comm) { - const int ipcDomainNranks = getIpcDomainNranks(comm); + const int ipcDomainNranks = (envValue > 0) ? envValue : comm->bootstrap()->getNranksPerNode(); const int worldSize = comm->bootstrap()->getNranks(); const int rank = comm->bootstrap()->getRank(); if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) { diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 6b0c6ab48..280a63328 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -51,18 +51,11 @@ std::vector setupConnections(std::shared_ptr comm); std::vector> setupMemorySemaphores( std::shared_ptr comm, const std::vector& connections, int nChannelsPerConnection); -/// Number of ranks that participate in the same GPU-IPC-reachable peer group (e.g. a single host or -/// a Multi-Node NVLink fabric, or an AMD XGMI domain). Returns the value of `MSCCLPP_IPC_DOMAIN_NRANKS` -/// if set to a positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. This is -/// intentionally independent of `Bootstrap::getNranksPerNode()` so that algorithms can opt in to -/// MNNVL-like behavior without changing the meaning of bootstrap-level APIs. -int getIpcDomainNranks(std::shared_ptr comm); - -/// Validates that the IPC domain spans the whole communicator and that the local rank fits within -/// the supported `[2, MAX_IPC_DOMAIN_NRANKS]` range. Used by NVLS allreduce algorithms whose -/// multicast group spans the whole communicator. Returns the validated `ipcDomainNranks`; throws +/// Returns the IPC-reachable peer-group size, validated to span the whole communicator and +/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. 
Reads `MSCCLPP_IPC_DOMAIN_NRANKS` if set to a +/// positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws /// `Error(InvalidUsage)` on violation. -int validateIpcDomainSpansWorld(std::shared_ptr comm); +int getIpcDomainNranks(std::shared_ptr comm); std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels); From bde23ce38e6399e52d4662018935863d5654fd4a Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 22:16:08 +0000 Subject: [PATCH 19/44] Revert verbose RSAG zero-copy comment; rename NRanksPerNode template param - Restore the original two-line note about the templated peer-loop unrolling instead of the multi-paragraph rationale block. - Rename the kernel template parameter from NRanksPerNode to NRanks. The IPC domain can span multiple physical hosts under MNNVL, so the 'PerNode' suffix is misleading; NRanks matches the runtime ipcDomainNranks parameter that drives template dispatch. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../allreduce/allreduce_rsag_zero_copy.cu | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index a20756aee..c678c2670 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -35,17 +35,10 @@ __device__ mscclpp::DeviceSyncer globalSyncer; // // This approach requires registering both input and output buffers as remote // memories (2 * nPeers handles), but avoids scratch buffer allocation and -// the extra copy steps of the standard RSAG. -// -// The kernel is templated on NRanksPerNode so the compiler can keep an int4 -// register array of NPeers elements, #pragma unroll the peer loops, and turn -// the per-iteration modulo into a single AND. 
This issues all NPeers remote -// reads in parallel so their latency is overlapped instead of serialized. -// Only small fixed sizes ({4, 8}) are instantiated; larger MNNVL domains -// (where the int4 array would spill out of registers) must use a different -// algorithm. +// the extra copy steps of the standard RSAG. The NRanks template +// parameter enables compile-time unrolling of peer loops (supports 4 or 8). -template +template __global__ void __launch_bounds__(1024, 1) allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, @@ -55,12 +48,12 @@ __global__ void __launch_bounds__(1024, 1) assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); - constexpr int NPeers = NRanksPerNode - 1; + constexpr int NPeers = NRanks - 1; constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); constexpr uint32_t outputRemoteBufferOffset = NPeers; - uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * NRanksPerNode; - uint32_t nelemsPerRank = alignedNelems / NRanksPerNode; + uint32_t alignedNelems = + ((nelems + NRanks - 1) / NRanks + nelemsPerInt4 - 1) / nelemsPerInt4 * nelemsPerInt4 * NRanks; + uint32_t nelemsPerRank = alignedNelems / NRanks; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4; @@ -93,7 +86,7 @@ __global__ void __launch_bounds__(1024, 1) int4 tmp_raw = buff4[offset]; #pragma unroll for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % NRanksPerNode; + int rankIdx = (rank + i + 1) % NRanks; int peerIdx = rankIdx < rank ? 
rankIdx : rankIdx - 1; data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); } @@ -105,7 +98,7 @@ __global__ void __launch_bounds__(1024, 1) int4 tmp = mscclpp::downcastVector(acc); #pragma unroll for (int i = 0; i < NPeers; i++) { - int rankIdx = (rank + i + 1) % NRanksPerNode; + int rankIdx = (rank + i + 1) % NRanks; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp); } From 095cfff11d00e93a09f24ee391161a6f1209dc1b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 22:23:18 +0000 Subject: [PATCH 20/44] Revert RSAG nBlocks default to 64 The 128-block default fires only when the caller passes nBlocks=0 (i.e. no tuning). Tuning explicitly drives nBlocks via the adapter, so the historical default of 64 is fine. Keep nChannelsPerConnection_=128 so the tuner can still request up to 128 blocks for MNNVL configs. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ext/collectives/allreduce/allreduce_rsag.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index 93e2d0c46..4dcceb48e 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -133,7 +133,7 @@ struct AllreduceRsAgAdapter { size_t nelems = inputSize / sizeof(T); if (nBlocks == 0 || nThreadsPerBlock == 0) { nThreadsPerBlock = 1024; - nBlocks = 128; + nBlocks = 64; } allreduceRsAg<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, From 639b80de7b5fd031a283967c77e2de65103ce379 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 22:31:15 +0000 Subject: [PATCH 21/44] Tie AllreduceAllpairPacket maxBlockNum_ to MAX_IPC_DOMAIN_NRANKS - 1 The hard-coded 72 was off by one from what the comment claims is the minimum (MAX_IPC_DOMAIN_NRANKS 
- 1 = 71). Express the value via the constant so the relationship is self-documenting and any future change to MAX_IPC_DOMAIN_NRANKS propagates automatically. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../collectives/include/allreduce/allreduce_allpair_packet.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index 79c211b39..d2ea7259e 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -4,6 +4,7 @@ #include #include "allreduce/common.hpp" +#include "collective_utils.hpp" namespace mscclpp { namespace collective { @@ -31,7 +32,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { const int nSegmentsForScratchBuffer_ = 2; // Must be at least MAX_IPC_DOMAIN_NRANKS-1 so the adapter can launch one // block per peer at MNNVL scale. - const int maxBlockNum_ = 72; + const int maxBlockNum_ = MAX_IPC_DOMAIN_NRANKS - 1; std::vector conns_; std::vector> memorySemaphores_; std::vector registeredMemories_; From e8caab7c8e866d6ff89e86d7bcf7c0f011f19021 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 23:04:41 +0000 Subject: [PATCH 22/44] Strip preflight validation blocks from NVLS pipeline allreduce kernels allreduce_nvls_block_pipeline.cu and allreduce_nvls_warp_pipeline.cu were carrying ~45 lines of per-call invariant-checking added during the MNNVL work. Restore main's simple defaulting pattern (just `if (==0) set defaults`); incorrect inputs will manifest as CUDA errors via the existing error-handling path. Also drop the unreachable `6 * ipcDomainNranks > NUM_SEMAPHORES` throw in the block_pipeline initialize (max ipcDomainNranks=72, NUM_SEMAPHORES=512), the now-unused `` include, and trim the verbose comments around `nBaseChannels_` sizing in both files. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../allreduce_nvls_block_pipeline.cu | 48 +-------------- .../allreduce/allreduce_nvls_warp_pipeline.cu | 59 +------------------ 2 files changed, 6 insertions(+), 101 deletions(-) diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index f5c0d2f85..9d3316e4c 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -3,7 +3,6 @@ #include #include -#include #include "allreduce/allreduce_nvls_block_pipeline.hpp" #include "allreduce/common.hpp" @@ -179,14 +178,7 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; ipcDomainNranks_ = getIpcDomainNranks(comm); - // Block-pipeline device-side semaphore indices grow as 6 * ipcDomainNranks (see kernel). - if (6 * ipcDomainNranks_ > NUM_SEMAPHORES) { - throw Error("AllreduceNvlsBlockPipeline: ipcDomainNranks " + std::to_string(ipcDomainNranks_) + - " exceeds NUM_SEMAPHORES capacity (" + std::to_string(NUM_SEMAPHORES) + ")", - ErrorCode::InvalidUsage); - } - // The kernel addresses up to `2 * nBlocksForCopy = 4 * ipcDomainNranks` distinct entries - // per peer in `memoryChannels`. Scale the per-connection allocation to match. + // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel). 
nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_); this->conns_ = setupConnections(comm); // setup semaphores @@ -208,43 +200,9 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc( WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; } - const int requiredBlocks = ctx->ipcDomainNranks * 5; std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; - if (blockAndThreadNum.first == 0) blockAndThreadNum.first = requiredBlocks; - if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024; - if (blockAndThreadNum.first != requiredBlocks) { - WARN("AllreduceNvlsBlockPipeline requires nBlocks == 5 * ipcDomainNranks (got %d, expected %d)", - blockAndThreadNum.first, requiredBlocks); - return CommResult::CommInvalidArgument; - } - if (blockAndThreadNum.second != 1024) { - WARN("AllreduceNvlsBlockPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second); - return CommResult::CommInvalidArgument; - } - // Validate input alignment/divisibility expectations of the kernel. - constexpr size_t kKernelAlignment = 16; - const size_t perRankBytes = inputSize / ctx->ipcDomainNranks; - if (perRankBytes * static_cast(ctx->ipcDomainNranks) != inputSize || perRankBytes % kKernelAlignment != 0) { - WARN( - "AllreduceNvlsBlockPipeline requires inputSize %% (ipcDomainNranks * %zu) == 0 (got inputSize=%zu, " - "ipcDomainNranks=%d)", - kKernelAlignment, inputSize, ctx->ipcDomainNranks); - return CommResult::CommInvalidArgument; - } - // Validate scratch is large enough for at least one pipeline iteration. The kernel - // computes scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) aligned down - // to unitSize; if this is 0, maxItersForScratch is 0 and the kernel deadlocks. - const size_t unitSize = (inputSize <= static_cast(1024) * 1024 * 128) ? 
(1ULL << 16) : (1ULL << 17); - const size_t scratchSizePerRank = this->scratchBufferSize_ / ctx->ipcDomainNranks; - const size_t nBlocksForCopy = static_cast(ctx->ipcDomainNranks) * 2; - const size_t scratchSizePerBlock = (scratchSizePerRank / nBlocksForCopy) / unitSize * unitSize; - if (scratchSizePerBlock < unitSize) { - WARN( - "AllreduceNvlsBlockPipeline scratch buffer too small for ipcDomainNranks=%d and inputSize=%zu " - "(scratchBufferSize=%zu, need at least ~%zu bytes)", - ctx->ipcDomainNranks, inputSize, this->scratchBufferSize_, - static_cast(ctx->ipcDomainNranks) * nBlocksForCopy * unitSize); - return CommResult::CommInvalidArgument; + if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { + blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 02b899aa2..73ecdab9d 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -3,7 +3,6 @@ #include #include -#include #include "allreduce/allreduce_nvls_warp_pipeline.hpp" #include "allreduce/common.hpp" @@ -143,9 +142,7 @@ struct NvlsWarpPipelineAdapter { void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = NUM_NVLS_CONNECTION; ipcDomainNranks_ = getIpcDomainNranks(comm); - // The warp-pipeline kernel addresses 2 * nPeers entries per block in `memoryChannels`, - // so per-peer base channel allocation must be at least `2 * nBlocks`. Default - // nBlocks = 4 * ipcDomainNranks (see allreduceKernelFunc), so size accordingly. + // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks. 
nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_); this->conns_ = setupConnections(comm); // setup semaphores @@ -168,58 +165,8 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( return CommResult::CommInvalidArgument; } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; - if (blockAndThreadNum.first == 0) { - // Default to 4 * ipcDomainNranks blocks, rounded up to a multiple of NUM_NVLS_CONNECTION - // so that nBlocks / NUM_NVLS_CONNECTION partitioning in the kernel is well-defined. - int defaultBlocks = ctx->ipcDomainNranks * 4; - defaultBlocks = ((defaultBlocks + NUM_NVLS_CONNECTION - 1) / NUM_NVLS_CONNECTION) * NUM_NVLS_CONNECTION; - blockAndThreadNum.first = std::max(defaultBlocks, NUM_NVLS_CONNECTION); - } - if (blockAndThreadNum.second == 0) blockAndThreadNum.second = 1024; - // The kernel computes nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION and indexes the - // multicast handle array with bid / nBlocksPerNvlsConn; both must be safe. - if (blockAndThreadNum.first < NUM_NVLS_CONNECTION || blockAndThreadNum.first % NUM_NVLS_CONNECTION != 0) { - WARN("AllreduceNvlsWarpPipeline requires nBlocks to be a positive multiple of %d (got %d)", NUM_NVLS_CONNECTION, - blockAndThreadNum.first); - return CommResult::CommInvalidArgument; - } - // Each block uses 2 * nPeers consecutive entries in `memoryChannels`, so the per-peer - // base-channel allocation must support 2 * nBlocks distinct entries. - if (2 * blockAndThreadNum.first > this->nBaseChannels_) { - WARN( - "AllreduceNvlsWarpPipeline: nBlocks %d exceeds channel allocation (nBaseChannels=%d, " - "ipcDomainNranks=%d). Increase MSCCLPP_IPC_DOMAIN_NRANKS-aware sizing or reduce nBlocks.", - blockAndThreadNum.first, this->nBaseChannels_, ctx->ipcDomainNranks); - return CommResult::CommInvalidArgument; - } - // The kernel hard-codes 14 + 4 + 14 = 32 warps per block and bar.sync member counts - // computed from these constants; deviating from 1024 threads breaks those barriers. 
- if (blockAndThreadNum.second != 1024) { - WARN("AllreduceNvlsWarpPipeline requires nThreadsPerBlock == 1024 (got %d)", blockAndThreadNum.second); - return CommResult::CommInvalidArgument; - } - // Validate input divisibility by ipcDomainNranks (kernel computes size / ipcDomainNranks). - if (inputSize % static_cast(ctx->ipcDomainNranks) != 0) { - WARN("AllreduceNvlsWarpPipeline requires inputSize %% ipcDomainNranks == 0 (got inputSize=%zu, ipcDomainNranks=%d)", - inputSize, ctx->ipcDomainNranks); - return CommResult::CommInvalidArgument; - } - // Validate scratch is large enough for at least one pipeline iteration. The kernel - // computes scratchSizePerBlock = (scratchSizePerRank / nBlocks) aligned down to copyPerIter; - // if this is 0 the modulo offset arithmetic divides by zero. - const size_t sizePerRank = inputSize / static_cast(ctx->ipcDomainNranks); - const size_t maxSizePerBlock = ((sizePerRank + blockAndThreadNum.first - 1) / blockAndThreadNum.first + 15) / 16 * 16; - const size_t copyPerIter = (maxSizePerBlock >= 1024 * 64) ? 
(1024 * 32) : (1024 * 16); - const size_t scratchSizePerRank = this->scratchBufferSize_ / static_cast(ctx->ipcDomainNranks); - const size_t scratchSizePerBlock = - (scratchSizePerRank / static_cast(blockAndThreadNum.first)) / copyPerIter * copyPerIter; - if (scratchSizePerBlock < copyPerIter) { - WARN( - "AllreduceNvlsWarpPipeline scratch buffer too small for ipcDomainNranks=%d, nBlocks=%d, inputSize=%zu " - "(scratchBufferSize=%zu, need at least ~%zu bytes)", - ctx->ipcDomainNranks, blockAndThreadNum.first, inputSize, this->scratchBufferSize_, - static_cast(ctx->ipcDomainNranks) * static_cast(blockAndThreadNum.first) * copyPerIter); - return CommResult::CommInvalidArgument; + if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { + blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, From 7d80a333603bb18e63798242682520c7e9a43c8b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 6 May 2026 23:43:37 +0000 Subject: [PATCH 23/44] Default torch example SYMMETRIC_MEMORY env to 1 The non-symmetric rsag_zero_copy path uses an incrementing tag in its context key, so cross-rank memory registration handshakes happen on every call rather than being cached. At single-host x 8 GPUs and sizes >= 512 KB this becomes the only candidate (since nvls_zero_copy is filtered out without symmetric memory) and degrades into apparent hang. Defaulting SYMMETRIC_MEMORY=1 lets a plain `mpirun ...` invocation work out of the box; users can still override with `SYMMETRIC_MEMORY=0` to exercise the non-symmetric path. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- examples/torch-integration/customized_comm_with_tuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 18fdd6f14..cbfb419d4 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -449,7 +449,7 @@ def main(): accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16} accum_str = os.environ.get("ACCUM_DTYPE") accum_dtype = accum_map.get(accum_str) if accum_str else None - symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "0") == "1" + symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "1") == "1" comm_group = init_dist() cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory) From d1b04a3b26567f7e27c0eefb52b8e4dcc273874a Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 7 May 2026 00:38:31 +0000 Subject: [PATCH 24/44] NVLS zero-copy allreduce: support FP16 accumulator for FP8 inputs multimem.ld_reduce on FP8 inputs accumulates in FP32 by default. The ISA also exposes an .acc::f16 variant that keeps the reduction in FP16, which is faster but lower precision. Plumb AccumT through: - include/mscclpp/switch_channel_device.hpp: Extend SwitchChannelDeviceHandle::multimemLoadReduce with an optional AccumT template parameter. When VectorType is one of the FP8 vector types (f8_e4m3x{4,8,16} / f8_e5m2x{4,8,16}) and AccumT is __half, emit the .acc::f16 form of the instruction; otherwise unchanged. - src/ext/collectives/include/allreduce/common.hpp: Make handleMultiLoadReduceStore template on AccumT and forward it to multimemLoadReduce(...). 
- src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu: Template allreduceNvls and NvlsAdapter on AccumT and forward to handleMultiLoadReduceStore; the existing dispatch<> machinery already plumbs AccumT through from the algorithm context. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 51 ++++++----- include/mscclpp/switch_channel_device.hpp | 84 ++++++++++++++----- .../allreduce/allreduce_nvls_zero_copy.cu | 12 +-- .../collectives/include/allreduce/common.hpp | 7 +- 4 files changed, 100 insertions(+), 54 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index cbfb419d4..44a5c9c10 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -76,6 +76,25 @@ class CustomizedComm: "default_allreduce_fullmesh": 64, "default_allgather_fullmesh2": 32, } + # (algo_name, min_size, max_size, predicate) + # Boundaries are inclusive on both ends. max_size=None means unbounded. + # predicate=None means always applicable; otherwise a callable taking `self`. 
+ _AR_CANDIDATES_MNNVL = [ + ("default_allreduce_allpair_packet", 0, 128 << 10, None), + ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls), + ("default_allreduce_packet", 128 << 10, 4 << 20, None), + ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory), + ("default_allreduce_rsag_zero_copy", 512 << 10, None, None), + ("default_allreduce_rsag", 512 << 10, None, None), + ] + _AR_CANDIDATES_SINGLE = [ + ("default_allreduce_packet", 0, 4 << 20, None), + ("default_allreduce_allpair_packet", 0, 4 << 20, None), + ("default_allreduce_nvls_packet", 0, 4 << 20, lambda c: c._nvls), + ("default_allreduce_rsag_zero_copy", 512 << 10, None, None), + ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory), + ("default_allreduce_fullmesh", 0, None, lambda c: torch.version.hip is not None), + ] def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self.comm = comm @@ -164,32 +183,12 @@ def _ensure_tune_bufs(self): return self._tune_buf def _ar_candidates(self, size: int): - out = [] - if self.multi_host_mnnvl: - if size <= 4 << 20: - if size <= 128 << 10: - out.append(self._algo("allreduce", "default_allreduce_allpair_packet")) - if size <= 64 << 10 and self._nvls: - out.append(self._algo("allreduce", "default_allreduce_nvls_packet")) - if size > 128 << 10: - out.append(self._algo("allreduce", "default_allreduce_packet")) - if size >= 512 << 10: - if self._nvls and self.symmetric_memory: - out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy")) - out.append(self._algo("allreduce", "default_allreduce_rsag")) - return out - if size <= 4 << 20: - out.append(self._algo("allreduce", "default_allreduce_packet")) - out.append(self._algo("allreduce", "default_allreduce_allpair_packet")) - if self._nvls: - out.append(self._algo("allreduce", "default_allreduce_nvls_packet")) - if size >= 512 << 10: - out.append(self._algo("allreduce", 
"default_allreduce_rsag_zero_copy")) - if self._nvls and self.symmetric_memory: - out.append(self._algo("allreduce", "default_allreduce_nvls_zero_copy")) - if torch.version.hip is not None: - out.append(self._algo("allreduce", "default_allreduce_fullmesh")) - return out + table = self._AR_CANDIDATES_MNNVL if self.multi_host_mnnvl else self._AR_CANDIDATES_SINGLE + return [ + self._algo("allreduce", name) + for name, lo, hi, pred in table + if size >= lo and (hi is None or size <= hi) and (pred is None or pred(self)) + ] def _ag_candidates(self): if self.multi_host_mnnvl: diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index b52b65723..7b749f7a9 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -37,7 +37,11 @@ struct SwitchChannelDeviceHandle { SwitchChannelDeviceHandle::multimemStore(val, reinterpret_cast(mcPtr) + index); } - template + /// Vectorized multimem load+reduce. The optional `AccumT` template parameter selects the + /// accumulator: when `AccumT == __half` and `VectorType` is an FP8 vector type, the + /// `.acc::f16` variant of the instruction is used (faster but lower precision than the + /// default FP32 accumulator). For all other types `AccumT` is ignored. 
+ template MSCCLPP_DEVICE_INLINE static VectorType multimemLoadReduce(VectorType* ptr) { VectorType val; if constexpr (std::is_same_v) { @@ -81,29 +85,71 @@ struct SwitchChannelDeviceHandle { : "l"(ptr) : "memory"); } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e4m3x4 %0, [%1];" + : "=r"(val.words[0]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" - : "=r"(val.words[0]), "=r"(val.words[1]) - : "l"(ptr) - : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e4m3x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" - : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) - : "l"(ptr) - : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : 
"l"(ptr) : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e5m2x4 %0, [%1];" + : "=r"(val.words[0]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" - : "=r"(val.words[0]), "=r"(val.words[1]) - : "l"(ptr) - : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v2.e5m2x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" + : "=r"(val.words[0]), "=r"(val.words[1]) + : "l"(ptr) + : "memory"); + } } else if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" - : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) - : "l"(ptr) - : "memory"); + if constexpr (std::is_same_v) { + asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } else { + asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" + : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) + : "l"(ptr) + : "memory"); + } } else { static_assert(dependentFalse, "Not supported type"); } diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 115a229ae..99146779c 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -14,7 +14,7 @@ namespace collective { constexpr int MAX_NBLOCKS = 32; -template +template __global__ void 
__launch_bounds__(1024, 1) allreduceNvls([[maybe_unused]] mscclpp::DeviceHandle* memoryChannels, [[maybe_unused]] mscclpp::DeviceHandle* multicast, @@ -58,8 +58,8 @@ __global__ void __launch_bounds__(1024, 1) T* src = (T*)multicastPtr->mcPtr; T* dst = (T*)multicastOutPtr->mcPtr; if (curBlockSize > 0) { - handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize, - threadIdx.x, blockDim.x); + handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, + curBlockSize, threadIdx.x, blockDim.x); } __syncthreads(); if (threadIdx.x < nPeers) { @@ -90,9 +90,9 @@ struct NvlsAdapter { #endif { using ChannelType = DeviceHandle; - allreduceNvls<<>>((ChannelType*)memoryChannels, nvlsChannels, - nvlsOutChannels, channelInOffset, channelOutOffset, - inputSize, rank, ipcDomainNranks); + allreduceNvls<<>>( + (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize, + rank, ipcDomainNranks); return cudaGetLastError(); } } diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 93b18e262..22513ace5 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -36,7 +36,7 @@ MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() { } } -template +template MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t srcOffset, size_t dstOffset, size_t size, int tid, int nThreads) { // nvls can only handle 4 bytes alignment @@ -54,7 +54,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src vectorType* src4 = (vectorType*)src; vectorType* dst4 = (vectorType*)dst; for (size_t idx = tid; idx < nVec; idx += nThreads) { - auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx); + auto val = 
mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx); mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx); } // handle rest of data @@ -64,7 +64,8 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType); const size_t endIdx = (srcOffset + size) / sizeof(restVectorType); for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) { - auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx); + auto val = + mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx); mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx); } } From 113d859d13f08ca9533e1ba5a0d4b645c26028ee Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 8 May 2026 03:00:53 +0000 Subject: [PATCH 25/44] fix --- src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 99146779c..ef6d216ca 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -83,7 +83,8 @@ struct NvlsAdapter { // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
return cudaErrorNotSupported; } else -#if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) +#if defined(__CUDA_ARCH__) && \ + ((!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)) if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; } else From 9ff7e1c2c38ebf798728fe77eef2f6c5e31989e5 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 8 May 2026 03:43:34 +0000 Subject: [PATCH 26/44] update --- .../collectives/allreduce/allreduce_nvls_zero_copy.cu | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index ef6d216ca..63cbd057d 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -82,14 +82,7 @@ struct NvlsAdapter { } else if constexpr (std::is_same_v) { // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
return cudaErrorNotSupported; - } else -#if defined(__CUDA_ARCH__) && \ - ((!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)) - if constexpr (std::is_same_v || std::is_same_v) { - return cudaErrorNotSupported; - } else -#endif - { + } else { using ChannelType = DeviceHandle; allreduceNvls<<>>( (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize, From 654bcfa6ba403cc0c40f31cf2224b3f2fa524569 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 8 May 2026 03:54:32 +0000 Subject: [PATCH 27/44] update --- include/mscclpp/switch_channel_device.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index 7b749f7a9..841b7f320 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -84,7 +84,9 @@ struct SwitchChannelDeviceHandle { : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } +#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000) + else if constexpr (std::is_same_v) { if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.acc::f16.e4m3x4 %0, [%1];" : "=r"(val.words[0]) @@ -150,7 +152,9 @@ struct SwitchChannelDeviceHandle { : "l"(ptr) : "memory"); } - } else { + } +#endif + else { static_assert(dependentFalse, "Not supported type"); } return val; From 5516bdbb6be2b307754053898db6b940c83cb011 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 8 May 2026 04:22:50 +0000 Subject: [PATCH 28/44] fix --- include/mscclpp/switch_channel_device.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index 841b7f320..4e0396dd3 100644 --- 
a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -155,7 +155,7 @@ struct SwitchChannelDeviceHandle { } #endif else { - static_assert(dependentFalse, "Not supported type"); + assert(false && "Unsupported vector type for multimemLoadReduce"); } return val; }; @@ -219,7 +219,7 @@ struct SwitchChannelDeviceHandle { "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); } else { - static_assert(dependentFalse, "Not supported type"); + assert(false && "Unsupported vector type for multimemStore"); } }; @@ -244,7 +244,7 @@ struct SwitchChannelDeviceHandle { } else if constexpr (std::is_same_v && std::is_same_v) { asm volatile("multimem.red.relaxed.sys.global.add.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory"); } else { - static_assert(dependentFalse, "Not supported type"); + assert(false && "Unsupported vector type for multimemStoreReduce"); } }; #endif // defined(MSCCLPP_DEVICE_CUDA) From e208cc326b95c9590c09b1bce35683d9580087a7 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 8 May 2026 04:30:05 +0000 Subject: [PATCH 29/44] WIP --- include/mscclpp/switch_channel_device.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index 4e0396dd3..e95dfcf51 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -198,7 +198,9 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } +#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) && (__CUDA_ARCH__ >= 1000) + else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); } else if constexpr 
(std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), @@ -218,7 +220,9 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else { + } +#endif + else { assert(false && "Unsupported vector type for multimemStore"); } }; From 825fc124a547c0fa1e5b6df9c79c42a1aebf627f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 9 May 2026 03:16:33 +0000 Subject: [PATCH 30/44] address hang issue --- src/ext/collectives/allreduce/allreduce_nvls_packet.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index 2ef0516e3..56455b6ea 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -83,6 +83,7 @@ void AllreduceNvlsPacket::initialize(std::shared_ptr comm) { this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); this->switchChannels_ = setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); + comm->bootstrap()->barrier(); } AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { From 224b3deb84fb2977318c56acdb7131c4e9f49eeb Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 13 May 2026 01:22:51 +0000 Subject: [PATCH 31/44] Clean up completed communicator receives Erase completed receive bookkeeping from the communicator once the deferred receive future finishes, while preserving ordered receive chaining for repeated rank/tag operations. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/core/communicator.cc | 100 ++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/src/core/communicator.cc b/src/core/communicator.cc index c95ca4213..97fadbbd0 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -3,10 +3,60 @@ #include "communicator.hpp" +#include + #include "api.h" namespace mscclpp { +namespace { + +template +class ScopeGuard { + public: + explicit ScopeGuard(Fn fn) : fn_(std::move(fn)) {} + ScopeGuard(const ScopeGuard&) = delete; + ScopeGuard& operator=(const ScopeGuard&) = delete; + ~ScopeGuard() { fn_(); } + + private: + Fn fn_; +}; + +template +ScopeGuard makeScopeGuard(Fn fn) { + return ScopeGuard(std::move(fn)); +} + +template +std::shared_future makeOrderedRecvFuture(Impl* impl, int remoteRank, int tag, Fn fn) { + auto thisRecvItem = std::make_shared>(); + auto future = std::async(std::launch::deferred, [impl, remoteRank, tag, thisRecvItem, + lastRecvItem = impl->getLastRecvItem(remoteRank, tag), + fn = std::move(fn)]() mutable { + [[maybe_unused]] auto cleanup = makeScopeGuard([impl, remoteRank, tag, thisRecvItem]() { + auto item = thisRecvItem->lock(); + auto it = impl->lastRecvItems_.find({remoteRank, tag}); + if (item && it != impl->lastRecvItems_.end() && it->second == item) { + impl->lastRecvItems_.erase(it); + } + }); + + if (lastRecvItem) { + // Recursive call to the previous receive items + lastRecvItem->wait(); + } + return fn(); + }); + auto sharedFuture = std::shared_future(std::move(future)); + auto recvItem = std::make_shared>(sharedFuture); + *thisRecvItem = recvItem; + impl->setLastRecvItem(remoteRank, tag, recvItem); + return sharedFuture; +} + +} // namespace + Communicator::Impl::Impl(std::shared_ptr bootstrap, std::shared_ptr context) : bootstrap_(bootstrap) { if (!context) { @@ -83,19 +133,11 @@ MSCCLPP_API_CPP std::shared_future Communicator::recvMemory(in 
locRecvMemList.push_back(std::move(locRecvMem)); return future; } - auto future = std::async(std::launch::deferred, - [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() { - if (lastRecvItem) { - // Recursive call to the previous receive items - lastRecvItem->wait(); - } - std::vector data; - bootstrap()->recv(data, remoteRank, tag); - return RegisteredMemory::deserialize(data); - }); - auto shared_future = std::shared_future(std::move(future)); - pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared>(shared_future)); - return shared_future; + return makeOrderedRecvFuture(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag]() { + std::vector data; + bootstrap()->recv(data, remoteRank, tag); + return RegisteredMemory::deserialize(data); + }); } MSCCLPP_API_CPP std::shared_future Communicator::connect(const Endpoint& localEndpoint, int remoteRank, @@ -112,12 +154,8 @@ MSCCLPP_API_CPP std::shared_future Communicator::connect(const Endpo bootstrap()->send(localEndpoint.serialize(), remoteRank, tag); - auto future = std::async(std::launch::deferred, [this, remoteRank, tag, localEndpoint, - lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() mutable { - if (lastRecvItem) { - // Recursive call to the previous receive items - lastRecvItem->wait(); - } + return makeOrderedRecvFuture(pimpl_.get(), remoteRank, tag, + [this, remoteRank, tag, localEndpoint]() mutable { std::vector data; bootstrap()->recv(data, remoteRank, tag); auto remoteEndpoint = Endpoint::deserialize(data); @@ -125,9 +163,6 @@ MSCCLPP_API_CPP std::shared_future Communicator::connect(const Endpo pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag}; return connection; }); - auto shared_future = std::shared_future(std::move(future)); - pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared>(shared_future)); - return shared_future; } MSCCLPP_API_CPP std::shared_future Communicator::connect(const EndpointConfig& localConfig, int remoteRank, @@ 
-141,21 +176,12 @@ MSCCLPP_API_CPP std::shared_future Communicator::buildSemaphore(const SemaphoreStub localStub(connection); bootstrap()->send(localStub.serialize(), remoteRank, tag); - auto future = - std::async(std::launch::deferred, [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag), - localStub = localStub]() mutable { - if (lastRecvItem) { - // Recursive call to the previous receive items - lastRecvItem->wait(); - } - std::vector data; - bootstrap()->recv(data, remoteRank, tag); - auto remoteStub = SemaphoreStub::deserialize(data); - return Semaphore(localStub, remoteStub); - }); - auto shared_future = std::shared_future(std::move(future)); - pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared>(shared_future)); - return shared_future; + return makeOrderedRecvFuture(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag, localStub]() mutable { + std::vector data; + bootstrap()->recv(data, remoteRank, tag); + auto remoteStub = SemaphoreStub::deserialize(data); + return Semaphore(localStub, remoteStub); + }); } MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) { From 7724e49f316d0059933a7bcfe2ff9f53ac9bc043 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 13 May 2026 03:26:53 +0000 Subject: [PATCH 32/44] Fix lint and ROCm error alias Agent-Logs-Url: https://github.com/microsoft/mscclpp/sessions/0f0e525d-a69c-4ff7-8913-983243b5cbf7 Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com> --- include/mscclpp/gpu.hpp | 1 + src/core/communicator.cc | 48 ++++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index b8d096e2b..b289bd4d3 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -31,6 +31,7 @@ using CUmemorytype = hipMemoryType; constexpr auto cudaErrorPeerAccessAlreadyEnabled = 
hipErrorPeerAccessAlreadyEnabled; constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed; constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice; +constexpr auto cudaErrorInvalidValue = hipErrorInvalidValue; constexpr auto cudaSuccess = hipSuccess; constexpr auto cudaErrorNotSupported = hipErrorNotSupported; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; diff --git a/src/core/communicator.cc b/src/core/communicator.cc index 97fadbbd0..81cd7bbe4 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -31,23 +31,23 @@ ScopeGuard makeScopeGuard(Fn fn) { template std::shared_future makeOrderedRecvFuture(Impl* impl, int remoteRank, int tag, Fn fn) { auto thisRecvItem = std::make_shared>(); - auto future = std::async(std::launch::deferred, [impl, remoteRank, tag, thisRecvItem, - lastRecvItem = impl->getLastRecvItem(remoteRank, tag), - fn = std::move(fn)]() mutable { - [[maybe_unused]] auto cleanup = makeScopeGuard([impl, remoteRank, tag, thisRecvItem]() { - auto item = thisRecvItem->lock(); - auto it = impl->lastRecvItems_.find({remoteRank, tag}); - if (item && it != impl->lastRecvItems_.end() && it->second == item) { - impl->lastRecvItems_.erase(it); - } - }); - - if (lastRecvItem) { - // Recursive call to the previous receive items - lastRecvItem->wait(); - } - return fn(); - }); + auto future = std::async(std::launch::deferred, + [impl, remoteRank, tag, thisRecvItem, lastRecvItem = impl->getLastRecvItem(remoteRank, tag), + fn = std::move(fn)]() mutable { + [[maybe_unused]] auto cleanup = makeScopeGuard([impl, remoteRank, tag, thisRecvItem]() { + auto item = thisRecvItem->lock(); + auto it = impl->lastRecvItems_.find({remoteRank, tag}); + if (item && it != impl->lastRecvItems_.end() && it->second == item) { + impl->lastRecvItems_.erase(it); + } + }); + + if (lastRecvItem) { + // Recursive call to the previous receive items + lastRecvItem->wait(); + } + return fn(); + }); auto sharedFuture = 
std::shared_future(std::move(future)); auto recvItem = std::make_shared>(sharedFuture); *thisRecvItem = recvItem; @@ -156,13 +156,13 @@ MSCCLPP_API_CPP std::shared_future Communicator::connect(const Endpo return makeOrderedRecvFuture(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag, localEndpoint]() mutable { - std::vector data; - bootstrap()->recv(data, remoteRank, tag); - auto remoteEndpoint = Endpoint::deserialize(data); - auto connection = context()->connect(localEndpoint, remoteEndpoint); - pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag}; - return connection; - }); + std::vector data; + bootstrap()->recv(data, remoteRank, tag); + auto remoteEndpoint = Endpoint::deserialize(data); + auto connection = context()->connect(localEndpoint, remoteEndpoint); + pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag}; + return connection; + }); } MSCCLPP_API_CPP std::shared_future Communicator::connect(const EndpointConfig& localConfig, int remoteRank, From dbebde2b5801ba54220c0f25f253417be81686c3 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 15 May 2026 22:26:53 +0000 Subject: [PATCH 33/44] Configure IPC domain per communicator Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 10 ++++------ include/mscclpp/core.hpp | 12 ++++++++++++ include/mscclpp/env.hpp | 6 ------ python/csrc/core_py.cpp | 2 ++ python/mscclpp/_core/comm.py | 3 +++ src/core/communicator.cc | 9 +++++++++ src/core/env.cpp | 4 +--- src/core/include/communicator.hpp | 1 + src/ext/collectives/collective_utils.cc | 16 +++++++--------- src/ext/collectives/include/collective_utils.hpp | 5 ++--- 10 files changed, 41 insertions(+), 27 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 44a5c9c10..1243ca91a 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ 
b/examples/torch-integration/customized_comm_with_tuning.py @@ -8,10 +8,6 @@ from mpi4py import MPI -_world_size = MPI.COMM_WORLD.Get_size() -if _world_size > 1 and "MSCCLPP_IPC_DOMAIN_NRANKS" not in os.environ: - os.environ["MSCCLPP_IPC_DOMAIN_NRANKS"] = str(_world_size) - import torch import mscclpp import mscclpp.ext @@ -101,8 +97,10 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self.rank = comm.my_rank self.world_size = comm.nranks self.nranks_per_node = comm.nranks_per_node - nvlink_domain_nranks = int(os.environ.get("MSCCLPP_IPC_DOMAIN_NRANKS", "0")) - self.multi_host_mnnvl = nvlink_domain_nranks >= self.world_size and self.world_size > self.nranks_per_node + if comm.communicator.get_ipc_domain_n_ranks() == 0 and self.world_size > 1: + comm.communicator.set_ipc_domain_n_ranks(self.world_size) + self.ipc_domain_n_ranks = comm.communicator.get_ipc_domain_n_ranks() + self.multi_host_mnnvl = self.ipc_domain_n_ranks >= self.world_size and self.world_size > self.nranks_per_node self.symmetric_memory = symmetric_memory self._nvls = mscclpp.is_nvls_supported() diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 45b56bcc0..481f1d3c5 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -821,6 +821,18 @@ class Communicator { /// @return The context held by this communicator. std::shared_ptr context(); + /// Set the IPC-domain rank count for collective algorithms using this communicator. + /// + /// The value describes how many ranks are in one GPU-IPC-reachable peer group, such as a Multi-Node NVLink + /// fabric. Set to 0 to use the default `bootstrap()->getNranksPerNode()` value. + /// + /// @param ipcDomainNranks Number of ranks in the communicator's IPC domain, or 0 to use the default. + void setIpcDomainNranks(int ipcDomainNranks); + + /// Get the IPC-domain rank count override for this communicator. 
+ /// @return The configured IPC-domain rank count, or 0 if the communicator uses `bootstrap()->getNranksPerNode()`. + int getIpcDomainNranks() const; + /// Register a region of GPU memory for use in this communicator's context. /// /// @param ptr Base pointer to the memory. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 0dd63ed74..a6dd306b6 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -119,12 +119,6 @@ class Env { /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified). const int ibGidIndex; - /// Env name: `MSCCLPP_IPC_DOMAIN_NRANKS`. Number of ranks that share a single GPU-IPC-reachable peer - /// group (e.g. a Multi-Node NVLink fabric such as GB200 NVL72, or an AMD XGMI domain). This hint is - /// consumed only by the collective algorithms; it does not affect `Bootstrap::getNranksPerNode()` or - /// any other layer. If unset or non-positive, algorithms fall back to `bootstrap->getNranksPerNode()`. - const int ipcDomainNranks; - private: Env(); diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index a94f9863a..d748c6a00 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -282,6 +282,8 @@ void register_core(nb::module_& m) { nb::arg("context") = nullptr) .def("bootstrap", &Communicator::bootstrap) .def("context", &Communicator::context) + .def("set_ipc_domain_n_ranks", &Communicator::setIpcDomainNranks, nb::arg("n_ranks")) + .def("get_ipc_domain_n_ranks", &Communicator::getIpcDomainNranks) .def( "register_memory", [](Communicator* self, uintptr_t ptr, size_t size, TransportFlags transports) { diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py index d42349ddb..f1940eae7 100644 --- a/python/mscclpp/_core/comm.py +++ b/python/mscclpp/_core/comm.py @@ -35,6 +35,7 @@ def __init__( interfaceIpPortTrio: str = "", rank: int = None, size: int = None, + ipc_domain_n_ranks: int = 0, ): if interfaceIpPortTrio == "" and (mpi_comm is not None or 
torch_group is not None): uniq_id = None @@ -70,9 +71,11 @@ def __init__( else: raise RuntimeError("Either the interface or mpi_group need to be specified") self.communicator = CppCommunicator(self.bootstrap) + self.communicator.set_ipc_domain_n_ranks(ipc_domain_n_ranks) self.my_rank = self.bootstrap.get_rank() self.nranks = self.bootstrap.get_n_ranks() self.nranks_per_node = self.bootstrap.get_n_ranks_per_node() + self.ipc_domain_n_ranks = self.communicator.get_ipc_domain_n_ranks() def barrier(self): self.bootstrap.barrier() diff --git a/src/core/communicator.cc b/src/core/communicator.cc index 1ca029d67..2272175e7 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -81,6 +81,15 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::bootstrap() { return pi MSCCLPP_API_CPP std::shared_ptr Communicator::context() { return pimpl_->context_; } +MSCCLPP_API_CPP void Communicator::setIpcDomainNranks(int ipcDomainNranks) { + if (ipcDomainNranks < 0) { + throw Error("ipcDomainNranks must be non-negative", ErrorCode::InvalidUsage); + } + pimpl_->ipcDomainNranks_ = ipcDomainNranks; +} + +MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const { return pimpl_->ipcDomainNranks_; } + MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { return context()->registerMemory(ptr, size, transports); } diff --git a/src/core/env.cpp b/src/core/env.cpp index 18d548b02..7a42471bf 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -67,8 +67,7 @@ Env::Env() ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)), forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)), forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)), - ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)), - ipcDomainNranks(readEnv("MSCCLPP_IPC_DOMAIN_NRANKS", 0)) {} + ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)) {} std::shared_ptr env() { static std::shared_ptr globalEnv = std::shared_ptr(new Env()); @@ -98,7 
+97,6 @@ std::shared_ptr env() { logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr); logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex); - logEnv("MSCCLPP_IPC_DOMAIN_NRANKS", globalEnv->ipcDomainNranks); } return globalEnv; } diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp index f15e20f74..b9f519b9b 100644 --- a/src/core/include/communicator.hpp +++ b/src/core/include/communicator.hpp @@ -60,6 +60,7 @@ struct Communicator::Impl { std::shared_ptr bootstrap_; std::shared_ptr context_; std::unordered_map connectionInfos_; + int ipcDomainNranks_ = 0; // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair. // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready. diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index 6acfd7ce0..192fac8d3 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -73,23 +72,22 @@ std::vector> setupMemoryS return memorySemaphores; } -int getIpcDomainNranks(std::shared_ptr comm) { - const int envValue = mscclpp::env()->ipcDomainNranks; - const int ipcDomainNranks = (envValue > 0) ? envValue : comm->bootstrap()->getNranksPerNode(); +int getIpcDomainNranks(std::shared_ptr comm) { + const int commValue = comm->getIpcDomainNranks(); + const int ipcDomainNranks = (commValue > 0) ? 
commValue : comm->bootstrap()->getNranksPerNode(); const int worldSize = comm->bootstrap()->getNranks(); const int rank = comm->bootstrap()->getRank(); if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) { - THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "ipcDomainNranks ", - ipcDomainNranks, " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]"); + THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "ipcDomainNranks ", ipcDomainNranks, + " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]"); } if (worldSize != ipcDomainNranks) { - THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, + THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "requires worldSize == ipcDomainNranks (got worldSize=", worldSize, ", ipcDomainNranks=", ipcDomainNranks, ")"); } if (rank < 0 || rank >= ipcDomainNranks) { - THROW(mscclpp::LogSubsys::ALGO, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ", - ipcDomainNranks, ")"); + THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ", ipcDomainNranks, ")"); } return ipcDomainNranks; } diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 280a63328..217c7f550 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -52,9 +52,8 @@ std::vector> setupMemorySemaphores std::shared_ptr comm, const std::vector& connections, int nChannelsPerConnection); /// Returns the IPC-reachable peer-group size, validated to span the whole communicator and -/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. Reads `MSCCLPP_IPC_DOMAIN_NRANKS` if set to a -/// positive value; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws -/// `Error(InvalidUsage)` on violation. +/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. 
Reads the communicator's IPC-domain override +/// if set; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws `Error(InvalidUsage)` on violation. int getIpcDomainNranks(std::shared_ptr comm); std::shared_ptr> setupMemoryChannelDeviceHandles( From 93b43547cc003784771bbbffdd554abeab75aaad Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 15 May 2026 23:15:40 +0000 Subject: [PATCH 34/44] temp solution --- .../customized_comm_with_tuning.py | 9 ++++----- include/mscclpp/core.hpp | 4 ++-- src/core/communicator.cc | 6 +++--- .../allgather/allgather_fullmesh.cu | 2 +- .../allgather/allgather_fullmesh_2.cu | 2 +- .../allreduce/allreduce_allpair_packet.cu | 2 +- .../allreduce/allreduce_fullmesh.cu | 2 +- .../allreduce_nvls_block_pipeline.cu | 4 ++-- .../allreduce/allreduce_nvls_packet.cu | 2 +- .../allreduce/allreduce_nvls_warp_pipeline.cu | 4 ++-- .../allreduce/allreduce_nvls_zero_copy.cu | 3 +-- .../collectives/allreduce/allreduce_packet.cu | 2 +- .../collectives/allreduce/allreduce_rsag.cu | 2 +- .../allreduce/allreduce_rsag_pipeline.cu | 2 +- .../allreduce/allreduce_rsag_zero_copy.cu | 2 +- src/ext/collectives/collective_utils.cc | 20 ------------------- .../collectives/include/collective_utils.hpp | 5 ----- 17 files changed, 23 insertions(+), 50 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 1243ca91a..d0da8c689 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -97,8 +97,6 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self.rank = comm.my_rank self.world_size = comm.nranks self.nranks_per_node = comm.nranks_per_node - if comm.communicator.get_ipc_domain_n_ranks() == 0 and self.world_size > 1: - comm.communicator.set_ipc_domain_n_ranks(self.world_size) self.ipc_domain_n_ranks = comm.communicator.get_ipc_domain_n_ranks() 
self.multi_host_mnnvl = self.ipc_domain_n_ranks >= self.world_size and self.world_size > self.nranks_per_node self.symmetric_memory = symmetric_memory @@ -433,8 +431,8 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, # -- Bootstrap & main --------------------------------------------------------- -def init_dist() -> mscclpp.CommGroup: - return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD) +def init_dist(ipc_domain_n_ranks: int = 0) -> mscclpp.CommGroup: + return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD, ipc_domain_n_ranks=ipc_domain_n_ranks) def main(): @@ -447,8 +445,9 @@ def main(): accum_str = os.environ.get("ACCUM_DTYPE") accum_dtype = accum_map.get(accum_str) if accum_str else None symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "1") == "1" + ipc_domain_n_ranks = int(os.environ.get("IPC_DOMAIN_NRANKS", "0")) - comm_group = init_dist() + comm_group = init_dist(ipc_domain_n_ranks=ipc_domain_n_ranks) cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory) print( diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 481f1d3c5..832323ad6 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -829,8 +829,8 @@ class Communicator { /// @param ipcDomainNranks Number of ranks in the communicator's IPC domain, or 0 to use the default. void setIpcDomainNranks(int ipcDomainNranks); - /// Get the IPC-domain rank count override for this communicator. - /// @return The configured IPC-domain rank count, or 0 if the communicator uses `bootstrap()->getNranksPerNode()`. + /// Get the effective IPC-domain rank count for this communicator. + /// @return The configured IPC-domain rank count, or `bootstrap()->getNranksPerNode()` if no override is set. int getIpcDomainNranks() const; /// Register a region of GPU memory for use in this communicator's context. 
diff --git a/src/core/communicator.cc b/src/core/communicator.cc index 2272175e7..9bbbff3be 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -3,8 +3,6 @@ #include "communicator.hpp" -#include - #include "api.h" namespace mscclpp { @@ -88,7 +86,9 @@ MSCCLPP_API_CPP void Communicator::setIpcDomainNranks(int ipcDomainNranks) { pimpl_->ipcDomainNranks_ = ipcDomainNranks; } -MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const { return pimpl_->ipcDomainNranks_; } +MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const { + return (pimpl_->ipcDomainNranks_ > 0) ? pimpl_->ipcDomainNranks_ : pimpl_->bootstrap_->getNranksPerNode(); +} MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { return context()->registerMemory(ptr, size, transports); diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index a4196c6cd..8b5cf3b70 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -148,7 +148,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); // setup semaphores ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection); diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 6e69f81ca..de9d93840 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -159,7 +159,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - 
ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); // setup semaphores ctx->memorySemaphores = this->memorySemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 5be2f3360..6c4f972f2 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -140,7 +140,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index ef7ecf74d..a54270703 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -250,7 +250,7 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); // setup semaphores ctx->memorySemaphores = this->outputSemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 9d3316e4c..07418f744 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -177,7 +177,7 @@ struct 
NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - ipcDomainNranks_ = getIpcDomainNranks(comm); + ipcDomainNranks_ = comm->getIpcDomainNranks(); // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel). nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_); this->conns_ = setupConnections(comm); @@ -224,7 +224,7 @@ std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index 56455b6ea..cb9ad17eb 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -95,7 +95,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); // setup channels ctx->switchChannels = this->switchChannels_; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 73ecdab9d..a06692947 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -141,7 +141,7 @@ struct NvlsWarpPipelineAdapter { void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = NUM_NVLS_CONNECTION; - ipcDomainNranks_ = getIpcDomainNranks(comm); + ipcDomainNranks_ = comm->getIpcDomainNranks(); // Per-peer 
channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks. nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_); this->conns_ = setupConnections(comm); @@ -188,7 +188,7 @@ std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::share auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 63cbd057d..36095e73c 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -99,7 +99,6 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); computeCapabilityMajor_ = deviceProp.major; nSwitchChannels_ = 32; - getIpcDomainNranks(comm); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -177,7 +176,7 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index 7bc9a85f1..f88389dca 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -263,7 +263,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptrrank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + 
ctx->ipcDomainNranks = comm->getIpcDomainNranks(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index 4dcceb48e..43ff56106 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -203,7 +203,7 @@ std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu index 9f63e5905..1e59c7e45 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -306,7 +306,7 @@ std::shared_ptr AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerNode(); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index c678c2670..f8d612793 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -200,7 +200,7 @@ std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt 
auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = getIpcDomainNranks(comm); + ctx->ipcDomainNranks = comm->getIpcDomainNranks(); ctx->memorySemaphores = this->semaphores_; diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index 192fac8d3..c3856a88e 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -72,26 +72,6 @@ std::vector> setupMemoryS return memorySemaphores; } -int getIpcDomainNranks(std::shared_ptr comm) { - const int commValue = comm->getIpcDomainNranks(); - const int ipcDomainNranks = (commValue > 0) ? commValue : comm->bootstrap()->getNranksPerNode(); - const int worldSize = comm->bootstrap()->getNranks(); - const int rank = comm->bootstrap()->getRank(); - if (ipcDomainNranks < 2 || ipcDomainNranks > MAX_IPC_DOMAIN_NRANKS) { - THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "ipcDomainNranks ", ipcDomainNranks, - " is out of supported range [2, ", MAX_IPC_DOMAIN_NRANKS, "]"); - } - if (worldSize != ipcDomainNranks) { - THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, - "requires worldSize == ipcDomainNranks (got worldSize=", worldSize, ", ipcDomainNranks=", ipcDomainNranks, - ")"); - } - if (rank < 0 || rank >= ipcDomainNranks) { - THROW(LogSubsys::ALGO, Error, ErrorCode::InvalidUsage, "rank ", rank, " out of [0, ", ipcDomainNranks, ")"); - } - return ipcDomainNranks; -} - std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels) { std::vector> memoryChannelDeviceHandles; diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 217c7f550..c1cad4121 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -51,11 +51,6 @@ std::vector setupConnections(std::shared_ptr comm); std::vector> setupMemorySemaphores( 
std::shared_ptr comm, const std::vector& connections, int nChannelsPerConnection); -/// Returns the IPC-reachable peer-group size, validated to span the whole communicator and -/// to be within `[2, MAX_IPC_DOMAIN_NRANKS]`. Reads the communicator's IPC-domain override -/// if set; otherwise falls back to `bootstrap->getNranksPerNode()`. Throws `Error(InvalidUsage)` on violation. -int getIpcDomainNranks(std::shared_ptr comm); - std::shared_ptr> setupMemoryChannelDeviceHandles( const std::vector& memoryChannels); From 0744e806fc1f0d9cd8c33231ef1517fe42348395 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 16 May 2026 00:39:49 +0000 Subject: [PATCH 35/44] detect ipc domain automatically --- CMakeLists.txt | 1 + .../customized_comm_with_tuning.py | 9 +-- include/mscclpp/core.hpp | 19 ++--- python/csrc/core_py.cpp | 3 +- python/mscclpp/_core/comm.py | 4 +- src/core/bootstrap/bootstrap.cc | 23 ++++++ src/core/communicator.cc | 11 --- src/core/include/communicator.hpp | 2 - src/core/include/utils_internal.hpp | 1 + src/core/utils_internal.cc | 77 +++++++++++++++++++ .../allgather/allgather_fullmesh.cu | 2 +- .../allgather/allgather_fullmesh_2.cu | 2 +- .../allreduce/allreduce_allpair_packet.cu | 2 +- .../allreduce/allreduce_fullmesh.cu | 2 +- .../allreduce_nvls_block_pipeline.cu | 4 +- .../allreduce/allreduce_nvls_packet.cu | 2 +- .../allreduce/allreduce_nvls_warp_pipeline.cu | 4 +- .../allreduce/allreduce_nvls_zero_copy.cu | 2 +- .../collectives/allreduce/allreduce_packet.cu | 2 +- .../collectives/allreduce/allreduce_rsag.cu | 2 +- .../allreduce/allreduce_rsag_pipeline.cu | 2 +- .../allreduce/allreduce_rsag_zero_copy.cu | 2 +- test/mp_unit/bootstrap_tests.cc | 1 + 23 files changed, 130 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 49154e0b0..3f9bf8e07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,6 +206,7 @@ if(MSCCLPP_USE_CUDA) else() set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver) endif() + list(APPEND GPU_LIBRARIES
CUDA::nvml) else() set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index d0da8c689..6cef88feb 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -97,7 +97,7 @@ def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): self.rank = comm.my_rank self.world_size = comm.nranks self.nranks_per_node = comm.nranks_per_node - self.ipc_domain_n_ranks = comm.communicator.get_ipc_domain_n_ranks() + self.ipc_domain_n_ranks = comm.ipc_domain_n_ranks self.multi_host_mnnvl = self.ipc_domain_n_ranks >= self.world_size and self.world_size > self.nranks_per_node self.symmetric_memory = symmetric_memory self._nvls = mscclpp.is_nvls_supported() @@ -431,8 +431,8 @@ def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, # -- Bootstrap & main --------------------------------------------------------- -def init_dist(ipc_domain_n_ranks: int = 0) -> mscclpp.CommGroup: - return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD, ipc_domain_n_ranks=ipc_domain_n_ranks) +def init_dist() -> mscclpp.CommGroup: + return mscclpp.CommGroup(mpi_comm=MPI.COMM_WORLD) def main(): @@ -445,9 +445,8 @@ def main(): accum_str = os.environ.get("ACCUM_DTYPE") accum_dtype = accum_map.get(accum_str) if accum_str else None symmetric_memory = os.environ.get("SYMMETRIC_MEMORY", "1") == "1" - ipc_domain_n_ranks = int(os.environ.get("IPC_DOMAIN_NRANKS", "0")) - comm_group = init_dist(ipc_domain_n_ranks=ipc_domain_n_ranks) + comm_group = init_dist() cc = CustomizedComm(comm_group, symmetric_memory=symmetric_memory) print( diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 832323ad6..4c14f1eec 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -46,6 +46,10 @@ class Bootstrap { /// @return The total 
number of ranks per node. virtual int getNranksPerNode() const = 0; + /// Return the number of ranks in this rank's GPU IPC domain. + /// @return The number of ranks in the GPU IPC domain. + virtual int getNranksPerIpcDomain() const; + /// Send arbitrary data to another process. /// /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size, @@ -144,6 +148,9 @@ class TcpBootstrap : public Bootstrap { /// Return the total number of ranks per node. int getNranksPerNode() const override; + /// Return the number of ranks in this rank's GPU IPC domain. + int getNranksPerIpcDomain() const override; + /// Send arbitrary data to another process. /// /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size, @@ -821,18 +828,6 @@ class Communicator { /// @return The context held by this communicator. std::shared_ptr context(); - /// Set the IPC-domain rank count for collective algorithms using this communicator. - /// - /// The value describes how many ranks are in one GPU-IPC-reachable peer group, such as a Multi-Node NVLink - /// fabric. Set to 0 to use the default `bootstrap()->getNranksPerNode()` value. - /// - /// @param ipcDomainNranks Number of ranks in the communicator's IPC domain, or 0 to use the default. - void setIpcDomainNranks(int ipcDomainNranks); - - /// Get the effective IPC-domain rank count for this communicator. - /// @return The configured IPC-domain rank count, or `bootstrap()->getNranksPerNode()` if no override is set. - int getIpcDomainNranks() const; - /// Register a region of GPU memory for use in this communicator's context. /// /// @param ptr Base pointer to the memory. 
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index d748c6a00..7e9af6c1f 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -56,6 +56,7 @@ void register_core(nb::module_& m) { .def("get_rank", &Bootstrap::getRank) .def("get_n_ranks", &Bootstrap::getNranks) .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode) + .def("get_n_ranks_per_ipc_domain", &Bootstrap::getNranksPerIpcDomain) .def( "send", [](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) { @@ -282,8 +283,6 @@ void register_core(nb::module_& m) { nb::arg("context") = nullptr) .def("bootstrap", &Communicator::bootstrap) .def("context", &Communicator::context) - .def("set_ipc_domain_n_ranks", &Communicator::setIpcDomainNranks, nb::arg("n_ranks")) - .def("get_ipc_domain_n_ranks", &Communicator::getIpcDomainNranks) .def( "register_memory", [](Communicator* self, uintptr_t ptr, size_t size, TransportFlags transports) { diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py index f1940eae7..875e07f18 100644 --- a/python/mscclpp/_core/comm.py +++ b/python/mscclpp/_core/comm.py @@ -35,7 +35,6 @@ def __init__( interfaceIpPortTrio: str = "", rank: int = None, size: int = None, - ipc_domain_n_ranks: int = 0, ): if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None): uniq_id = None @@ -71,11 +70,10 @@ def __init__( else: raise RuntimeError("Either the interface or mpi_group need to be specified") self.communicator = CppCommunicator(self.bootstrap) - self.communicator.set_ipc_domain_n_ranks(ipc_domain_n_ranks) self.my_rank = self.bootstrap.get_rank() self.nranks = self.bootstrap.get_n_ranks() self.nranks_per_node = self.bootstrap.get_n_ranks_per_node() - self.ipc_domain_n_ranks = self.communicator.get_ipc_domain_n_ranks() + self.ipc_domain_n_ranks = self.bootstrap.get_n_ranks_per_ipc_domain() def barrier(self): self.bootstrap.barrier() diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc index 
b3032e502..a58357519 100644 --- a/src/core/bootstrap/bootstrap.cc +++ b/src/core/bootstrap/bootstrap.cc @@ -50,6 +50,8 @@ MSCCLPP_API_CPP void Bootstrap::groupBarrier(const std::vector& ranks) { } } +MSCCLPP_API_CPP int Bootstrap::getNranksPerIpcDomain() const { return getNranksPerNode(); } + MSCCLPP_API_CPP void Bootstrap::send(const std::vector& data, int peer, int tag) { size_t size = data.size(); send((void*)&size, sizeof(size_t), peer, tag); @@ -83,6 +85,7 @@ class TcpBootstrap::Impl { int getRank(); int getNranks(); int getNranksPerNode(); + int getNranksPerIpcDomain(); void allGather(void* allData, int size); void broadcast(void* data, int size, int root); void send(void* data, int size, int peer, int tag); @@ -95,6 +98,7 @@ class TcpBootstrap::Impl { int rank_; int nRanks_; int nRanksPerNode_; + int nRanksPerIpcDomain_; bool netInitialized; std::unique_ptr listenSockRoot_; std::unique_ptr listenSock_; @@ -148,6 +152,7 @@ TcpBootstrap::Impl::Impl(int rank, int nRanks) : rank_(rank), nRanks_(nRanks), nRanksPerNode_(0), + nRanksPerIpcDomain_(0), netInitialized(false), peerCommAddresses_(nRanks, SocketAddress()), barrierArr_(nRanks, 0), @@ -451,6 +456,22 @@ int TcpBootstrap::Impl::getNranksPerNode() { return nRanksPerNode_; } +int TcpBootstrap::Impl::getNranksPerIpcDomain() { + if (nRanksPerIpcDomain_ > 0) return nRanksPerIpcDomain_; + std::vector ipcDomainHashes(nRanks_); + ipcDomainHashes[rank_] = getIpcDomainHash(); + allGather(ipcDomainHashes.data(), sizeof(uint64_t)); + + int nRanksPerIpcDomain = 0; + for (int i = 0; i < nRanks_; ++i) { + if (ipcDomainHashes[i] == ipcDomainHashes[rank_]) { + ++nRanksPerIpcDomain; + } + } + nRanksPerIpcDomain_ = nRanksPerIpcDomain; + return nRanksPerIpcDomain_; +} + void TcpBootstrap::Impl::allGather(void* allData, int size) { char* data = static_cast(allData); int rank = rank_; @@ -592,6 +613,8 @@ MSCCLPP_API_CPP int TcpBootstrap::getNranks() const { return pimpl_->getNranks() MSCCLPP_API_CPP int 
TcpBootstrap::getNranksPerNode() const { return pimpl_->getNranksPerNode(); } +MSCCLPP_API_CPP int TcpBootstrap::getNranksPerIpcDomain() const { return pimpl_->getNranksPerIpcDomain(); } + MSCCLPP_API_CPP void TcpBootstrap::send(void* data, int size, int peer, int tag) { pimpl_->send(data, size, peer, tag); } diff --git a/src/core/communicator.cc b/src/core/communicator.cc index 9bbbff3be..41e46bc50 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -79,17 +79,6 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::bootstrap() { return pi MSCCLPP_API_CPP std::shared_ptr Communicator::context() { return pimpl_->context_; } -MSCCLPP_API_CPP void Communicator::setIpcDomainNranks(int ipcDomainNranks) { - if (ipcDomainNranks < 0) { - throw Error("ipcDomainNranks must be non-negative", ErrorCode::InvalidUsage); - } - pimpl_->ipcDomainNranks_ = ipcDomainNranks; -} - -MSCCLPP_API_CPP int Communicator::getIpcDomainNranks() const { - return (pimpl_->ipcDomainNranks_ > 0) ? pimpl_->ipcDomainNranks_ : pimpl_->bootstrap_->getNranksPerNode(); -} - MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { return context()->registerMemory(ptr, size, transports); } diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp index b9f519b9b..333cc9823 100644 --- a/src/core/include/communicator.hpp +++ b/src/core/include/communicator.hpp @@ -60,8 +60,6 @@ struct Communicator::Impl { std::shared_ptr bootstrap_; std::shared_ptr context_; std::unordered_map connectionInfos_; - int ipcDomainNranks_ = 0; - // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair. // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready. 
std::unordered_map, std::shared_ptr, PairHash> lastRecvItems_; diff --git a/src/core/include/utils_internal.hpp b/src/core/include/utils_internal.hpp index c5c67e26c..c6934194d 100644 --- a/src/core/include/utils_internal.hpp +++ b/src/core/include/utils_internal.hpp @@ -37,6 +37,7 @@ int64_t busIdToInt64(const std::string busId); uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); +uint64_t getIpcDomainHash(); void getRandomData(void* buffer, size_t bytes); struct netIf { diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc index 8cc554301..2e620b660 100644 --- a/src/core/utils_internal.cc +++ b/src/core/utils_internal.cc @@ -6,6 +6,10 @@ #include #include +#if defined(MSCCLPP_USE_CUDA) +#include +#endif + #include #include #include @@ -175,6 +179,79 @@ uint64_t getPidHash(void) { return *pidHash; } +#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN) +namespace { + +class NvmlState { + public: + NvmlState() : initialized_(nvmlInit_v2() == NVML_SUCCESS) {} + + ~NvmlState() { + if (initialized_) { + (void)nvmlShutdown(); + } + } + + bool isInitialized() const { return initialized_; } + + private: + bool initialized_ = false; +}; + +uint64_t getFabricHash(const nvmlGpuFabricInfo_t& fabricInfo) { + char hashData[NVML_GPU_FABRIC_UUID_LEN + sizeof(fabricInfo.cliqueId)]; + std::memcpy(hashData, fabricInfo.clusterUuid, NVML_GPU_FABRIC_UUID_LEN); + std::memcpy(hashData + NVML_GPU_FABRIC_UUID_LEN, &fabricInfo.cliqueId, sizeof(fabricInfo.cliqueId)); + return getHash(hashData, sizeof(hashData)); +} + +bool tryGetNvmlIpcDomainHash(uint64_t& ipcDomainHash) { + // Use the current CUDA device; callers must set the rank's device before querying. 
+ int deviceId; + if (cudaGetDevice(&deviceId) != cudaSuccess) { + return false; + } + + char pciBusId[] = "00000000:00:00.0"; + if (cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) { + return false; + } + + static NvmlState nvml; + if (!nvml.isInitialized()) { + return false; + } + + nvmlDevice_t nvmlDevice; + if (nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS) { + return false; + } + + nvmlGpuFabricInfo_t fabricInfo = {}; + if (nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS) { + return false; + } + if (fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) { + return false; + } + + ipcDomainHash = getFabricHash(fabricInfo); + return true; +} + +} // namespace +#endif + +uint64_t getIpcDomainHash(void) { +#if defined(MSCCLPP_USE_CUDA) && defined(NVML_GPU_FABRIC_UUID_LEN) + uint64_t ipcDomainHash; + if (tryGetNvmlIpcDomainHash(ipcDomainHash)) { + return ipcDomainHash; + } +#endif + return getHostHash(); +} + int parseStringList(const char* string, netIf* ifList, int maxList) { if (!string) return 0; diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 8b5cf3b70..84dd4d473 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -148,7 +148,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection); diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index de9d93840..5a353922f 100644 --- 
a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -159,7 +159,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = this->memorySemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 6c4f972f2..29ef2055b 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -140,7 +140,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index a54270703..b158f817c 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -250,7 +250,7 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = this->outputSemaphores_; diff --git 
a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 07418f744..890e50f58 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -177,7 +177,7 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - ipcDomainNranks_ = comm->getIpcDomainNranks(); + ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain(); // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel). nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_); this->conns_ = setupConnections(comm); @@ -224,7 +224,7 @@ std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index cb9ad17eb..e8ecfb737 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -95,7 +95,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = this->switchChannels_; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index a06692947..68efc2ab0 100644 --- 
a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -141,7 +141,7 @@ struct NvlsWarpPipelineAdapter { void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = NUM_NVLS_CONNECTION; - ipcDomainNranks_ = comm->getIpcDomainNranks(); + ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain(); // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks. nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_); this->conns_ = setupConnections(comm); @@ -188,7 +188,7 @@ std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::share auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 36095e73c..a6f699b2e 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -176,7 +176,7 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index f88389dca..a0bc0e26e 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -263,7 +263,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptrrank = 
comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index 43ff56106..22e3a4ee4 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -203,7 +203,7 @@ std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu index 1e59c7e45..bedf15c50 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -306,7 +306,7 @@ std::shared_ptr AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index f8d612793..10d3a35c2 100644 --- 
a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -200,7 +200,7 @@ std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->getIpcDomainNranks(); + ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->semaphores_; diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc index c28087a45..eb6985a8e 100644 --- a/test/mp_unit/bootstrap_tests.cc +++ b/test/mp_unit/bootstrap_tests.cc @@ -127,6 +127,7 @@ class MPIBootstrap : public mscclpp::Bootstrap { MPI_Comm_size(shmcomm, &shmrank); return shmrank; } + int getNranksPerIpcDomain() const override { return getNranksPerNode(); } void allGather(void* sendbuf, int size) override { MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD); } From 94af88d88d0a648411baf319f7449db1efaa592c Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 16 May 2026 01:24:56 +0000 Subject: [PATCH 36/44] Fix tuning example hang Avoid probing invalid packet allreduce configurations and reduce the default tuning sweep so the 8-rank tuning example completes reliably. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../customized_comm_with_tuning.py | 33 ++++++++++++------- .../collectives/allreduce/allreduce_packet.cu | 7 +++- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 6cef88feb..0a07ca325 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -54,12 +54,16 @@ def _round_pow2(size: int) -> int: # -- CustomizedComm ----------------------------------------------------------- +def _env_int(name: str, default: int) -> int: + return int(os.environ.get(name, default)) + + class CustomizedComm: """Exposes all_reduce, all_gather, barrier with lazy per-size tuning.""" - _TUNE_N_WARMUP = 5 - _TUNE_N_GRAPH_LAUNCHES = 10 - _TUNE_N_OPS_PER_GRAPH = 100 + _TUNE_N_WARMUP = _env_int("TUNE_N_WARMUP", 2) + _TUNE_N_GRAPH_LAUNCHES = _env_int("TUNE_N_GRAPH_LAUNCHES", 3) + _TUNE_N_OPS_PER_GRAPH = _env_int("TUNE_N_OPS_PER_GRAPH", 20) _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 112, 128] _CANDIDATE_NTHREADS = [512, 768, 1024] _NBLOCKS_LIMIT = { @@ -78,16 +82,16 @@ class CustomizedComm: _AR_CANDIDATES_MNNVL = [ ("default_allreduce_allpair_packet", 0, 128 << 10, None), ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls), - ("default_allreduce_packet", 128 << 10, 4 << 20, None), + ("default_allreduce_packet", 128 << 10, 512 << 10, None), ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory), ("default_allreduce_rsag_zero_copy", 512 << 10, None, None), ("default_allreduce_rsag", 512 << 10, None, None), ] _AR_CANDIDATES_SINGLE = [ - ("default_allreduce_packet", 0, 4 << 20, None), - ("default_allreduce_allpair_packet", 0, 4 << 20, None), - ("default_allreduce_nvls_packet", 0, 4 << 20, lambda c: c._nvls), - ("default_allreduce_rsag_zero_copy", 512 << 
10, None, None), + ("default_allreduce_packet", 0, 512 << 10, None), + ("default_allreduce_allpair_packet", 0, 128 << 10, None), + ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls), + ("default_allreduce_rsag_zero_copy", 512 << 10, None, lambda c: not (c._nvls and c.symmetric_memory)), ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory), ("default_allreduce_fullmesh", 0, None, lambda c: torch.version.hip is not None), ] @@ -224,6 +228,11 @@ def _run_tune(self, collective, algo, buf, size, nb, nt): symmetric_memory=False, ) + def _is_tune_config_supported(self, algo, nb, nt): + if algo.name in ("default_allreduce_packet", "default_allreduce_allpair_packet"): + return nb >= self.world_size - 1 and nt in (512, 1024) + return True + def _tune_size(self, collective: str, target_size: int): """Auto-tune one (collective, target_size) pair and cache result.""" buf = self._ensure_tune_bufs() @@ -239,13 +248,15 @@ def _tune_size(self, collective: str, target_size: int): if nb > nb_limit: continue for nt in self._CANDIDATE_NTHREADS: + if not self._is_tune_config_supported(algo, nb, nt): + continue # Feasibility — sync result across ranks so all agree ret = run(algo, nb, nt) - torch.cuda.synchronize() self._time_buf[0] = float(ret) self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=self.symmetric_memory) if self._time_buf[0].item() != 0: continue + torch.cuda.synchronize() used.add(algo) # Warmup @@ -341,7 +352,7 @@ def _bench_sizes(low=5 * 1024, high=80 << 20): def benchmark_allreduce( - comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100 + comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=5, n_graph_launches=5, n_iter=50 ): sizes = _bench_sizes() if comm.rank == 0: @@ -382,7 +393,7 @@ def benchmark_allreduce( print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}") -def benchmark_allgather(comm: 
CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100): +def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=5, n_graph_launches=5, n_iter=50): sizes = _bench_sizes() if comm.rank == 0: print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}") diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index a0bc0e26e..801bed626 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -231,7 +231,12 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize, ctx->ipcDomainNranks, dtype); + blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->ipcDomainNranks, ctx->workSize, dtype); + } else { + const int nPeers = ctx->workSize - 1; + if (nPeers > 0 && blockAndThreadNum.first < nPeers) { + return CommResult::CommInvalidArgument; + } } size_t sendBytes; From f32cfb1fb87be2adce4a33b695ccec43441bf3bc Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 16 May 2026 19:29:18 +0000 Subject: [PATCH 37/44] update --- .../customized_comm_with_tuning.py | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index 0a07ca325..040fda584 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -54,16 +54,12 @@ def _round_pow2(size: int) -> int: # -- CustomizedComm ----------------------------------------------------------- -def _env_int(name: str, default: int) -> int: - return int(os.environ.get(name, 
default)) - - class CustomizedComm: """Exposes all_reduce, all_gather, barrier with lazy per-size tuning.""" - _TUNE_N_WARMUP = _env_int("TUNE_N_WARMUP", 2) - _TUNE_N_GRAPH_LAUNCHES = _env_int("TUNE_N_GRAPH_LAUNCHES", 3) - _TUNE_N_OPS_PER_GRAPH = _env_int("TUNE_N_OPS_PER_GRAPH", 20) + _TUNE_N_WARMUP = 3 + _TUNE_N_GRAPH_LAUNCHES = 5 + _TUNE_N_OPS_PER_GRAPH = 50 _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 112, 128] _CANDIDATE_NTHREADS = [512, 768, 1024] _NBLOCKS_LIMIT = { @@ -88,10 +84,10 @@ class CustomizedComm: ("default_allreduce_rsag", 512 << 10, None, None), ] _AR_CANDIDATES_SINGLE = [ - ("default_allreduce_packet", 0, 512 << 10, None), - ("default_allreduce_allpair_packet", 0, 128 << 10, None), - ("default_allreduce_nvls_packet", 0, 64 << 10, lambda c: c._nvls), - ("default_allreduce_rsag_zero_copy", 512 << 10, None, lambda c: not (c._nvls and c.symmetric_memory)), + ("default_allreduce_packet", 0, 4 << 20, None), + ("default_allreduce_allpair_packet", 0, 512 << 10, None), + ("default_allreduce_nvls_packet", 0, 512 << 10, lambda c: c._nvls), + ("default_allreduce_rsag_zero_copy", 512 << 10, None, None), ("default_allreduce_nvls_zero_copy", 512 << 10, None, lambda c: c._nvls and c.symmetric_memory), ("default_allreduce_fullmesh", 0, None, lambda c: torch.version.hip is not None), ] @@ -228,11 +224,6 @@ def _run_tune(self, collective, algo, buf, size, nb, nt): symmetric_memory=False, ) - def _is_tune_config_supported(self, algo, nb, nt): - if algo.name in ("default_allreduce_packet", "default_allreduce_allpair_packet"): - return nb >= self.world_size - 1 and nt in (512, 1024) - return True - def _tune_size(self, collective: str, target_size: int): """Auto-tune one (collective, target_size) pair and cache result.""" buf = self._ensure_tune_bufs() @@ -248,15 +239,13 @@ def _tune_size(self, collective: str, target_size: int): if nb > nb_limit: continue for nt in self._CANDIDATE_NTHREADS: - if not self._is_tune_config_supported(algo, nb, nt): - continue # 
Feasibility — sync result across ranks so all agree ret = run(algo, nb, nt) + torch.cuda.synchronize() self._time_buf[0] = float(ret) self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=self.symmetric_memory) if self._time_buf[0].item() != 0: continue - torch.cuda.synchronize() used.add(algo) # Warmup From 594dc79657bc15cbfa9762d986bb4756726794c2 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Sat, 16 May 2026 23:19:25 +0000 Subject: [PATCH 38/44] Address NVLS review feedback Handle unsupported FP8 NVLS paths safely, tighten IPC-domain guards, align IPC-domain naming, and add IPC-domain fabric hash logging. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/switch_channel_device.hpp | 6 +- src/core/bootstrap/bootstrap.cc | 2 + src/core/include/execution_kernel.hpp | 10 ++- .../allgather/allgather_fullmesh.cu | 10 +-- .../allgather/allgather_fullmesh_2.cu | 12 ++-- .../allreduce/allreduce_allpair_packet.cu | 29 +++++---- .../allreduce/allreduce_fullmesh.cu | 12 ++-- .../allreduce_nvls_block_pipeline.cu | 32 ++++----- .../allreduce/allreduce_nvls_packet.cu | 4 +- .../allreduce/allreduce_nvls_warp_pipeline.cu | 30 ++++----- .../allreduce/allreduce_nvls_zero_copy.cu | 33 ++++++---- .../collectives/allreduce/allreduce_packet.cu | 38 ++++++----- .../collectives/allreduce/allreduce_rsag.cu | 26 ++++---- .../allreduce/allreduce_rsag_pipeline.cu | 24 +++---- .../allreduce/allreduce_rsag_zero_copy.cu | 18 +++-- .../allreduce_nvls_block_pipeline.hpp | 2 +- .../allreduce_nvls_warp_pipeline.hpp | 2 +- .../collectives/include/allreduce/common.hpp | 65 +++++++++++-------- .../collectives/include/collective_utils.hpp | 8 +-- 19 files changed, 203 insertions(+), 160 deletions(-) diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index e95dfcf51..df22bd3aa 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -155,7 +155,7 @@ 
struct SwitchChannelDeviceHandle { } #endif else { - assert(false && "Unsupported vector type for multimemLoadReduce"); + static_assert(dependentFalse, "Unsupported vector type for multimemLoadReduce"); } return val; }; @@ -223,7 +223,7 @@ struct SwitchChannelDeviceHandle { } #endif else { - assert(false && "Unsupported vector type for multimemStore"); + static_assert(dependentFalse, "Unsupported vector type for multimemStore"); } }; @@ -248,7 +248,7 @@ struct SwitchChannelDeviceHandle { } else if constexpr (std::is_same_v && std::is_same_v) { asm volatile("multimem.red.relaxed.sys.global.add.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory"); } else { - assert(false && "Unsupported vector type for multimemStoreReduce"); + static_assert(dependentFalse, "Unsupported vector type for multimemStoreReduce"); } }; #endif // defined(MSCCLPP_DEVICE_CUDA) diff --git a/src/core/bootstrap/bootstrap.cc b/src/core/bootstrap/bootstrap.cc index a58357519..ffdd9c1cc 100644 --- a/src/core/bootstrap/bootstrap.cc +++ b/src/core/bootstrap/bootstrap.cc @@ -468,6 +468,8 @@ int TcpBootstrap::Impl::getNranksPerIpcDomain() { ++nRanksPerIpcDomain; } } + INFO(MSCCLPP_INIT, "rank %d IPC domain fabric hash 0x%016llx nRanksPerIpcDomain %d", rank_, + static_cast(ipcDomainHashes[rank_]), nRanksPerIpcDomain); nRanksPerIpcDomain_ = nRanksPerIpcDomain; return nRanksPerIpcDomain_; } diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index cb808bc8c..e9095ada6 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -525,7 +525,15 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3 if constexpr (std::is_same_v) { assert(false && "MULTI_LOAD_REDUCE_STORE is not supported for uint8_t data type"); return; - } else { + } +#if defined(__FP8_TYPES_EXIST__) && \ + (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)) + else if constexpr 
(std::is_same_v || std::is_same_v) { + assert(false && "FP8 MULTI_LOAD_REDUCE_STORE requires sm_100a or newer"); + return; + } +#endif + else { static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes"); const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); if (size <= 0) { diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 84dd4d473..570a2d612 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -11,8 +11,8 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) allgatherFullmesh(void* buff, void* scratch, void* resultBuff, DeviceHandle* memoryChannels, - int rank, int ipcDomainNranks, [[maybe_unused]] int worldSize, size_t nelems) { - const int nPeer = ipcDomainNranks - 1; + int rank, int nRanksPerIpcDomain, [[maybe_unused]] int worldSize, size_t nelems) { + const int nPeer = nRanksPerIpcDomain - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by 16 const size_t nInt4 = nelems * sizeof(int) / sizeof(int4); @@ -127,11 +127,11 @@ CommResult AllgatherFullmesh::allgatherKernelFunc(const std::shared_ptr ct if ((char*)input == (char*)output + rank * inputSize) { allgatherFullmesh<<>>( (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank, - ctx->ipcDomainNranks, ctx->workSize, nElem); + ctx->nRanksPerIpcDomain, ctx->workSize, nElem); } else { allgatherFullmesh<<>>( (void*)input, this->scratchBuffer_, (void*)output, ctx->memoryChannelDeviceHandles.get(), rank, - ctx->ipcDomainNranks, ctx->workSize, nElem); + ctx->nRanksPerIpcDomain, ctx->workSize, nElem); } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -148,7 +148,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = 
comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection); diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 5a353922f..f344824f7 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -12,15 +12,15 @@ __device__ DeviceSyncer deviceSyncer; template __global__ void __launch_bounds__(1024, 1) allgatherFullmesh2(void* sendbuff, mscclpp::DeviceHandle* memoryChannels, - size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t ipcDomainNranks, - size_t nelemsPerGPU) { + size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, + size_t nRanksPerIpcDomain, size_t nelemsPerGPU) { const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; const size_t lid = tid % WARP_SIZE; const size_t wid = tid / WARP_SIZE; const size_t nThread = blockDim.x * gridDim.x; const size_t nWarp = nThread / WARP_SIZE; - const size_t nPeer = ipcDomainNranks - 1; + const size_t nPeer = nRanksPerIpcDomain - 1; const size_t chanOffset = nPeer * blockIdx.x; auto memChans = memoryChannels + chanOffset; @@ -140,11 +140,11 @@ CommResult AllgatherFullmesh2::allgatherKernelFunc(const std::shared_ptr c if ((char*)input == (char*)output + rank * inputSize) { allgatherFullmesh2<<>>( (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize, - ctx->ipcDomainNranks, nElem); + ctx->nRanksPerIpcDomain, nElem); } else { allgatherFullmesh2<<>>( (void*)input, ctx->memoryChannelDeviceHandles.get(), channelOutOffset, ctx->rank, ctx->workSize, - ctx->ipcDomainNranks, nElem); + ctx->nRanksPerIpcDomain, nElem); } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) 
{ @@ -159,7 +159,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = this->memorySemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 29ef2055b..47c4f61d9 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -14,11 +14,11 @@ namespace collective { template __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks, + size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, uint32_t flagSize) { if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); - const int nPeers = ipcDomainNranks - 1; + const int nPeers = nRanksPerIpcDomain - 1; uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % numScratchBuff) ? 
(scratchBufferSize / numScratchBuff) : 0; @@ -72,19 +72,17 @@ template struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, - size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); // Round nBlocks to multiple of nPeers so every block maps to a valid peer. - const int nPeers = worldSize - 1; - if (nPeers > 0) { - nBlocks = (nBlocks / nPeers) * nPeers; - } + const int nPeers = nRanksPerIpcDomain - 1; + nBlocks = (nBlocks / nPeers) * nPeers; allreduceAllPairs<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - ipcDomainNranks, worldSize, nelems, numScratchBuff, flags, flagSize); + nRanksPerIpcDomain, worldSize, nelems, numScratchBuff, flags, flagSize); return cudaGetLastError(); } }; @@ -103,13 +101,18 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr&, DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); + if (algoCtx->workSize != algoCtx->nRanksPerIpcDomain) { + WARN("AllreduceAllpairPacket requires workSize to match nRanksPerIpcDomain, got workSize=%d, nRanksPerIpcDomain=%d", + algoCtx->workSize, algoCtx->nRanksPerIpcDomain); + return CommResult::CommInvalidArgument; + } std::pair blockAndThreadNum{nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize); + blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->nRanksPerIpcDomain); } // nBlocks must be at 
least nPeers for allpair — each block maps to one peer. - const int nPeers = algoCtx->ipcDomainNranks - 1; - if (nPeers > 0 && blockAndThreadNum.first < nPeers) { + const int nPeers = algoCtx->nRanksPerIpcDomain - 1; + if (blockAndThreadNum.first < nPeers) { return CommResult::CommInvalidArgument; } size_t sendBytes; @@ -124,7 +127,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr, - nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->ipcDomainNranks, + nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { @@ -140,7 +143,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p const int nChannelsPerConnection = maxBlockNum_; ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index b158f817c..2790295e4 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -13,8 +13,8 @@ template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, int rank, - int ipcDomainNranks, int worldSize, size_t 
nelems) { - const int nPeer = ipcDomainNranks - 1; + int nRanksPerIpcDomain, int worldSize, size_t nelems) { + const int nPeer = nRanksPerIpcDomain - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); @@ -157,7 +157,7 @@ template struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, - size_t channelOutDataOffset, size_t, int rank, int ipcDomainNranks, int worldSize, + size_t channelOutDataOffset, size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; @@ -166,7 +166,7 @@ struct AllreduceAllconnectAdapter { if (nThreadsPerBlock == 0) nThreadsPerBlock = 512; allreduceFullmesh<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels, - channelOutDataOffset, rank, ipcDomainNranks, worldSize, nelems); + channelOutDataOffset, rank, nRanksPerIpcDomain, worldSize, nelems); return cudaGetLastError(); } }; @@ -223,7 +223,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc( } cudaError_t error = allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(), - nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, + nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error)); @@ -250,7 +250,7 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize 
= comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup semaphores ctx->memorySemaphores = this->outputSemaphores_; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 890e50f58..347ce8b41 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -21,15 +21,15 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] DeviceHandle* memoryChannels, [[maybe_unused]] DeviceHandle* switchChannels, [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, - [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) { + [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; - int nPeers = ipcDomainNranks - 1; - int nBlocksForCopy = ipcDomainNranks * 2; - int nBlocksForReduce = ipcDomainNranks; + int nPeers = nRanksPerIpcDomain - 1; + int nBlocksForCopy = nRanksPerIpcDomain * 2; + int nBlocksForReduce = nRanksPerIpcDomain; int copyReduceRatio = nBlocksForCopy / nBlocksForReduce; - size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks; - size_t sizePerRank = size / ipcDomainNranks; + size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain; + size_t sizePerRank = size / nRanksPerIpcDomain; assert(sizePerRank % alignment == 0); uint32_t sizePerBlock = ((sizePerRank + (nBlocksForCopy - 1)) / nBlocksForCopy + alignment - 1) / alignment * alignment; @@ -69,7 +69,7 @@ __global__ void __launch_bounds__(1024, 1) deviceSemaphore[bid + 2 * nBlocksForCopy].acquire(); } __syncthreads(); - for (int i = 0; i < ipcDomainNranks; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t blockOffset = it * unitSize + bid * sizePerBlock + i 
* sizePerRank; uint32_t scratchOffset = scratchIt * unitSize + bid * scratchSizePerBlock + i * scratchSizePerRank; char* srcData = (char*)src + blockOffset; @@ -126,7 +126,7 @@ __global__ void __launch_bounds__(1024, 1) channels->wait(); } __syncthreads(); - for (int i = 0; i < ipcDomainNranks; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t blockOffset = it * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * sizePerBlock + i * sizePerRank; uint32_t scratchOffset = scratchIt * unitSize + (bid - nBlocksForCopy - nBlocksForReduce) * scratchSizePerBlock + @@ -151,7 +151,7 @@ template struct NvlsBlockPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, - size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -169,7 +169,7 @@ struct NvlsBlockPipelineAdapter { using ChannelType = DeviceHandle; allreduceNvlsBlockPipeline<<>>( input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank, - ipcDomainNranks); + nRanksPerIpcDomain); return cudaGetLastError(); } } @@ -177,9 +177,9 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain(); - // Per-peer channel allocation must hold up to 4 * ipcDomainNranks entries (see kernel). - nBaseChannels_ = std::max(64, 4 * ipcDomainNranks_); + nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain(); + // Per-peer channel allocation must hold up to 4 * nRanksPerIpcDomain entries (see kernel). 
+ nBaseChannels_ = std::max(64, 4 * nRanksPerIpcDomain_); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -202,11 +202,11 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc( } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->ipcDomainNranks * 5, 1024}; + blockAndThreadNum = {ctx->nRanksPerIpcDomain * 5, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0, + ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error)); @@ -224,7 +224,7 @@ std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index e8ecfb737..f16e8b05f 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -95,7 +95,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = 
comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = this->switchChannels_; @@ -124,7 +124,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr } cudaError_t error = allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, - 0, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, + 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error)); diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 68efc2ab0..ba447d32a 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -19,15 +19,15 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] DeviceHandle* memoryChannels, [[maybe_unused]] DeviceHandle* multicast, [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, - [[maybe_unused]] int ipcDomainNranks) { + [[maybe_unused]] int nRanksPerIpcDomain) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; - int nPeers = ipcDomainNranks - 1; + int nPeers = nRanksPerIpcDomain - 1; int nBlocks = gridDim.x; int nBlocksPerNvlsConn = nBlocks / NUM_NVLS_CONNECTION; int bid = blockIdx.x; - size_t sizePerRank = size / ipcDomainNranks; - size_t scratchSizePerRank = scratchBufferSize / ipcDomainNranks; + size_t sizePerRank = size / nRanksPerIpcDomain; + size_t scratchSizePerRank = scratchBufferSize / nRanksPerIpcDomain; const size_t maxSizePerBlock = ((sizePerRank + nBlocks - 1) / nBlocks + alignment - 1) / alignment * 
alignment; size_t start = bid * maxSizePerBlock; size_t end = min(start + maxSizePerBlock, sizePerRank); @@ -54,7 +54,7 @@ __global__ void __launch_bounds__(1024, 1) lastIterSize = sizePerBlock % copyPerIter; } - const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x * 2; + const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x * 2; auto memoryChans = memoryChannels + chanOffset; __shared__ DeviceHandle channels[(MAX_IPC_DOMAIN_NRANKS - 1) * 2]; const int lid = threadIdx.x % WARP_SIZE; @@ -67,7 +67,7 @@ __global__ void __launch_bounds__(1024, 1) const size_t iterSize = (it == nIter - 1) ? lastIterSize : copyPerIter; if (warpId < endCopyWid) { int tidInCopy = threadIdx.x; - for (int i = 0; i < ipcDomainNranks; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter; size_t offsetScratch = i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock; @@ -98,7 +98,7 @@ __global__ void __launch_bounds__(1024, 1) channels[tidInRecvCopy + nPeers].wait(); } asm volatile("bar.sync %0, %1;" ::"r"(3), "r"((NRECV_COPY_WARPS)*WARP_SIZE) : "memory"); - for (int i = 0; i < ipcDomainNranks; i++) { + for (int i = 0; i < nRanksPerIpcDomain; i++) { size_t offset = i * sizePerRank + maxSizePerBlock * bid + it * copyPerIter; size_t offsetScratch = i * scratchSizePerRank + scratchSizePerBlock * bid + (it * copyPerIter) % scratchSizePerBlock; @@ -115,7 +115,7 @@ template struct NvlsWarpPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, - size_t scratchBufferSize, int rank, int ipcDomainNranks, int, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support 
for byte-level reduction) if constexpr (std::is_same_v) { @@ -133,7 +133,7 @@ struct NvlsWarpPipelineAdapter { using ChannelType = DeviceHandle; allreduceNvlsWarpPipeline<<>>( input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank, - ipcDomainNranks); + nRanksPerIpcDomain); return cudaGetLastError(); } } @@ -141,9 +141,9 @@ struct NvlsWarpPipelineAdapter { void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = NUM_NVLS_CONNECTION; - ipcDomainNranks_ = comm->bootstrap()->getNranksPerIpcDomain(); - // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * ipcDomainNranks. - nBaseChannels_ = std::max(64, 8 * ipcDomainNranks_); + nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain(); + // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * nRanksPerIpcDomain. + nBaseChannels_ = std::max(64, 8 * nRanksPerIpcDomain_); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -166,11 +166,11 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = {ctx->ipcDomainNranks * 4, 1024}; + blockAndThreadNum = {ctx->nRanksPerIpcDomain * 4, 1024}; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, stream, nullptr, 0, 0, + ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error)); @@ -188,7 +188,7 @@ std::shared_ptr 
AllreduceNvlsWarpPipeline::initAllreduceContext(std::share auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // setup channels ctx->switchChannels = diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index a6f699b2e..32fc61423 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -20,12 +20,12 @@ __global__ void __launch_bounds__(1024, 1) [[maybe_unused]] mscclpp::DeviceHandle* multicast, [[maybe_unused]] mscclpp::DeviceHandle* multicastOut, [[maybe_unused]] size_t channelInOffset, [[maybe_unused]] size_t channelOutOffset, - [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int ipcDomainNranks) { + [[maybe_unused]] size_t size, [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerIpcDomain) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - int nPeers = ipcDomainNranks - 1; + int nPeers = nRanksPerIpcDomain - 1; int nBlocks = gridDim.x; int bid = blockIdx.x; - size_t sizePerRank = size / ipcDomainNranks; + size_t sizePerRank = size / nRanksPerIpcDomain; const size_t minAlign = 16; // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks; @@ -41,12 +41,12 @@ __global__ void __launch_bounds__(1024, 1) mscclpp::DeviceHandle* multicastPtr = multicast + bid; mscclpp::DeviceHandle* multicastOutPtr = multicastOut + bid; - const size_t chanOffset = (ipcDomainNranks - 1) * blockIdx.x; + const size_t chanOffset = (nRanksPerIpcDomain - 1) * blockIdx.x; auto memoryChans = memoryChannels + chanOffset; __shared__ mscclpp::DeviceHandle channels[MAX_IPC_DOMAIN_NRANKS - 1]; const int 
lid = threadIdx.x % WARP_SIZE; // Peer count may exceed WARP_SIZE on MNNVL. - for (int i = lid; i < ipcDomainNranks - 1; i += WARP_SIZE) { + for (int i = lid; i < nRanksPerIpcDomain - 1; i += WARP_SIZE) { channels[i] = memoryChans[i]; } __syncwarp(); @@ -74,7 +74,7 @@ struct NvlsAdapter { static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*, mscclpp::DeviceHandle* nvlsChannels, mscclpp::DeviceHandle* nvlsOutChannels, size_t channelInOffset, - size_t channelOutOffset, size_t, int rank, int ipcDomainNranks, int, size_t inputSize, + size_t channelOutOffset, size_t, int rank, int nRanksPerIpcDomain, int, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { @@ -86,7 +86,7 @@ struct NvlsAdapter { using ChannelType = DeviceHandle; allreduceNvls<<>>( (ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, inputSize, - rank, ipcDomainNranks); + rank, nRanksPerIpcDomain); return cudaGetLastError(); } } @@ -120,6 +120,13 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo return CommResult::CommInvalidArgument; } auto ctx = std::static_pointer_cast(ctx_void); +#if defined(__FP8_TYPES_EXIST__) + bool isFp8Dtype = dtype == mscclpp::DataType::FLOAT8_E4M3FN || dtype == mscclpp::DataType::FLOAT8_E5M2; + if (isFp8Dtype && computeCapabilityMajor_ < 10) { + WARN("FP8 NVLS allreduce requires compute capability 10.x or newer."); + return CommResult::CommInvalidArgument; + } +#endif AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); @@ -138,7 +145,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; if (numBlocksAndThreads.first == 0 
|| numBlocksAndThreads.second == 0) { - numBlocksAndThreads = {::min(ctx->ipcDomainNranks, MAX_NBLOCKS), 1024}; + numBlocksAndThreads = {::min(ctx->nRanksPerIpcDomain, MAX_NBLOCKS), 1024}; // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS]. @@ -152,9 +159,13 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels, - nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, - inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerIpcDomain, + ctx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { + if (error == cudaErrorNotSupported) { + WARN("AllreduceNvls does not support the requested data type."); + return CommResult::CommInvalidArgument; + } WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } @@ -176,7 +187,7 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index 801bed626..d20625eea 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ 
b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -15,7 +15,7 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, - size_t channelDataOffset, size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, + size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff #if defined(ENABLE_NPKIT) , @@ -53,7 +53,7 @@ __global__ void __launch_bounds__(1024, 1) else nelems = nelems / (sizeof(int) / sizeof(T)); - const int nPeers = ipcDomainNranks - 1; + const int nPeers = nRanksPerIpcDomain - 1; const size_t nPkts = nelems / 2; uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; @@ -154,31 +154,32 @@ template struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, - size_t scratchBufferSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, + size_t scratchBufferSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); - // Optimize the number of blocks to be multiple of (worldSize - 1) - nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); + // Optimize the number of blocks to be multiple of the IPC-domain peer count. 
+ const int nPeers = nRanksPerIpcDomain - 1; + nBlocks = nBlocks / nPeers * nPeers; #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), - NpKit::GetCpuTimestamp()); + nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff, + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - ipcDomainNranks, worldSize, nelems, flags, flagBufferSize, numScratchBuff); + nRanksPerIpcDomain, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif return cudaGetLastError(); } }; -inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int ipcDomainNranks, int worldSize, +inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerIpcDomain, int worldSize, [[maybe_unused]] DataType dtype) { - int nBlocks = (ipcDomainNranks - 1) * 4; + int nBlocks = (nRanksPerIpcDomain - 1) * 4; int nThreadsPerBlock = 1024; if (inputSize >= 32768) { nBlocks = (worldSize - 1) * 8; @@ -229,12 +230,17 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ const std::unordered_map&, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); + if (ctx->workSize != ctx->nRanksPerIpcDomain) { + WARN(ALGO, "AllreducePacket requires workSize to match nRanksPerIpcDomain, got workSize=", ctx->workSize, + ", nRanksPerIpcDomain=", ctx->nRanksPerIpcDomain); + return CommResult::CommInvalidArgument; + } std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { - blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, 
ctx->ipcDomainNranks, ctx->workSize, dtype); + blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->nRanksPerIpcDomain, ctx->workSize, dtype); } else { - const int nPeers = ctx->workSize - 1; - if (nPeers > 0 && blockAndThreadNum.first < nPeers) { + const int nPeers = ctx->nRanksPerIpcDomain - 1; + if (blockAndThreadNum.first < nPeers) { return CommResult::CommInvalidArgument; } } @@ -252,8 +258,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ } cudaError_t error = allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr, - channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->ipcDomainNranks, ctx->workSize, inputSize, - stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, + channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerIpcDomain, ctx->workSize, + inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error)); @@ -268,7 +274,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptrrank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->memorySemaphores_; ctx->registeredMemories = this->registeredMemories_; ctx->registeredMemories.pop_back(); // remove the local memory from previous context diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index 22e3a4ee4..f07e0e2c8 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -31,18 +31,18 @@ namespace collective { template 
__global__ void __launch_bounds__(1024, 1) allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, - DeviceHandle* switchChannels, void* remoteMemories, int rank, int ipcDomainNranks, + DeviceHandle* switchChannels, void* remoteMemories, int rank, int nRanksPerIpcDomain, int worldSize, size_t nelems) { int blockId = blockIdx.x; - uint32_t nPeers = ipcDomainNranks - 1; + uint32_t nPeers = nRanksPerIpcDomain - 1; assert((uintptr_t)buff % sizeof(int4) == 0); assert((uintptr_t)resultBuff % sizeof(int4) == 0); constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); - uint32_t alignedNelems = ((nelems + ipcDomainNranks - 1) / ipcDomainNranks + nelemsPerInt4 - 1) / nelemsPerInt4 * - nelemsPerInt4 * ipcDomainNranks; - uint32_t nelemsPerRank = alignedNelems / ipcDomainNranks; + uint32_t alignedNelems = ((nelems + nRanksPerIpcDomain - 1) / nRanksPerIpcDomain + nelemsPerInt4 - 1) / + nelemsPerInt4 * nelemsPerInt4 * nRanksPerIpcDomain; + uint32_t nelemsPerRank = alignedNelems / nRanksPerIpcDomain; uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; uint32_t lastInt4Index = nelems / nelemsPerInt4; uint32_t remainder = nelems % nelemsPerInt4; @@ -59,7 +59,7 @@ __global__ void __launch_bounds__(1024, 1) nInt4PerBlock += remainderForBlock; } if (nInt4PerBlock == 0) return; - uint32_t nInt4ForCopy = nInt4PerBlock * ipcDomainNranks; + uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerIpcDomain; for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) { int rankIdx = idx / nInt4PerBlock; @@ -84,13 +84,13 @@ __global__ void __launch_bounds__(1024, 1) if (offset > lastInt4Index) continue; int4 tmp = scratch4[offset]; for (uint32_t i = 0; i < nPeers; i++) { - int rankIdx = (rank + i + 1) % ipcDomainNranks; + int rankIdx = (rank + i + 1) % nRanksPerIpcDomain; int peerIdx = rankIdx < rank ? 
rankIdx : rankIdx - 1; int4 data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); tmp = calVector(data, tmp); } for (uint32_t i = 0; i < nPeers; i++) { - int rankIdx = (rank + i + 1) % ipcDomainNranks; + int rankIdx = (rank + i + 1) % nRanksPerIpcDomain; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; mscclpp::write(((void**)remoteMemories)[peerIdx], offset, tmp); } @@ -127,8 +127,8 @@ template struct AllreduceRsAgAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, - void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0 || nThreadsPerBlock == 0) { @@ -137,7 +137,7 @@ struct AllreduceRsAgAdapter { } allreduceRsAg<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - ipcDomainNranks, worldSize, nelems); + nRanksPerIpcDomain, worldSize, nelems); return cudaGetLastError(); } }; @@ -185,7 +185,7 @@ CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, c } cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank, - algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, + algoCtx->nRanksPerIpcDomain, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); @@ -203,7 +203,7 @@ 
std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu index bedf15c50..e9d543eaa 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -86,15 +86,15 @@ template __global__ void __launch_bounds__(1024, 1) allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, - int ipcDomainNranks, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut, - uint32_t nblocksForReduce, uint32_t nblocksForRecv) { + int nRanksPerIpcDomain, int worldSize, size_t nelems, size_t scratchSize, + uint32_t nblocksForPut, uint32_t nblocksForReduce, uint32_t nblocksForRecv) { uint32_t bid = blockIdx.x; constexpr uint32_t nStepsPerIter = 4; uint32_t nInt4 = (nelems * sizeof(T) + sizeof(int4) - 1) / sizeof(int4); uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter; const uint32_t chunkSize = nInt4PerIter * worldSize; uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize; - uint32_t nPeers = ipcDomainNranks - 1; + uint32_t nPeers = nRanksPerIpcDomain - 1; int4* scratch4 = reinterpret_cast((char*)scratch); const uint32_t scratchIterStride = 2 * chunkSize; // one for AS, one for AG const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride; @@ -111,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x; for (uint32_t peer = 0; 
peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % ipcDomainNranks; + int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain; int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1; // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot) uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter; @@ -164,7 +164,7 @@ __global__ void __launch_bounds__(1024, 1) int4 tmp = loadVec(buff, myChunkOffset, nelems); // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer]) for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % ipcDomainNranks; + int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain; uint32_t peerSlotOffset = baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; int4 data = scratch4[peerSlotOffset]; @@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024, 1) uint32_t dstOffset = baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; for (uint32_t i = 0; i < nPeers; i++) { - int peerIdx = (rank + i + 1) % ipcDomainNranks; + int peerIdx = (rank + i + 1) % nRanksPerIpcDomain; int index = peerIdx < rank ? 
peerIdx : peerIdx - 1; mscclpp::write(((void**)remoteMemories)[index], dstOffset, tmp); } @@ -203,7 +203,7 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); // Copy other ranks' reduced chunks from scratch to result for (uint32_t peer = 0; peer < nPeers; peer++) { - int remoteRankId = (rank + peer + 1) % ipcDomainNranks; + int remoteRankId = (rank + peer + 1) % nRanksPerIpcDomain; for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv + step * blockDim.x * nblocksForRecv; @@ -224,7 +224,7 @@ template struct AllreduceRsAgPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t scratchSize, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, + size_t scratchSize, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); @@ -248,7 +248,7 @@ struct AllreduceRsAgPipelineAdapter { } allreduceRsAgPipeline<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, - ipcDomainNranks, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); + nRanksPerIpcDomain, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); return cudaGetLastError(); } }; @@ -288,8 +288,8 @@ CommResult AllreduceRsAgPipeline::allreduceKernelFunc( std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_, - algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, 
inputSize, stream, nullptr, - 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->workSize, inputSize, stream, + nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -306,7 +306,7 @@ std::shared_ptr AllreduceRsAgPipeline::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->scratchSemaphores_; ctx->registeredMemories = this->remoteScratchMemories_; diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index 10d3a35c2..753ad7999 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -116,8 +116,8 @@ template struct AllreduceRsAgZeroCopyAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, - size_t, int rank, int ipcDomainNranks, int worldSize, size_t inputSize, cudaStream_t stream, - void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t, int rank, int nRanksPerIpcDomain, int worldSize, size_t inputSize, + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0 || nThreadsPerBlock == 0) { @@ -127,16 +127,16 @@ struct AllreduceRsAgZeroCopyAdapter { nBlocks = 128; } } - if (ipcDomainNranks == 4) { + if (nRanksPerIpcDomain == 4) { 
allreduceRsAgZeroCopy<4, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); - } else if (ipcDomainNranks == 8) { + } else if (nRanksPerIpcDomain == 8) { allreduceRsAgZeroCopy<8, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); } else { - WARN(ALGO, "AllreduceRsAgZeroCopy only supports ipcDomainNranks of 4 or 8, got: ", ipcDomainNranks); + WARN(ALGO, "AllreduceRsAgZeroCopy only supports nRanksPerIpcDomain of 4 or 8, got: ", nRanksPerIpcDomain); return cudaErrorInvalidValue; } return cudaGetLastError(); @@ -172,9 +172,13 @@ CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptrbaseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(), - nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->ipcDomainNranks, algoCtx->workSize, inputSize, + nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerIpcDomain, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { + if (error == cudaErrorInvalidValue) { + WARN(ALGO, "AllreduceRsAgZeroCopy received invalid launch arguments: ", cudaGetErrorString(error)); + return CommResult::CommInvalidArgument; + } WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } @@ -200,7 +204,7 @@ std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); - ctx->ipcDomainNranks = comm->bootstrap()->getNranksPerIpcDomain(); + ctx->nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); ctx->memorySemaphores = this->semaphores_; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp 
b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 9a1742db1..5662d1163 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -29,7 +29,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; uint32_t nSwitchChannels_; - int ipcDomainNranks_ = 0; + int nRanksPerIpcDomain_ = 0; int nBaseChannels_ = 0; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index e2aa8c873..f347c871f 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -29,7 +29,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; uint32_t nSwitchChannels_; - int ipcDomainNranks_ = 0; + int nRanksPerIpcDomain_ = 0; int nBaseChannels_ = 0; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 22513ace5..5d593449c 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -39,34 +39,43 @@ MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() { template MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t srcOffset, size_t dstOffset, size_t size, int tid, int nThreads) { - // nvls can only handle 4 bytes alignment - MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); - constexpr size_t nElem = calcVectorSize(); - // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations - 
constexpr size_t vecSize = (std::is_same_v || std::is_same_v || std::is_same_v || - std::is_same_v) - ? 1 - : nElem; - using vectorType = mscclpp::VectorType; - const size_t nVec = size / sizeof(vectorType); - const size_t srcOffset4 = srcOffset / sizeof(vectorType); - const size_t dstOffset4 = dstOffset / sizeof(vectorType); - vectorType* src4 = (vectorType*)src; - vectorType* dst4 = (vectorType*)dst; - for (size_t idx = tid; idx < nVec; idx += nThreads) { - auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx); - mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx); - } - // handle rest of data - size_t processed = nVec * sizeof(vectorType); - constexpr size_t nRestElem = 4 / sizeof(T); - using restVectorType = mscclpp::VectorType; - const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType); - const size_t endIdx = (srcOffset + size) / sizeof(restVectorType); - for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) { - auto val = - mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx); - mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx); +#if defined(__FP8_TYPES_EXIST__) && \ + (!(defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)) + if constexpr (std::is_same_v || std::is_same_v) { + assert(false && "FP8 NVLS multimem requires sm_100a or newer"); + return; + } else +#endif + { + // nvls can only handle 4 bytes alignment + MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); + constexpr size_t nElem = calcVectorSize(); + // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations + constexpr size_t vecSize = (std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v) + ? 
1 + : nElem; + using vectorType = mscclpp::VectorType; + const size_t nVec = size / sizeof(vectorType); + const size_t srcOffset4 = srcOffset / sizeof(vectorType); + const size_t dstOffset4 = dstOffset / sizeof(vectorType); + vectorType* src4 = (vectorType*)src; + vectorType* dst4 = (vectorType*)dst; + for (size_t idx = tid; idx < nVec; idx += nThreads) { + auto val = mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce(src4 + srcOffset4 + idx); + mscclpp::SwitchChannelDeviceHandle::multimemStore(val, dst4 + dstOffset4 + idx); + } + // handle rest of data + size_t processed = nVec * sizeof(vectorType); + constexpr size_t nRestElem = 4 / sizeof(T); + using restVectorType = mscclpp::VectorType; + const size_t startIdx = (srcOffset + processed) / sizeof(restVectorType); + const size_t endIdx = (srcOffset + size) / sizeof(restVectorType); + for (size_t idx = tid + startIdx; idx < endIdx; idx += nThreads) { + auto val = + mscclpp::SwitchChannelDeviceHandle::multimemLoadReduce((restVectorType*)src + idx); + mscclpp::SwitchChannelDeviceHandle::multimemStore(val, (restVectorType*)dst + idx); + } } } #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index c1cad4121..2e61b9379 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -27,8 +27,8 @@ namespace mscclpp { namespace collective { constexpr int NUM_NVLS_CONNECTION = 8; // Sized to cover MAX_IPC_DOMAIN_NRANKS-scale allreduce algos whose device-side -// semaphore indices grow as O(ipcDomainNranks) (e.g. nvls_block_pipeline uses -// up to ~5 * ipcDomainNranks entries). +// semaphore indices grow as O(nRanksPerIpcDomain) (e.g. nvls_block_pipeline uses +// up to ~5 * nRanksPerIpcDomain entries). 
constexpr int NUM_SEMAPHORES = 512; // Upper bound on the number of NVLink-reachable ranks that participate in a @@ -37,7 +37,7 @@ constexpr int NUM_SEMAPHORES = 512; // of shared-memory channel arrays in the allreduce/allgather kernels. constexpr int MAX_IPC_DOMAIN_NRANKS = 72; -constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB +constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // Two 70 MiB buffers for double-buffered packet scratch space. std::vector setupRemoteMemories(std::shared_ptr comm, int rank, RegisteredMemory localMemory); @@ -79,7 +79,7 @@ class AlgorithmCtx { public: int rank; int workSize; - int ipcDomainNranks; + int nRanksPerIpcDomain; std::vector registeredMemories; std::vector memoryChannels; From 18d37379d264f5b08e409ca3b99b3fd4c24f67cc Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Sat, 16 May 2026 23:23:30 +0000 Subject: [PATCH 39/44] Tighten NVML IPC domain hash lookup Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/core/utils_internal.cc | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc index 2e620b660..adbf8e5b7 100644 --- a/src/core/utils_internal.cc +++ b/src/core/utils_internal.cc @@ -208,30 +208,18 @@ uint64_t getFabricHash(const nvmlGpuFabricInfo_t& fabricInfo) { bool tryGetNvmlIpcDomainHash(uint64_t& ipcDomainHash) { // Use the current CUDA device; callers must set the rank's device before querying. 
int deviceId; - if (cudaGetDevice(&deviceId) != cudaSuccess) { - return false; - } - char pciBusId[] = "00000000:00:00.0"; - if (cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) { + if (cudaGetDevice(&deviceId) != cudaSuccess || + cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), deviceId) != cudaSuccess) { return false; } static NvmlState nvml; - if (!nvml.isInitialized()) { - return false; - } - nvmlDevice_t nvmlDevice; - if (nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS) { - return false; - } - nvmlGpuFabricInfo_t fabricInfo = {}; - if (nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS) { - return false; - } - if (fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) { + if (!nvml.isInitialized() || nvmlDeviceGetHandleByPciBusId_v2(pciBusId, &nvmlDevice) != NVML_SUCCESS || + nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfo) != NVML_SUCCESS || + fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfo.status != NVML_SUCCESS) { return false; } From 4db71b93b74fc137a5482f6a840b9161c8759e2d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 18 May 2026 20:50:01 +0000 Subject: [PATCH 40/44] Move barrier into setupNvlsChannels and clean up NVLS pipeline state - setupNvlsChannels now takes the Communicator and barriers internally after binding all switch channels, replacing the explicit bootstrap()->barrier() previously done only in AllreduceNvlsPacket. - Demote nRanksPerIpcDomain_ / nBaseChannels_ to locals in AllreduceNvlsBlockPipeline and AllreduceNvlsWarpPipeline; they were never read outside initialize(). - Drive-by: pick up in-tree edits to switch_channel_device.hpp, executor.cc, communicator.hpp, and allreduce_rsag.cu. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/switch_channel_device.hpp | 3 +-- src/core/executor/executor.cc | 1 + src/core/include/communicator.hpp | 1 + .../allreduce/allreduce_nvls_block_pipeline.cu | 11 +++++------ .../collectives/allreduce/allreduce_nvls_packet.cu | 3 +-- .../allreduce/allreduce_nvls_warp_pipeline.cu | 11 +++++------ .../collectives/allreduce/allreduce_nvls_zero_copy.cu | 6 +++--- src/ext/collectives/allreduce/allreduce_rsag.cu | 2 +- src/ext/collectives/collective_utils.cc | 8 ++++---- .../allreduce/allreduce_nvls_block_pipeline.hpp | 2 -- .../allreduce/allreduce_nvls_warp_pipeline.hpp | 2 -- src/ext/collectives/include/collective_utils.hpp | 3 ++- 12 files changed, 24 insertions(+), 29 deletions(-) diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index df22bd3aa..fcdd7fddb 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -39,8 +39,7 @@ struct SwitchChannelDeviceHandle { /// Vectorized multimem load+reduce. The optional `AccumT` template parameter selects the /// accumulator: when `AccumT == __half` and `VectorType` is an FP8 vector type, the - /// `.acc::f16` variant of the instruction is used (faster but lower precision than the - /// default FP32 accumulator). For all other types `AccumT` is ignored. + /// `.acc::f16` variant of the instruction is used. For all other types `AccumT` is ignored. 
template MSCCLPP_DEVICE_INLINE static VectorType multimemLoadReduce(VectorType* ptr) { VectorType val; diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc index fcecc4ddf..15c6af4e6 100644 --- a/src/core/executor/executor.cc +++ b/src/core/executor/executor.cc @@ -389,6 +389,7 @@ struct Executor::Impl { nvlsConnection->bindAllocatedMemory((CUdeviceptr)bufferInfo.first, bufferInfo.second); context.nvlsChannels.push_back(switchChannel); } + this->comm->bootstrap()->barrier(); } void setupSemaphores(ExecutionContext& context, const ExecutionPlan& plan) { diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp index 333cc9823..f15e20f74 100644 --- a/src/core/include/communicator.hpp +++ b/src/core/include/communicator.hpp @@ -60,6 +60,7 @@ struct Communicator::Impl { std::shared_ptr bootstrap_; std::shared_ptr context_; std::unordered_map connectionInfos_; + // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair. // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready. std::unordered_map, std::shared_ptr, PairHash> lastRecvItems_; diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 347ce8b41..04c7f8c99 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
-#include #include #include "allreduce/allreduce_nvls_block_pipeline.hpp" @@ -177,15 +176,15 @@ struct NvlsBlockPipelineAdapter { void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; - nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain(); + int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // Per-peer channel allocation must hold up to 4 * nRanksPerIpcDomain entries (see kernel). - nBaseChannels_ = std::max(64, 4 * nRanksPerIpcDomain_); + int nBaseChannels = std::max(64, 4 * nRanksPerIpcDomain); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = - setupMemorySemaphores(comm, this->conns_, nBaseChannels_); + setupMemorySemaphores(comm, this->conns_, nBaseChannels); // setup base memory channels - this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_); + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } @@ -228,7 +227,7 @@ std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shar // setup channels ctx->switchChannels = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index f16e8b05f..1918eef19 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -82,8 +82,7 @@ void AllreduceNvlsPacket::initialize(std::shared_ptr comm) { 
int nSwitchChannels = 1; this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); this->switchChannels_ = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); - comm->bootstrap()->barrier(); + setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); } AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index ba447d32a..d5bbb2e71 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#include #include #include "allreduce/allreduce_nvls_warp_pipeline.hpp" @@ -141,15 +140,15 @@ struct NvlsWarpPipelineAdapter { void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = NUM_NVLS_CONNECTION; - nRanksPerIpcDomain_ = comm->bootstrap()->getNranksPerIpcDomain(); + int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain(); // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * nRanksPerIpcDomain. 
- nBaseChannels_ = std::max(64, 8 * nRanksPerIpcDomain_); + int nBaseChannels = std::max(64, 8 * nRanksPerIpcDomain); this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = - setupMemorySemaphores(comm, this->conns_, nBaseChannels_); + setupMemorySemaphores(comm, this->conns_, nBaseChannels); // setup base memory channels - this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels_); + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } @@ -192,7 +191,7 @@ std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::share // setup channels ctx->switchChannels = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(comm, this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 32fc61423..481e8ad85 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include -#include #include "allreduce/allreduce_nvls_zero_copy.hpp" #include "allreduce/common.hpp" @@ -195,11 +194,12 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptrswitchChannels = setupNvlsChannels(this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_); + ctx->switchChannels = + setupNvlsChannels(comm, this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_); if (input != output) { auto nvlsOutConnections = this->nvlsOutConnections_; std::vector outChannels = - setupNvlsChannels(this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_); + setupNvlsChannels(comm, this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_); ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end()); } diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index f07e0e2c8..6fffc4dac 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -144,7 +144,7 @@ struct AllreduceRsAgAdapter { void AllreduceRsAg::initialize(std::shared_ptr comm) { this->conns_ = setupConnections(comm); - nChannelsPerConnection_ = 128; + nChannelsPerConnection_ = 64; comm_ = comm; // setup semaphores this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); diff --git a/src/ext/collectives/collective_utils.cc b/src/ext/collectives/collective_utils.cc index c3856a88e..5d038afae 100644 --- a/src/ext/collectives/collective_utils.cc +++ b/src/ext/collectives/collective_utils.cc @@ -6,12 +6,9 @@ #include #include #include -#include #include #include -#include "logger.hpp" - namespace mscclpp { namespace collective { std::vector setupRemoteMemories(std::shared_ptr comm, int rank, @@ -101,7 +98,8 @@ std::vector> setupNvlsConnections(std:: return nvlsConnections; } -std::vector setupNvlsChannels(std::vector> conns, +std::vector 
setupNvlsChannels(std::shared_ptr comm, + std::vector> conns, void* buffer, size_t bufferSize, int nSwitchChannels) { std::vector channels; @@ -110,6 +108,8 @@ std::vector setupNvlsChannels(std::vectorbindAllocatedMemory((CUdeviceptr)buffer, bufferSize); channels.push_back(switchChannel); } + // Synchronize to make sure all ranks have their NVLS channels set up before any rank starts using them. + comm->bootstrap()->barrier(); return channels; } diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 5662d1163..81b74add4 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -29,8 +29,6 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; uint32_t nSwitchChannels_; - int nRanksPerIpcDomain_ = 0; - int nBaseChannels_ = 0; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index f347c871f..8f02a8738 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -29,8 +29,6 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { void* scratchBuffer_; size_t scratchBufferSize_; uint32_t nSwitchChannels_; - int nRanksPerIpcDomain_ = 0; - int nBaseChannels_ = 0; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index 2e61b9379..95ce7f5a4 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ 
b/src/ext/collectives/include/collective_utils.hpp @@ -57,7 +57,8 @@ std::shared_ptr> setupMemoryChannelDeviceHandles( std::vector> setupNvlsConnections(std::shared_ptr comm, size_t size, int numConnections); -std::vector setupNvlsChannels(std::vector> conns, void* buffer, +std::vector setupNvlsChannels(std::shared_ptr comm, + std::vector> conns, void* buffer, size_t bufferSize, int nSwitchChannels); std::shared_ptr> setupNvlsChannelDeviceHandles( From ac44e98d962e6d629372b1ab5e61b6f45449a766 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 20 May 2026 20:33:27 +0000 Subject: [PATCH 41/44] update --- src/ext/collectives/include/collective_utils.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp index be18477a3..c2bcd87e3 100644 --- a/src/ext/collectives/include/collective_utils.hpp +++ b/src/ext/collectives/include/collective_utils.hpp @@ -80,6 +80,7 @@ class AlgorithmCtx { public: int rank; int worldSize; + int nRanksPerNode; int nRanksPerIpcDomain; std::vector registeredMemories; From 42ece408b993a4be71012b169eed5b28453db796 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 24 May 2026 05:56:11 +0000 Subject: [PATCH 42/44] Fix memory leak --- .../customized_comm_with_tuning.py | 4 ++-- .../allreduce/allreduce_fullmesh.cu | 24 ++++++++++++++----- .../allreduce/allreduce_rsag_zero_copy.cu | 2 -- .../include/allreduce/allreduce_fullmesh.hpp | 2 -- .../allreduce/allreduce_rsag_zero_copy.hpp | 2 -- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py index cb3661675..cf475cdfc 100644 --- a/examples/torch-integration/customized_comm_with_tuning.py +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -58,8 +58,8 @@ class CustomizedComm: """Exposes all_reduce, all_gather, barrier with lazy per-size tuning.""" 
_TUNE_N_WARMUP = 5 - _TUNE_N_GRAPH_LAUNCHES = 10 - _TUNE_N_OPS_PER_GRAPH = 100 + _TUNE_N_GRAPH_LAUNCHES = 5 + _TUNE_N_OPS_PER_GRAPH = 20 _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 56, 64, 128] _CANDIDATE_NTHREADS = [512, 768, 1024] _NBLOCKS_LIMIT = { diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index f547ab4fd..eb8726245 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -9,6 +9,17 @@ namespace mscclpp { namespace collective { +namespace { +// Per-context cache of input-side MemoryChannels keyed by input pointer. +// Lifetime is tied to AlgorithmCtx, so entries are released when the ctx is +// evicted from the framework's context cache (avoids unbounded growth across +// allreduce calls that pass different input buffers). +using InputChannelsCache = + std::unordered_map, std::shared_ptr>>>; +constexpr const char* kInputChannelsExtraKey = "inputChannels"; +} // namespace + template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, @@ -195,17 +206,17 @@ CommResult AllreduceFullmesh::allreduceKernelFunc( MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); channelOutOffset = (char*)output - (char*)recvBasePtr; } - std::shared_ptr> inputChannelHandles; - if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) { - inputChannelHandles = this->memoryChannelsMap_[input].second; - } else { + auto& inputChannelsCache = *static_cast(ctx->extras.at(kInputChannelsExtraKey).get()); + auto it = inputChannelsCache.find(input); + if (it == inputChannelsCache.end()) { RegisteredMemory localMemory = comm_->registerMemory(const_cast(input), inputSize, Transport::CudaIpc); std::vector channels = setupMemoryChannels(this->conns_, this->inputScratchSemaphores_, this->remoteScratchMemories_, localMemory, 
nChannelsPerConnection_); - this->memoryChannelsMap_[input] = std::make_pair(channels, setupMemoryChannelDeviceHandles(channels)); + auto handles = setupMemoryChannelDeviceHandles(channels); + it = inputChannelsCache.emplace(input, std::make_pair(std::move(channels), std::move(handles))).first; } - inputChannelHandles = this->memoryChannelsMap_[input].second; + std::shared_ptr> inputChannelHandles = it->second.second; AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { @@ -267,6 +278,7 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptrmemoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory, nChannelsPerConnection_); ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels); + ctx->extras.insert({kInputChannelsExtraKey, std::make_shared()}); return ctx; } diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index 877a722a3..e7ed0cabe 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -211,8 +211,6 @@ std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_pt // register input and output memories RegisteredMemory inputMemory = comm->registerMemory((void*)input, size, Transport::CudaIpc); RegisteredMemory outputMemory = comm->registerMemory(output, size, Transport::CudaIpc); - this->inputMemories_.push_back(inputMemory); - this->outputMemories_.push_back(outputMemory); auto remoteInputMemories = setupRemoteMemories(comm, ctx->rank, inputMemory); auto remoteOutputMemories = setupRemoteMemories(comm, ctx->rank, outputMemory); diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index a54352b3f..e0c63a3d3 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp 
+++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -30,8 +30,6 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { std::vector> inputScratchSemaphores_; std::vector remoteScratchMemories_; RegisteredMemory localScratchMemory_; - std::unordered_map, std::shared_ptr>>> - memoryChannelsMap_; bool symmetricMemory_ = false; }; } // namespace collective diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp index 05bf2ef3c..528d9708b 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp @@ -27,8 +27,6 @@ class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder { int nChannelsPerConnection_; std::vector conns_; std::vector> semaphores_; - std::vector inputMemories_; - std::vector outputMemories_; std::vector baseChannels_; std::shared_ptr> baseMemoryChannelHandles_; From 641420de6dd89f2cdd7645a253b8d65917edadd1 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 26 May 2026 22:05:22 +0000 Subject: [PATCH 43/44] increase nvls memory size to 64 GB --- .../collectives/include/allreduce/allreduce_nvls_zero_copy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index c40bd2cda..a28bcae37 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -28,7 +28,7 @@ class AllreduceNvls : public AlgorithmBuilder { // Large buffer size because cuMemMap requires offset=0 for multicast handles, so the entire // user allocation must be mapped. This only reserves virtual address space; no physical memory // is consumed beyond what is actually bound. 
- const size_t nvlsBufferSize_ = (1UL << 34); + const size_t nvlsBufferSize_ = (1UL << 36); uint32_t nSwitchChannels_; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; From ea73a1e1b7d666802705d69027afd201303e9574 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 26 May 2026 22:34:01 +0000 Subject: [PATCH 44/44] WIP --- include/mscclpp/gpu_utils.hpp | 3 ++- .../collectives/include/allreduce/allreduce_nvls_zero_copy.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index b079e0fd9..ed5f9f63b 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -342,7 +342,8 @@ class GpuBuffer { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_)); #if (CUDA_NVLS_API_AVAILABLE) if (isNvlsSupported()) { - size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED); + // TODO: pass granularity from the caller instead of using the minimum granularity. + size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_MINIMUM); bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T); memory_ = detail::gpuCallocPhysicalShared(nelems, gran); return; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index a28bcae37..c40bd2cda 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -28,7 +28,7 @@ class AllreduceNvls : public AlgorithmBuilder { // Large buffer size because cuMemMap requires offset=0 for multicast handles, so the entire // user allocation must be mapped. This only reserves virtual address space; no physical memory // is consumed beyond what is actually bound. 
- const size_t nvlsBufferSize_ = (1UL << 36); + const size_t nvlsBufferSize_ = (1UL << 34); uint32_t nSwitchChannels_; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_;