microsoft · Binyang2014 · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml
@@ -74,6 +74,14 @@ steps:
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
+- template: run-remote-task.yml  # PyBench: float8_e4m3b15 allreduce (float32 accumulation) and float16 symmetric-memory allreduce, both with --autotune
+  parameters:
+    name: PyBench
+    displayName: Run collective benchmarks
+    remoteScript: |
+      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
+      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float16 --symmetric-memory --autotune
+
 - template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}

diff --git a/.azure-pipelines/templates/rccl-test.yml b/.azure-pipelines/templates/rccl-test.yml
@@ -57,6 +57,14 @@ steps:
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
       mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
+- template: run-remote-task.yml  # PyBench: float8_e4m3b15 allreduce (float32 accumulation) and float8_e4m3b15 allgather, both with --autotune
+  parameters:
+    name: PyBench
+    displayName: Run collective benchmarks
+    remoteScript: |
+      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
+      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allgather --dtype float8_e4m3b15 --autotune
+
 - template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}

diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -110,12 +110,12 @@ $ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm6]"
 ```
 
 > **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, or `rocm6`) is required to install CuPy.
-> The CUDA extras install pre-built CuPy wheels. The `rocm6` extra installs CuPy from source,
-> which requires ROCm and may take longer. Running `pip install .` without an extra will not install CuPy.
+> The CUDA extras install pre-built CuPy wheels and CUDA Python bindings. The `rocm6` extra builds CuPy from source and installs
+> HIP Python 6.x; both require ROCm, and the build may take longer. Running `pip install .` without an extra installs neither CuPy nor these bindings.
 
 Optional extras can be installed by specifying them in brackets. Available extras:
-- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package for your CUDA version.
-- **`rocm6`**: Install CuPy from source for AMD ROCm platforms.
+- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package and CUDA Python bindings for your CUDA version.
+- **`rocm6`**: Build CuPy from source and install HIP Python 6.x for AMD ROCm platforms.
 - **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib).
 - **`test`**: Install test dependencies (pytest, mpi4py, netifaces).
 
@@ -209,17 +209,45 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0
 
 ## Performance Benchmark
 
-### Python Benchmark
+### Python Benchmark and Tuning
 
-[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
+[Install the MSCCL++ Python package](#install-from-source-python-module) and run the Python collective benchmark as follows. It requires MPI on the system.
 
 ```bash
 # Install with benchmark dependencies and the appropriate CUDA/ROCm extras.
 # Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6.
 $ python3 -m pip install ".[cuda12,benchmark,test]"
-$ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
+
+# Run a benchmark.
+$ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
+    python3 ./python/mscclpp_benchmark/bench_collective.py \
+    --collective allreduce --dtype float16
+```
+
+To autotune launch parameters and save a tuned config:
+
+```bash
+$ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
+    python3 -m mscclpp_benchmark.tuner \
+    --collective allreduce \
+    --dim 5120 \
+    --dtype float16 \
+    --scale 8 \
+    --batch-sizes 1,2,4,8 \
+    --output /tmp/mscclpp_tuned_configs.json
 ```
 
+Use the tuned config in a benchmark:
+
+```bash
+$ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
+    python3 ./python/mscclpp_benchmark/bench_collective.py \
+    --collective allreduce \
+    --dtype float16 \
+    --config-path /tmp/mscclpp_tuned_configs.json
+```
+
+
 (nccl-benchmark)=
 ### NCCL/RCCL Benchmark over MSCCL++
 
@@ -291,4 +319,3 @@ Version: 0.8.0.post1.dev0+gc632fee37.d20251007
 mscclpp.version
 {'version': '0.8.0.post1.dev0+gc632fee37.d20251007', 'git_commit': 'g50382c567'}
 ```
-
diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp
@@ -1268,8 +1268,8 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f32x4>(const f32x4& v) {
   return to<f8_e4m3b15x4, f16x4>(h);
 #elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
   f16x4 h;
-  h.words[0] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[0], v.data[1]));
-  h.words[1] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[2], v.data[3]));
+  h.words[0] = __builtin_bit_cast(uint32_t, __floats2half2_rn(v.data[0], v.data[1]));
+  h.words[1] = __builtin_bit_cast(uint32_t, __floats2half2_rn(v.data[2], v.data[3]));
   return to<f8_e4m3b15x4, f16x4>(h);
 #else
   f8_e4m3b15x4 result;

diff --git a/pyproject.toml b/pyproject.toml
@@ -21,10 +21,22 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-cuda11 = ["cupy-cuda11x"]
-cuda12 = ["cupy-cuda12x"]
-cuda13 = ["cupy-cuda13x"]
-rocm6 = ["cupy"]
+cuda11 = [
+    "cupy-cuda11x",
+    "cuda-bindings>=11.8,<12",
+]
+cuda12 = [
+    "cupy-cuda12x",
+    "cuda-bindings>=12,<13",
+]
+cuda13 = [
+    "cupy-cuda13x",
+    "cuda-bindings>=13,<14",
+]
+rocm6 = [
+    "cupy",
+    "hip-python>=6,<7",
+]
 benchmark = [
     "mpi4py",
     "prettytable",

diff --git a/python/mscclpp_benchmark/__init__.py b/python/mscclpp_benchmark/__init__.py
@@ -1,4 +1,18 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5
+__all__ = [
+    "MscclppAllReduce1",
+    "MscclppAllReduce2",
+    "MscclppAllReduce3",
+    "MscclppAllReduce4",
+    "MscclppAllReduce5",
+]
+
+
+def __getattr__(name):
+    if name in __all__:
+        from . import mscclpp_op
+
+        return getattr(mscclpp_op, name)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")