9 changes: 7 additions & 2 deletions nemo_rl/distributed/worker_groups.py
@@ -497,8 +497,13 @@ def _create_workers_from_bundle_indices(
"AVAILABLE_PORT_LIST": str(available_ports),
}
)
# Remove Ray-specific environment variables, let the worker itself set them.
worker_env_vars.pop("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", None)
# Preserve RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1 to prevent Ray
# from masking CUDA_VISIBLE_DEVICES per actor. GPU masking triggers NCCL
# bugs on NVSwitch topologies (H200/P5en, H100/P5) including cuMem import
# penalty (nccl#1749) and NVLS rank ordering corruption (nccl#1906).
# Workers use explicit torch.cuda.set_device(local_rank) instead.
Collaborator
This comment is inaccurate — no production nemo_rl code calls torch.cuda.set_device(local_rank). Device binding happens implicitly via init_device_mesh / Megatron internals reading the LOCAL_RANK env var.

Suggested change:
- # Workers use explicit torch.cuda.set_device(local_rank) instead.
+ # Workers rely on LOCAL_RANK env var for device selection via
+ # init_device_mesh / Megatron internals.

+ # See: https://github.com/NVIDIA-NeMo/RL/issues/1963
+ worker_env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1"
Collaborator
Critical: DTensor V1 LOCAL_RANK=0 regression

This unconditional = "1" makes all GPUs visible to every worker. However, dtensor_policy_worker.py:320-329 has a LOCAL_RANK=0 hack that assumes only 1 GPU is visible:

# torch==2.8 uses LOCAL_RANK to set the device here
# but CUDA_VISIBLE_DEVICES is set to only 1 gpu, so we need to temporarily set LOCAL_RANK to 0.
prev_local_rank = os.environ["LOCAL_RANK"]
os.environ["LOCAL_RANK"] = "0"
device_mesh = torch.distributed.device_mesh.init_device_mesh(...)

With all GPUs visible, init_device_mesh calls set_device(0) for every worker, so all workers contend for GPU 0 and OOM. DTensor V1 is the default (lm_policy.py:113: _v2=False). DTensor V2 is unaffected (it uses FSDP2Manager, which reads the real LOCAL_RANK).

Fix: Remove the LOCAL_RANK=0 hack (lines 323-324). With all GPUs visible, init_device_mesh should use the real LOCAL_RANK=bundle_idx.
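The regression can be illustrated with a torch-free sketch. Here `select_device` is a hypothetical stand-in for the LOCAL_RANK-based device choice inside `init_device_mesh` (assumed behavior, not nemo_rl code):

```python
import os

def select_device(visible_gpus: int) -> int:
    """Mimic the device choice: torch>=2.8 reads LOCAL_RANK (hypothetical)."""
    rank = int(os.environ.get("LOCAL_RANK", "0"))
    assert rank < visible_gpus, "LOCAL_RANK must index a visible GPU"
    return rank

# Old setup: Ray masked CUDA_VISIBLE_DEVICES to one GPU per worker, so the
# hack pinned LOCAL_RANK=0 and the sole visible GPU was always index 0.
os.environ["LOCAL_RANK"] = "0"
assert select_device(visible_gpus=1) == 0

# With the hack left in place but all 8 GPUs visible, every worker would
# still compute device 0 and collide there.
assert select_device(visible_gpus=8) == 0

# Proposed fix: with all GPUs visible, use the real LOCAL_RANK (bundle_idx).
os.environ["LOCAL_RANK"] = "3"
assert select_device(visible_gpus=8) == 3
```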

worker_env_vars.pop("RAY_CLIENT_MODE", None)
worker_env_vars.pop("RAY_JOB_ID", None)
worker_env_vars.pop("RAY_LD_PRELOAD", None)