diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml new file mode 100644 index 000000000..60c5d8d06 --- /dev/null +++ b/.github/workflows/collectivex-experimental.yml @@ -0,0 +1,249 @@ +name: CollectiveX Experimental + +# Orchestration only — all benchmark logic lives in experimental/CollectiveX/. +# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no +# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane +# for GB200/B200 NCCL, DeepEP, and larger sweeps). Each job lands on the SKU's +# self-hosted runner and invokes that SKU's launch script — the same +# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use. + +on: + push: + branches: + - collectivex + paths: + - 'experimental/CollectiveX/**' + - '.github/workflows/collectivex-experimental.yml' + workflow_dispatch: + inputs: + sku: + # Only SKUs with a matching launchers/launch_.sh are offered — + # runner.name's prefix selects the script, so an SKU without one fails. + description: Self-hosted runner pool (must have a CollectiveX launcher) + type: choice + default: gb200 + options: [gb200, b200-dgxc, b200-multinode, mi355x, h100-dgxc, h200, b300, gb300] + benchmark: + # mori runs only on mi355x; nccl/deepep/all on the NVIDIA SKUs. + description: Which benchmark to run + type: choice + default: nccl + options: [nccl, deepep, mori, all] + ops: + description: NCCL ops (space-separated); blank = default set + type: string + default: '' + min_bytes: + description: nccl-tests min message size + type: string + default: '8' + max_bytes: + description: nccl-tests max message size + type: string + default: '8G' + ngpus: + description: GPUs per node (blank = SKU default) + type: string + default: '' + nodes: + description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node. + type: string + default: '' + phase: + # EP only. 'both' fans out to one job per phase (decode + prefill). 
+ description: EP phase — decode (small T) / prefill (large T); 'both' = a job each + type: choice + default: both + options: [both, decode, prefill] + tokens_ladder: + description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default + type: string + default: '' + dispatch_dtype: + description: EP dispatch payload precision + type: choice + default: bf16 + options: [bf16, fp8] + mode: + # normal = high-throughput kernels (decode+prefill); ll = DeepEP low-latency + # (decode-shaped, fp8 cast in-kernel). LL is rejected on backends without it + # (MoRI) and aborts on fabrics that lack it (B300) — run only where supported. + description: EP kernel path — normal or low-latency (LL) + type: choice + default: normal + options: [normal, ll] + resource_mode: + # normalized = ~sm_fraction of device units (cross-vendor apples-to-apples); + # tuned = each backend's own recommended/default launch config. + description: Comm resource regime + type: choice + default: normalized + options: [normalized, tuned, default] + contract: + # layout-and-dispatch-v1 = dispatch timing includes routing-layout gen (the only + # contract MoRI honors; use for cross-vendor). cached-layout-comm-only-v1 = layout + # hoisted out, pure-comm dispatch (DeepEP normal only). + description: Measurement contract (timing boundary) + type: choice + default: layout-and-dispatch-v1 + options: [layout-and-dispatch-v1, cached-layout-comm-only-v1] + routing: + # Routing distribution of the shared trace. uniform=realistic; balanced=load-equalized; + # zipf*=skewed; hotspot-single=one hot expert. The skew + EPLB sweep lives here. + description: EP routing distribution + type: choice + default: uniform + options: [uniform, balanced, zipf, zipf-mild, zipf-moderate, zipf-heavy, hotspot-single] + eplb: + # EPLB = replicate hot experts + balanced-place (the remedy for skewed routing). A pure + # routing-trace transform; experts -> num_logical+redundant. Meaningful with zipf*. 
+ description: Apply EPLB expert replication/placement + type: boolean + default: false + +concurrency: + # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do + # not cancel each other; push has no sku input -> shares one 'push' group. + # cancel-in-progress FALSE: same-SKU dispatches QUEUE (serialize) rather than + # cancel — required so a 3-run reproducibility sweep on one SKU actually runs all + # three (with `true` the later dispatches silently cancelled the earlier ones). + group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }} + cancel-in-progress: false + +permissions: + contents: read + +jobs: + # Push -> MI355X MoRI dispatch/combine. Lands on a free mi355x-amds runner and + # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute- + # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs. + experimental: + name: CollectiveX Experimental (${{ matrix.phase }}) + if: github.event_name == 'push' + runs-on: mi355x + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + # Push = a fast MoRI SMOKE only (decode). The full sweep is workflow_dispatch. + phase: [decode] + env: + CX_BENCH: mori + CX_PHASE: ${{ matrix.phase }} + # SMOKE ladder capped at T<=16: MoRI + realistic (fan-out≈5.3) routing currently + # WEDGES at T>=32 (under investigation; DeepEP is fine), and an unguarded run hung + # ~1 h before the job timeout. Keep the push smoke in the known-good range; run the + # full sweep via workflow_dispatch (timeout-guarded). Remove the cap once fixed. + CX_TOKENS_LADDER: "1 2 4 8 16" + CX_RUN_TIMEOUT: "600" + # Pin to the MI355X nodes that hold the node-local squash and have a writable + # /var/lib/squash; other nodes need a slow cold import that can fail on lock/ + # cache permissions. Widen once the squash is staged cluster-wide. 
+ CX_NODELIST: mia1-p01-g10,mia1-p01-g15 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + - name: Launch MI355X MoRI (${{ matrix.phase }}) + env: + RUNNER_NAME: ${{ runner.name }} + run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Results summary + if: always() + run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" + - name: Upload results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }} + path: experimental/CollectiveX/results/*.json + if-no-files-found: warn + + # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner. + dispatch: + if: github.event_name == 'workflow_dispatch' + runs-on: ${{ inputs.sku }} + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + # nccl/rccl are collective primitives — phase is meaningless, so run ONE job (not + # the same work twice). EP backends: 'both' -> decode + prefill; else a single job. 
+ phase: ${{ fromJSON((inputs.benchmark == 'nccl' || inputs.benchmark == 'rccl') && '["na"]' || (inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase))) }} + env: + CX_BENCH: ${{ inputs.benchmark }} + CX_OPS: ${{ inputs.ops }} + CX_MIN_BYTES: ${{ inputs.min_bytes }} + CX_MAX_BYTES: ${{ inputs.max_bytes }} + CX_NGPUS: ${{ inputs.ngpus }} + CX_NODES: ${{ inputs.nodes }} + CX_PHASE: ${{ matrix.phase }} + CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }} + CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }} + CX_MODE: ${{ inputs.mode }} + CX_RESOURCE_MODE: ${{ inputs.resource_mode }} + CX_MEASUREMENT_CONTRACT: ${{ inputs.contract }} + CX_ROUTING: ${{ inputs.routing }} + CX_EPLB: ${{ inputs.eplb && '1' || '' }} + # GHA run provenance: run_ep records git_run (repo/run/attempt/ref/sha/job) -> a GHA result + # is provenance_complete (publication_status >= comparable-experimental, official w/ canonical). + COLLECTIVEX_SOURCE_SHA: ${{ github.sha }} + COLLECTIVEX_ARTIFACT_NAME: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }} + # GB200/watchtower needs a compute-visible workspace; harmless elsewhere. + CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} + # MI355X: pin to the warm-squash, writable nodes (see the push job). + CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + # Reject an unsupported backend/SKU/mode/dtype/contract BEFORE consuming the runner + # (review #3): fail fast on the login node, not after a salloc. 'all' fans out per + # vendor in-container, so skip the single-combo check for it. 
+ - name: Validate capability + if: inputs.benchmark != 'all' + run: | + python3 experimental/CollectiveX/tests/capability.py \ + --sku "${{ inputs.sku }}" --backend "${{ inputs.benchmark }}" \ + --mode "${{ inputs.mode }}" --dtype "${{ inputs.dispatch_dtype }}" \ + --contract "${{ inputs.contract }}" + - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }}) + env: + RUNNER_NAME: ${{ runner.name }} + run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Results summary + if: always() + run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" + - name: Upload results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }} + path: experimental/CollectiveX/results/*.json + if-no-files-found: warn + + update-frontend-snapshot: + name: Update InferenceX-app snapshot + needs: [experimental, dispatch] + if: >- + always() && + ( + (github.event_name == 'push' && needs.experimental.result == 'success') || + (github.event_name == 'workflow_dispatch' && needs.dispatch.result == 'success') + ) + runs-on: ubuntu-latest + steps: + - name: Trigger CollectiveX snapshot update + env: + FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }} + run: | + set -euo pipefail + curl -sSf -X POST \ + -H "Authorization: Bearer $FRONTEND_PAT" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \ + -d '{ + "event_type": "update-collectivex-data", + "client_payload": { + "source_run_id": "${{ github.run_id }}" + } + }' diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore new file mode 100644 index 000000000..a4717f5ff --- /dev/null +++ b/experimental/CollectiveX/.gitignore @@ -0,0 
+1,14 @@ +# in-container nccl-tests build cache +.nccl-tests/ +# python +__pycache__/ +*.pyc +# generated run artifacts: captured env embeds hostnames / GPU UUIDs / NIC GUIDs, +# so keep results out of git (CI uploads them as workflow artifacts instead). +# Sanitized headline numbers live in CONTAINERS.md. +results/*.json +results/plots/ +results/raw_*.txt +results/raw_*.txt.stderr +# running local-only reflection log (not a committed artifact) +notes.md diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md new file mode 100644 index 000000000..6b409bac0 --- /dev/null +++ b/experimental/CollectiveX/CONTAINERS.md @@ -0,0 +1,75 @@ +# CollectiveX — container & library versions + +One **multi-arch, digest-pinned** container is used for all NVIDIA SKUs, so B200 +(x86_64) and GB200 (aarch64) share a single reference and the cross-vendor +comparison is truly same-image. Set in `launchers/common.sh` (`cx_default_image`). + +## Default container (all NVIDIA SKUs) + +- **Image:** import by tag **`lmsysorg/sglang:v0.5.11-cu130`** (multi-arch OCI index). Expected index digest, recorded for provenance/verification: `sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975`. +- **Multi-arch manifest list:** linux/amd64 + linux/arm64; `enroot import` on each host pulls the matching arch. +- **Import by TAG, not digest.** enroot builds its anonymous Docker Hub token scope from the *tag* and succeeds (no creds needed — same as the serving launchers). A bare `repo@sha256:` ref makes enroot prompt for a password and **hang** in non-interactive CI; a combined `tag@sha256:` ref 400s. `cx_ensure_squash` therefore imports by tag with `_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged). 
+- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `tests/ep_mori.py`: + - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here. + - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now. + - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `tests/ep_mori.py`'s `finalize()` hard-exits after writing results to avoid it. + + Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image. + +## Cluster access / QOS + +- **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account. +- **GB200** (`watchtower`): account `benchmark`, qos `normal`, partition `batch` (`AllowQos=ALL`); idle capacity available. Runner workspace is **not** compute-visible → set `CX_STAGE_DIR` to a Lustre path (the launcher rsyncs there). 
+ +## First real results (Milestone-0 spike, on the DeepSeek-V4 images) + +nccl-tests (system NCCL 2.28.3), all correctness-passed, peak bus-bw: + +| op | B200 8× (NVLink island, x86_64) | GB200 4× (NVL72 MNNVL, aarch64) | +|---|---|---| +| all_reduce | 835 GB/s | 689 GB/s | +| all_gather | 653 | 658 | +| reduce_scatter | 667 | 661 | +| alltoall | 638 | 666 | + +(B200 vs GB200 carry distinct `comparison_key`s by topology-class, so they are labelled-distinct, not silently merged. Re-run on the multi-arch default to refresh under one image.) diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md new file mode 100644 index 000000000..a7c479b86 --- /dev/null +++ b/experimental/CollectiveX/README.md @@ -0,0 +1,128 @@ +# CollectiveX + +Cross-vendor collective / EP-library benchmark (see `plan.md`). Per-SKU **launch +adapters** (InferenceX-style `launch_.sh`) run **any benchmark** — selected +by `CX_BENCH` — through a shared in-container runner, and a GitHub Actions +workflow triggers runs on `push` (no merge to main needed). Milestone-0 headline +already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL). + +> Experimental: WIP, not an official InferenceMAX result. All logic stays under +> `experimental/CollectiveX/`; the only file outside is the orchestration-only +> workflow. 
+ +## Files + +| File | Role | +|---|---| +| `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) | +| `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) | +| `tests/run_ep.py` | EP dispatch/combine entrypoint (torchrun): source-tokens-per-rank sweep, dispatch & combine timed **separately** | +| `tests/ep_harness.py` | shared EP harness: token ladder, separated timing, correctness gate, doc emission (stdlib top) | +| `tests/ep_deepep.py`, `tests/ep_mori.py` | per-backend adapters (DeepEP / MoRI) implementing the harness protocol | +| `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) | +| `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build | +| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) over `CX_PHASE` | +| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) | +| `CONTAINERS.md` | the pinned multi-arch container + audited library versions | +| `results/` | flat JSON artifacts (+ `plots/`, raw captures) | +| `tests/fixtures/` | captured nccl-tests output for offline parser checks | + +## Run + +### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`) + +- **push** to the `collectivex` branch touching `experimental/CollectiveX/**` (or the + workflow file itself) → a fast **MI355X MoRI** EP dispatch/combine **smoke** run: a + single `decode` matrix job with the token ladder capped at T≤16 (lands on a free + `mi355x-amds` runner). The full decode + prefill sweep runs via `workflow_dispatch` only. +- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode / + mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only; `nccl` + on MI355X runs rccl-tests), `phase` (decode / prefill / **both** → a job each), + `tokens_ladder`, `dispatch_dtype`, ops, sizes, ngpus. 
Lands on that SKU's + self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. For EP results + across all SKUs, dispatch once per `sku` with `phase=both`. + +Each job renders a results table to the **GitHub Actions job summary** (via +`summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs +as an artifact. (The workflow only fires once the branch is pushed to GitHub.) + +### Directly on a cluster login node + +```bash +# benchmark is selected by CX_BENCH (default nccl) +bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, NCCL primitives +CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild) +bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh # B200 8× NVLink +bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh # B200 2-node, cross-IB +bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X 8× XGMI, MoRI EP (CX_BENCH=mori, default) +CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X primitives via rccl-tests +``` + +Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`, +`CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible +staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate +nothing). EP (deepep/mori) adds `CX_PHASE` (decode|prefill|both), `CX_TOKENS_LADDER` +(e.g. `"1 2 4 8 16 32 64 128"`), `CX_HIDDEN`/`CX_TOPK`/`CX_EXPERTS`, +`CX_DISPATCH_DTYPE`, `CX_NUM_EP_GROUPS`. Results land in `experimental/CollectiveX/results/`. 
+ +### Offline (no GPU) — verify the parser/JSON pipeline + +```bash +python3 run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \ + --world-size 8 --nodes 1 --runner b200-dgxc --topology-class b200-nvlink-island --out /tmp/parsed.json +python3 env_capture.py # prints a (degraded, off-GPU) env record +python3 plot.py --results-dir results --out-dir results/plots # needs matplotlib +``` + +## Container + +One **multi-arch** image for all NVIDIA SKUs, imported by tag +`lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` +recorded for provenance). Imported by tag, not digest — enroot's anonymous +Docker Hub auth needs a tag, and a bare digest ref hangs in CI. See +`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the bundled-DeepEP +DeepSeek-V4 fallback images. + +## How it runs (confirmed against the live clusters) + +- Adapters mirror `runners/launch_*.sh`: `salloc` → enroot squash (import only if + missing) → `srun --container-image=… --container-mounts=:/ix` → in-container + `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account + `benchmark`. +- **AMD MI355X** (`launch_mi355x-amds.sh`, MoRI / `CX_BENCH=mori`) diverges: partition + `compute`, no account, pyxis `--container-writable --container-remap-root`, and a + **node-local** squash (`/var/lib/squash`) imported via `srun` on the allocated node + (not the login node). Workspace is bind-mounted directly (no `CX_STAGE_DIR`). +- Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in + `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node + adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`). +- The sglang image installs editable under `/workspace`, so the repo is mounted at + **`/ix`**. GB200 compute nodes don't see the runner workspace → `CX_STAGE_DIR` + rsyncs the tree to Lustre first. 
+- Every result embeds an `env_capture` record and a `comparison_key`; topology + class is part of the key, so B200(IB/NVLink) and GB200(MNNVL) stay labelled + distinct, never silently overlaid. + +## Status & known risks + +- **Spike done on real hardware** (both SKUs, 4 NCCL primitives, correctness-passed) + — on the DeepSeek-V4 images. Now standardizing on the **multi-arch** default; + validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9). +- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds + it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive; + `tests/ep_deepep.py` follows the documented normal-mode API — validate against + the built commit. B200 (x86_64) first; GB200 (aarch64) follows. +- **MoRI / MI355X** (`tests/ep_mori.py` + `launch_mi355x-amds.sh`) is **validated on + hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip). + It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer` + zero-copy path, `expected = input × #unique-destination-ranks`). Three + ionic_rdma-fabric constraints are baked in (see `CONTAINERS.md`): a 2 GiB heap + (the NICs cap RDMA MRs at ~4 GiB), a bounded `max_num_inp_token_per_rank`, and a + hard-exit past MoRI's buggy shmem teardown. The ROCm image isn't digest-pinned yet. +- **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a + compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container + or srt-slurm. CX_BENCH=nccl only for now. +- **B200 QOS:** account `benchmark` has only `gpu-2_qos` (the serving-sweep + partition); idle `gpu-1` needs a QOS grant. GB200 `batch` is open. + +Once the multi-arch image is validated end-to-end, freeze the schema from the +artifacts (plan: "Freeze the contract"). 
# diff --git a/experimental/CollectiveX/analyze_ep.py b/experimental/CollectiveX/analyze_ep.py new file mode 100644 index 000000000..018d74a93 --- /dev/null +++ b/experimental/CollectiveX/analyze_ep.py @@ -0,0 +1,257 @@
#!/usr/bin/env python3
"""CollectiveX operating-envelope analysis (goal Part 2 'operating-envelope outputs' + Part 3
'regression/decision outputs'). Post-processes result JSONs (v3 flat or v4 nested) into the
decision-facing summaries, comparing ONLY matching (workload, topology, contract, backend,
resource) cells:

  routing-skew penalty     zipf* vs matched uniform — p50/p99 dispatch amplification
  LL-to-normal crossover   token count where normal becomes faster than LL (p50 and p99)
  topology penalty         EP4 vs EP8 (and placement, when present) latency penalty
  strong/weak scaling      fixed-global-tokens and fixed-tokens/rank efficiency across EP
  resource marginal eff.   Δlatency per Δcomm-fraction (needs a resource ladder; reports n/a otherwise)
  pareto + recommendations lowest-latency / lowest-resource configs per (sku, phase)

Pure stdlib; reads the same JSONs the plotter does. Honest about missing cells (prints n/a with
the reason) rather than inventing comparisons.

NOTE: as implemented below, cells with missing data are simply omitted from the report (nothing
prints "n/a"), and no function in this module computes the resource-marginal-efficiency or pareto
outputs listed above; only the lowest-p99 recommendation is produced.

    python3 analyze_ep.py --results-dir results --out analysis.json
"""
from __future__ import annotations

import argparse
import glob
import json
import os
from collections import defaultdict

# Measurement contract that the cross-mode / cross-EP comparisons are restricted to.
_BASELINE_CONTRACT = "layout-and-dispatch-v1"


def _p(r, op, pct):
    """Return one percentile of one op from a result row, or None when absent.

    Supports both row layouts: v4 nested (``row[op][pct]``) and v3 flat
    (``row[f"{op}_us_{pct}"]``).
    """
    if isinstance(r.get(op), dict):
        return r[op].get(pct)
    return r.get(f"{op}_us_{pct}")


def load(results_dir):
    """Load every MoE result JSON under *results_dir* (recursively) as a list of series dicts.

    Files whose basename starts with ``env_`` are skipped, as are files that cannot be read or
    parsed, files whose top level is not a JSON object, and documents whose ``family`` is not
    ``"moe"`` or that carry no rows. Each returned series holds the comparison fields (sku, ep,
    phase, mode, dtype, contract, routing, topo, resource) and ``rows`` keyed by
    ``tokens_per_rank``.
    """
    series = []
    pattern = os.path.join(results_dir, "**", "*.json")
    for path in sorted(glob.glob(pattern, recursive=True)):
        if os.path.basename(path).startswith("env_"):
            continue
        try:
            # Context manager so the handle is closed even when parsing fails.
            with open(path, encoding="utf-8") as fh:
                d = json.load(fh)
        except (OSError, ValueError):  # ValueError covers JSONDecodeError and bad encodings
            continue
        if not isinstance(d, dict):
            continue  # e.g. a JSON list: not a result document
        if d.get("family") != "moe" or not d.get("rows"):
            continue
        sh = d.get("shape") or {}
        eplb_on = (d.get("eplb") or {}).get("enabled")
        series.append({
            # sku = runner name up to the first "_" and then the first "-"
            "sku": (d.get("runner") or "?").split("_")[0].split("-")[0],
            "ep": d.get("ep_size"), "phase": d.get("phase"), "mode": d.get("mode", "normal"),
            "dtype": sh.get("dispatch_dtype"), "contract": d.get("measurement_contract"),
            "routing": (sh.get("routing") or "?") + ("+eplb" if eplb_on else ""),
            "topo": d.get("topology_class"), "resource": d.get("resource_mode", "tuned"),
            # Rows without a tokens_per_rank cannot be matched to anything; drop them.
            "rows": {r["tokens_per_rank"]: r for r in d["rows"]
                     if isinstance(r, dict) and "tokens_per_rank" in r},
        })
    return series


def _key(s, *fields):
    """Tuple of the named fields of series *s*, used as a matching key."""
    return tuple(s[f] for f in fields)


def _baseline_by_ep(series):
    """Group uniform-routing, normal-mode, baseline-contract series by (sku, phase, dtype) -> {ep: series}.

    Only groups that contain at least two distinct EP sizes are returned, since every caller
    compares across EP sizes. When several series share a group and EP size, the last one wins.
    """
    by = defaultdict(dict)
    for s in series:
        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == _BASELINE_CONTRACT:
            by[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s
    return {k: eps for k, eps in by.items() if len(eps) >= 2}


def skew_penalty(series):
    """zipf* vs matched uniform: dispatch p50/p99 amplification at shared T.

    A cell is emitted only when all four percentiles are available (and the uniform ones are
    non-zero); incomplete cells are skipped rather than raising.
    """
    # NOTE(review): the match key omits "topo" and "resource" although the module docstring
    # promises topology/resource-matched comparisons — confirm whether that is intended.
    fields = ("sku", "ep", "phase", "mode", "dtype", "contract")
    base = {_key(s, *fields): s for s in series if s["routing"] == "uniform"}
    out = []
    for s in series:
        if not s["routing"].startswith("zipf"):
            continue
        b = base.get(_key(s, *fields))
        if not b:
            continue
        for T in sorted(set(s["rows"]) & set(b["rows"])):
            zp, up = _p(s["rows"][T], "dispatch", "p50"), _p(b["rows"][T], "dispatch", "p50")
            zq, uq = _p(s["rows"][T], "dispatch", "p99"), _p(b["rows"][T], "dispatch", "p99")
            if not (up and uq) or zp is None or zq is None:
                continue  # missing numerator would otherwise be None / float -> TypeError
            out.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"], "routing": s["routing"],
                        "T": T, "p50_amplification": round(zp / up, 3),
                        "p99_amplification": round(zq / uq, 3)})
    return out


def ll_crossover(series):
    """Token count where normal dispatch p50/p99 drops below LL (per sku, ep, dtype).

    Emits one entry per LL series and statistic; ``normal_faster_at_T`` is the smallest shared T
    at which normal is faster, or ``"never-in-range"``.
    """
    # NOTE(review): phase is not part of the match key, so if several normal series share
    # (sku, ep, dtype) the last one loaded is used — confirm that is acceptable.
    out = []
    norm = {_key(s, "sku", "ep", "dtype"): s for s in series
            if s["mode"] == "normal" and s["routing"] == "uniform" and s["contract"] == _BASELINE_CONTRACT}
    for s in series:
        if s["mode"] != "ll" or s["routing"] != "uniform":
            continue
        n = norm.get(_key(s, "sku", "ep", "dtype"))
        if not n:
            continue
        shared = sorted(set(s["rows"]) & set(n["rows"]))
        for stat in ("p50", "p99"):
            cross = None
            for T in shared:
                ll, nm = _p(s["rows"][T], "dispatch", stat), _p(n["rows"][T], "dispatch", stat)
                if ll and nm and nm < ll:
                    cross = T
                    break
            out.append({"sku": s["sku"], "ep": s["ep"], "dtype": s["dtype"], "stat": stat,
                        "normal_faster_at_T": cross if cross is not None else "never-in-range"})
    return out


def topology_penalty(series):
    """Smallest-EP vs largest-EP dispatch p50 at matched tokens/rank for the same sku (a scaling/topology cost)."""
    out = []
    for k, eps in _baseline_by_ep(series).items():
        lo, hi = min(eps), max(eps)
        sl, sh = eps[lo], eps[hi]
        for T in sorted(set(sl["rows"]) & set(sh["rows"])):
            a, b = _p(sl["rows"][T], "dispatch", "p50"), _p(sh["rows"][T], "dispatch", "p50")
            if a and b:
                out.append({"sku": k[0], "phase": k[1], "dtype": k[2], "T": T,
                            f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
                            "penalty_pct": round(100 * (b - a) / a, 1)})
    return out


def scaling(series):
    """strong: fixed GLOBAL tokens, vary EP -> latency. weak: fixed tokens/RANK, vary EP.

    Both lists carry the same raw points (one per EP size and tokens/rank with a dispatch p50);
    they differ only in field order, leaving the grouping to the consumer.
    """
    out = {"strong": [], "weak": []}
    for k, eps in _baseline_by_ep(series).items():
        for ep, s in eps.items():
            for T, r in s["rows"].items():
                d50 = _p(r, "dispatch", "p50")
                if d50:
                    out["weak"].append({"sku": k[0], "phase": k[1], "ep": ep, "tokens_per_rank": T,
                                        "global_tokens": T * ep, "dispatch_p50": round(d50, 1)})
                    out["strong"].append({"sku": k[0], "phase": k[1], "ep": ep, "global_tokens": T * ep,
                                          "tokens_per_rank": T, "dispatch_p50": round(d50, 1)})
    return out


def scaling_efficiency(series):
    """From the smallest and largest EP (same sku/phase): weak = fixed tokens/rank (ideal: flat
    latency); strong = fixed GLOBAL tokens (ideal: latency falls ~1/EP). Efficiency =
    ideal/observed (1.0 = ideal)."""
    out = {"weak": [], "strong": []}
    for k, eps in _baseline_by_ep(series).items():
        lo, hi = min(eps), max(eps)
        lo_rows, hi_rows = eps[lo]["rows"], eps[hi]["rows"]
        # weak: same tokens/rank T on both EP -> latency should stay flat
        for T in sorted(set(lo_rows) & set(hi_rows)):
            a, b = _p(lo_rows[T], "dispatch", "p50"), _p(hi_rows[T], "dispatch", "p50")
            if a and b:
                out["weak"].append({"sku": k[0], "phase": k[1], "tokens_per_rank": T,
                                    f"ep{lo}": round(a, 1), f"ep{hi}": round(b, 1),
                                    "weak_efficiency": round(a / b, 3)})  # >1 = larger EP faster (super-ideal)
        # strong: same GLOBAL tokens -> EP_hi has fewer tokens/rank; ideal latency ~ a*(lo/hi)
        for Tlo in lo_rows:
            gt = Tlo * lo
            if gt % hi:
                # The global count does not split evenly over the larger EP, so there is no
                # cell with exactly the same global tokens; flooring would compare unequal work.
                continue
            Thi = gt // hi
            if Thi in hi_rows:
                a, b = _p(lo_rows[Tlo], "dispatch", "p50"), _p(hi_rows[Thi], "dispatch", "p50")
                if a and b:
                    ideal = a * (lo / hi)
                    out["strong"].append({"sku": k[0], "phase": k[1], "global_tokens": gt,
                                          f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
                                          "strong_efficiency": round(ideal / b, 3)})
    return out


def regressions(series, baseline_series, thresh=0.10):
    """Flag latency regressions vs a baseline, comparing ONLY matching (sku,ep,phase,mode,dtype,
    contract,routing) cells at shared T. Regression = current p50/p99 > baseline*(1+thresh)."""
    fields = ("sku", "ep", "phase", "mode", "dtype", "contract", "routing")
    bkey = {_key(b, *fields): b for b in baseline_series}
    out = []
    for s in series:
        b = bkey.get(_key(s, *fields))
        if not b:
            continue
        for T in sorted(set(s["rows"]) & set(b["rows"])):
            for op in ("dispatch", "combine", "roundtrip"):
                for stat in ("p50", "p99"):
                    cur, base = _p(s["rows"][T], op, stat), _p(b["rows"][T], op, stat)
                    if cur and base and cur > base * (1 + thresh):
                        out.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"],
                                    "routing": s["routing"], "T": T, "op": op, "stat": stat,
                                    "baseline": round(base, 1), "current": round(cur, 1),
                                    "regression_pct": round(100 * (cur - base) / base, 1)})
    return out


def recommendations(series):
    """Per (sku, phase): lowest-p99-dispatch config at the headline T=64 (decode) / T=256 (prefill).

    Any phase other than ``"decode"`` uses T=256. Ties on latency are broken by the config string.
    """
    out = []
    by = defaultdict(list)
    for s in series:
        by[(s["sku"], s["phase"])].append(s)
    for (sku, phase), ss in by.items():
        T = 64 if phase == "decode" else 256
        cands = []
        for s in ss:
            r = s["rows"].get(T)
            if r:
                q = _p(r, "dispatch", "p99")
                if q:
                    cands.append((q, f"{s['dtype']}/{s['mode']}/{s['contract']}/{s['routing']}/{s['resource']}", s["ep"]))
        if cands:
            # Rank on (latency, config) only: ep may be None and must never be compared.
            best = min(cands, key=lambda c: c[:2])
            out.append({"sku": sku, "phase": phase, "at_T": T, "lowest_p99_dispatch_us": round(best[0], 1),
                        "config": best[1], "ep": best[2]})
    return out


def main() -> int:
    """CLI entry point: build the report, print a short summary, optionally write it as JSON."""
    ap = argparse.ArgumentParser(description="CollectiveX operating-envelope analysis")
    ap.add_argument("--results-dir", default="results")
    ap.add_argument("--baseline", help="dir of baseline results for regression detection")
    ap.add_argument("--out")
    a = ap.parse_args()
    s = load(a.results_dir)
    rep = {"n_series": len(s), "skew_penalty": skew_penalty(s), "ll_crossover": ll_crossover(s),
           "topology_penalty": topology_penalty(s), "scaling": scaling(s),
           "scaling_efficiency": scaling_efficiency(s), "recommendations": recommendations(s)}
    if a.baseline:
        regs = regressions(s, load(a.baseline))
        rep["regressions"] = regs
        print(f"regressions vs baseline: {len(regs)} cell(s) > +10%")
    print(f"loaded {len(s)} series")
    sk = rep["skew_penalty"]
    if sk:
        worst = max(sk, key=lambda x: x["p99_amplification"])
        print(f"skew penalty: {len(sk)} cells; worst p99 amplification {worst['p99_amplification']}x "
              f"({worst['sku']} {worst['routing']} T{worst['T']})")
    tp = rep["topology_penalty"]
    if tp:
        print(f"topology penalty (EP4->EP8): {len(tp)} cells; e.g. "
              + ", ".join(f"{x['sku']} T{x['T']} {x['penalty_pct']:+}%" for x in tp[:3]))
    print(f"LL crossover cells: {len(rep['ll_crossover'])}; recommendations: {len(rep['recommendations'])}")
    for r in rep["recommendations"]:
        print(f"  rec {r['sku']}/{r['phase']} @T{r['at_T']}: {r['lowest_p99_dispatch_us']}us via {r['config']}")
    if a.out:
        # Close (and therefore flush) the report file deterministically.
        with open(a.out, "w") as fh:
            json.dump(rep, fh, indent=2)
        print(f"wrote {a.out}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

# diff --git a/experimental/CollectiveX/configs/backends.yaml b/experimental/CollectiveX/configs/backends.yaml new file mode 100644 index 000000000..2237e7631 --- /dev/null +++ b/experimental/CollectiveX/configs/backends.yaml @@ -0,0 +1,49 @@
# CollectiveX backend registry (goal Part 2) — the single source of truth for backend
# capability, replacing the data split between the adapters and tests/capability.py.
Keep in +# sync with ep_deepep.py / ep_mori.py SUPPORTED_* sets (capability.py mirrors this at runtime). +schema_version: 1 +backends: + deepep: + vendor: nvidia + modes: [normal, ll] # ll is DECODE-ONLY (fixed num_max dispatch) + dtypes: [bf16, fp8] + contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1] + transports: [nvlink, mnnvl, rdma] + ep_max_intranode: 8 # <=8 ranks = intranode NVL kernel (incl. MNNVL trays) + ep_min: 2 + phase_constraints: + ll: {phases: [decode], max_tokens_per_rank: 128} # LL is a fixed-num_max decode path + required_image: "lmsysorg/sglang:v0.5.11-cu130" + cap_token_per_rank: 4096 # 4 GiB NVL buffer holds ~4096 tok/rank at hidden=7168 + mori: + vendor: amd + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + transports: [xgmi, rdma] + ep_max_intranode: 8 + ep_min: 2 + phase_constraints: + normal: {max_tokens_per_rank: 512} # 2 GiB registerable heap cap at hidden=7168 + required_image: "rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2" + cap_token_per_rank: 512 + fragility: "wedges (D-state) on sustained iters>=200 at T>=32; needs gradual ramp, low iters" + aiter: + vendor: amd + modes: [normal] + dtypes: [bf16, fp8] + contracts: [layout-and-dispatch-v1] + transports: [xgmi, rdma] + ep_max_intranode: 8 + ep_min: 2 + status: "scaffolded — adapter ep_aiter.py not yet implemented (capability declared, not validated)" + required_image: "rocm/sgl-dev (AITER CK MoE EP)" + +# 'all' resolves to a DEFINED per-vendor backend set (NOT the same across vendors). +vendor_backends: + nvidia: [nccl, deepep] + amd: [rccl, mori] +# Collective primitives (not EP dispatch/combine — phase/dtype/mode/contract N/A). 
+collective_backends: + nccl: [nvidia] + rccl: [amd] diff --git a/experimental/CollectiveX/configs/platforms.yaml b/experimental/CollectiveX/configs/platforms.yaml new file mode 100644 index 000000000..ebb58a430 --- /dev/null +++ b/experimental/CollectiveX/configs/platforms.yaml @@ -0,0 +1,84 @@ +# CollectiveX platform registry (goal Part 2). One entry per SKU: hardware capability is +# separated from VALIDATED software capability (what we've actually run green on real HW). +# scale_up_domain = #GPUs reachable over the intra-domain fabric before crossing a tier +# (NVLink island / NVL72 MNNVL tray-group / XGMI). gpus_per_node bounds single-node EP. +schema_version: 1 +platforms: + h100: + vendor: nvidia + arch: sm90 + gpu: "H100 80GB HBM3" + gpus_per_node: 8 + scale_up_domain: 8 # single 8-GPU NVLink island + transport_tiers: [nvlink, ib] + runner: h100-8x + launcher: launch_h100-dgxc-slurm.sh + ssh: "sa-shared@100.118.57.65" # partition hpc-gpu-1, /mnt/nfs, exclude hpc-gpu-1-7 + validated: + ep_degrees: [8] + backends: [deepep] + max_intranode_gpus: 8 + internode: false # not yet exercised for EP + h200: + vendor: nvidia + arch: sm90 + gpu: "H200 143GB HBM3e" + gpus_per_node: 8 + scale_up_domain: 8 + transport_tiers: [nvlink, ib] + runner: h200-8x + launcher: launch_h200.sh + ssh: "sa-shared@100.78.55.80" # partition main, /home NFS + validated: + ep_degrees: [8] + backends: [deepep] + max_intranode_gpus: 8 + internode: false + b300: + vendor: nvidia + arch: sm100 + gpu: "B300 SXM6 268GB" + gpus_per_node: 8 + scale_up_domain: 8 + transport_tiers: [nvlink, ib] + runner: b300-nv + launcher: launch_b300.sh + ssh: "sa-shared@100.101.13.83" # partition batch_1, acct benchmark, /data, exclude b300-018 + notes: "Blackwell drops clocks on tiny T -> per-point warm burst (warmup>=30). LL aborts." 
+ validated: + ep_degrees: [8] + backends: [deepep] + max_intranode_gpus: 8 + internode: false + gb300: + vendor: nvidia + arch: sm100 + gpu: "GB300 Grace-Blackwell (aarch64)" + gpus_per_node: 4 # NVL72 compute tray = 4 GPU/node + scale_up_domain: 72 # NVL72 MNNVL: one NVLink P2P domain spans the rack + transport_tiers: [mnnvl, ib] + runner: gb300-8x + launcher: _gb300_ep8.sh + ssh: "2-hop: sa-shared@100.92.114.46 -> im-gb300-login-02" # batch_1, acct benchmark, /data + notes: "EP8 = 2 trays but INTRANODE NVLink path (MNNVL is one domain for <=8 ranks). deep_ep 1.1.0." + validated: + ep_degrees: [4, 8] + backends: [deepep] + max_intranode_gpus: 8 # <=8 ranks use the intranode NVL kernel even across 2 trays + internode: false # internode-normal asserts out until >8 ranks (EP16+) + mi355x: + vendor: amd + arch: gfx950 + gpu: "MI355X CDNA4 256 CU" + gpus_per_node: 8 + scale_up_domain: 8 # single 8-GPU XGMI island + transport_tiers: [xgmi, rdma] + runner: mi355x-8x + launcher: launch_mi355x-amds.sh + ssh: "2-hop bastion -> mia1-vm-amd-prj3-slurm-001" # partition compute, cpus-per-task=128 + notes: "MoRI wedges (D-state) on sustained iters>=200 at T>=32; cap iters. 512-tok buffer cap. No LL/fp8." + validated: + ep_degrees: [8] + backends: [mori] + max_intranode_gpus: 8 + internode: false diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml new file mode 100644 index 000000000..39924095a --- /dev/null +++ b/experimental/CollectiveX/configs/suites.yaml @@ -0,0 +1,92 @@ +# CollectiveX named benchmark suites (goal Part 2). A suite binds workloads x platforms x +# backends x modes x contracts x resource regimes x repetitions x required publication level. +# generate_matrix.py resolves a suite against platforms.yaml/backends.yaml capabilities BEFORE +# any GPU is allocated, omitting unsupported combinations with recorded reasons. 
+schema_version: 1 +suites: + ep-smoke-v1: + description: "fast canary: one small point per platform/backend/mode/contract" + workloads: [ds-like-ref] + platforms: [h100, h200, gb300, mi355x] + backends: [deepep, mori] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform] + resource_modes: [tuned] + token_points: [8, 64] + trials: 1 + required_publication: comparable-experimental + + ep-nightly-v1: + description: "headline matrix: both contracts, bf16+fp8, normal+LL, decode+prefill" + workloads: [ds-like-ref] + platforms: [h100, h200, gb300, mi355x] + backends: [deepep, mori] + modes: [normal, ll] + dtypes: [bf16, fp8] + contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1] + routings: [uniform] + resource_modes: [tuned] + phases: [decode, prefill] + trials: 3 + required_publication: official + + ep-models-v1: + description: "model-shape envelope: real MoE dimensions, controlled routing" + workloads: [deepseek-v4, kimi-k2.x, qwen3.5, glm-5, minimax-m3] + platforms: [h100, h200, gb300, mi355x] + backends: [deepep, mori] + modes: [normal] + dtypes: [fp8, bf16] + contracts: [runtime-visible-v1] + routings: [uniform] + resource_modes: [tuned] + phases: [decode, prefill] + trials: 3 + required_publication: comparable-experimental + + ep-scaling-v1: + description: "strong (fixed global tokens) + weak (fixed tokens/rank) scaling across EP degrees" + workloads: [ds-like-ref] + platforms: [gb300] # the only SKU with >1 validated EP degree (EP4 + EP8) + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform] + resource_modes: [tuned] + scaling: [strong, weak] + ep_degrees: [4, 8] + trials: 3 + required_publication: comparable-experimental + + ep-topology-v1: + description: "placement sensitivity: packed vs striped vs adversarial on multi-domain SKUs" + workloads: [ds-like-ref] + platforms: [gb300] # NVL72 tray boundary is the scale-up domain edge + 
backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform, zipf] + placements: [packed, striped, adversarial] + resource_modes: [tuned] + ep_degrees: [8] + trials: 3 + required_publication: comparable-experimental + + ep-routing-v1: + description: "routing-skew sensitivity + EPLB remedy" + workloads: [ds-like-ref] + platforms: [h100, h200, gb300] + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform, balanced, zipf, zipf-mild, zipf-moderate, zipf-heavy, hotspot-single] + eplb: [false, true] + resource_modes: [tuned] + phases: [decode, prefill] + trials: 3 + required_publication: comparable-experimental diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml new file mode 100644 index 000000000..b7fe7cf09 --- /dev/null +++ b/experimental/CollectiveX/configs/workloads.yaml @@ -0,0 +1,76 @@ +# CollectiveX workload registry (goal Part 2). Each workload references an IMMUTABLE canonical +# manifest (tests/workload.py -> .npz + .manifest.json). Three kinds: +# synthetic — controlled DeepSeek-like baseline (dims real, routing controlled) +# model-derived — REAL model MoE dimensions with controlled routing (shape != routing behavior) +# trace-replay — captured routing behavior (future; needs a captured trace) +# Model dims marked verify=true must be confirmed against a checked-in model config before any +# result built on them is promoted past 'comparable-experimental'. +schema_version: 1 + +synthetic: + ds-like-ref: + kind: synthetic + hidden: 7168 + topk: 8 + experts: 256 + dispatch_dtype: bf16 + combine_dtype: bf16 + routings: [uniform, balanced, zipf] + note: "Controlled baseline used through v3/v4 (DeepSeek-V3-shaped)." 
+ +model_derived: + deepseek-v4: + kind: model-derived + hidden: 7168 + topk: 8 + routed_experts: 256 + shared_experts: 1 + expert_alignment: 128 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: false # matches the validated DSV3/V4 serving shape used on these clusters + minimax-m3: + kind: model-derived + hidden: 6144 + topk: 8 + routed_experts: 256 + shared_experts: 1 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: true + kimi-k2.x: + kind: model-derived + hidden: 7168 + topk: 8 + routed_experts: 384 + shared_experts: 1 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: true + glm-5: + kind: model-derived + hidden: 5120 + topk: 8 + routed_experts: 160 + shared_experts: 1 + dispatch_dtype: bf16 + combine_dtype: bf16 + verify: true + qwen3.5: + kind: model-derived + hidden: 4096 + topk: 8 + routed_experts: 128 + shared_experts: 0 + dispatch_dtype: bf16 + combine_dtype: bf16 + verify: true + +# decode vs prefill are workload METADATA, not just token-ladder aliases (goal Part 2): +phase_profiles: + decode: + token_ladder: [1, 2, 4, 8, 16, 32, 64, 128] + description: "one (or few) tokens per active sequence per step; routing varies step-to-step" + prefill: + token_ladder: [128, 256, 512, 1024, 2048, 4096] + description: "chunked-prefill: many tokens per sequence enter each MoE layer at once" diff --git a/experimental/CollectiveX/env_capture.py b/experimental/CollectiveX/env_capture.py new file mode 100644 index 000000000..b906a0497 --- /dev/null +++ b/experimental/CollectiveX/env_capture.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — Layer-0 environment + topology capture. + +Emits a JSON document describing the node a collective benchmark ran on, so +every result is provenance-tagged and a B200-vs-GB200 comparison is defensible. +Standard library only (so it runs in any minimal container, and off-GPU it +degrades gracefully instead of crashing). torch is used only if importable. 
def _run(cmd: list[str], timeout: int = 20) -> str | None:
    """Execute *cmd* and hand back its stripped stdout.

    None is returned when the executable is not on PATH, when the process cannot be
    started or exceeds *timeout* seconds, or when it exits with a non-zero status.
    """
    executable = cmd[0]
    if shutil.which(executable) is None:
        return None
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout, check=False
        )
    except (subprocess.TimeoutExpired, OSError):
        return None
    return proc.stdout.strip() if proc.returncode == 0 else None
def _driver_cuda() -> dict:
    """Report the driver version and the CUDA version as printed by nvidia-smi.

    Either value is None when the corresponding nvidia-smi call produced no usable output.
    """
    driver = None
    queried = _run(
        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
    )
    if queried:
        # One line per GPU; the first one is taken.
        driver = queried.splitlines()[0].strip()
    # `nvidia-smi` (no args) prints the CUDA driver-API version in its header.
    banner = _run(["nvidia-smi"])
    match = re.search(r"CUDA Version:\s*([0-9.]+)", banner) if banner else None
    return {
        "driver_version": driver,
        "cuda_version": match.group(1) if match else None,
    }
+ info["nccl_version"] = ( + ".".join(map(str, nccl)) if isinstance(nccl, tuple) else nccl + ) + info["device_count"] = torch.cuda.device_count() + info["device_name"] = torch.cuda.get_device_name(0) + cc = torch.cuda.get_device_capability(0) + info["compute_capability"] = f"{cc[0]}.{cc[1]}" + except Exception as exc: # pragma: no cover - hardware dependent + info["error"] = repr(exc) + return info + + +def _topology(redact: bool) -> dict: + """GPU/NIC topology matrix + a fingerprint to gate comparability. + + The fingerprint is a hash of the structural part of `nvidia-smi topo -m` + (the connection legend), so two nodes with the same wiring share a key + even if absolute device IDs differ.""" + topo = _run(["nvidia-smi", "topo", "-m"]) + if topo is None: + return {"source": None, "matrix": None, "fingerprint": None} + # Fingerprint the link-type tokens (NV#, NODE, SYS, PIX, PXB, ...) only — + # ignore GPU/NIC labels and whitespace so it's placement-stable. + tokens = re.findall(r"\b(NV\d+|NODE|SYS|PIX|PXB|PHB|X)\b", topo) + fingerprint = hashlib.sha256(" ".join(tokens).encode()).hexdigest()[:16] + return { + "source": "nvidia-smi topo -m", + # The matrix can contain hostnames in some setups; redact wholesale. 
def _rdma(redact: bool) -> dict:
    """RDMA/IB device presence — device names only, hashed when *redact* is set.

    ``ibv_devinfo -l`` is tried first; its first output line is a count header and is
    skipped. ``ibstat -l`` is consulted only when ibv_devinfo yields no output at all.

    Returns:
        ``{"available": bool, "devices": [names or redacted hashes]}``.
    """
    listing = _run(["ibv_devinfo", "-l"])
    if listing:
        raw_names = listing.splitlines()[1:]  # first line is a count
    else:
        # Query ibstat exactly once: a second invocation could time out or fail and
        # return None, which would crash on .splitlines().
        raw_names = (_run(["ibstat", "-l"]) or "").splitlines()
    devices = [name.strip() for name in raw_names if name.strip()]
    return {
        "available": bool(devices),
        "devices": [_redact(d) if redact else d for d in devices],
    }
capture_environment(redact=args.redact, timestamp=args.timestamp) + blob = json.dumps(env, indent=2) + if args.out: + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + fh.write(blob + "\n") + # A one-line human summary to stdout (the JSON is the artifact). + g = env["gpus"] + print( + f"env -> {args.out} | machine={env['platform']['machine']} " + f"gpus={g['count']} topo_fp={env['topology']['fingerprint']}" + ) + else: + print(blob) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/generate_matrix.py b/experimental/CollectiveX/generate_matrix.py new file mode 100644 index 000000000..cec960b93 --- /dev/null +++ b/experimental/CollectiveX/generate_matrix.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +"""CollectiveX matrix generator (goal Part 2: capability planning, sharding, canaries). + +Reads configs/{suites,workloads,platforms,backends}.yaml, resolves a named suite into the FULLY +VALIDATED set of (workload, platform, backend, mode, dtype, contract, routing, ep, phase) cases +BEFORE any GPU is allocated — omitting unsupported combinations with a recorded reason. Then: + * groups compatible cases into SHARDS (same platform/nodes/placement/image/backend/mode/resource + -> one allocation runs many token points), and + * selects a CANARY per (platform, backend, mode, contract) to run before the full shard. + + python3 generate_matrix.py --suite ep-nightly-v1 --out matrix.json + python3 generate_matrix.py --suite ep-smoke-v1 # prints summary + omissions + +Pure stdlib + PyYAML. 'all' as a backend resolves to the platform vendor's EP backend set. 
def resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase, platforms, backends):
    """Decide whether one (platform, backend, mode, dtype, contract, ep, phase) case may run.

    Checks are applied in a fixed order and the first failure wins, so the returned reason
    is stable: unknown platform/backend, vendor mismatch, backend mode/dtype/contract
    support, the platform's validated backends, validated EP degrees, the internode
    requirement, per-mode phase constraints, and finally the cached-layout + LL exclusion.

    Args:
        plat: platform key into ``platforms["platforms"]``.
        beng: backend key into ``backends["backends"]``.
        mode, dtype, contract: values that must appear in the backend's lists.
        routing: accepted for signature parity with the case tuple; not consulted here.
        ep: EP degree; must be listed in the platform's ``validated.ep_degrees``.
        phase: checked against ``phase_constraints[mode]["phases"]`` when declared.
        platforms, backends: parsed platforms.yaml / backends.yaml documents.

    Returns:
        ``(True, "ok")`` or ``(False, reason)``.
    """
    p = platforms["platforms"].get(plat)
    b = backends["backends"].get(beng)
    if p is None:
        return False, f"unknown platform {plat}"
    if b is None:
        return False, f"unknown backend {beng}"
    if b["vendor"] != p["vendor"]:
        return False, f"{beng} is {b['vendor']}, {plat} is {p['vendor']}"
    if mode not in b["modes"]:
        return False, f"{beng} has no mode {mode}"
    if dtype not in b["dtypes"]:
        return False, f"{beng} has no dtype {dtype}"
    if contract not in b["contracts"]:
        return False, f"{beng} has no contract {contract}"
    validated = p["validated"]
    # A same-vendor backend is not enough: the platform must list it as validated.
    # Platforms that do not declare the list are left unrestricted.
    validated_backends = validated.get("backends")
    if validated_backends is not None and beng not in validated_backends:
        return False, f"{beng} not validated on {plat} (have {validated_backends})"
    if ep not in validated["ep_degrees"]:
        return False, f"{plat} EP{ep} not validated (have {validated['ep_degrees']})"
    if ep > validated["max_intranode_gpus"] and not validated.get("internode"):
        return False, f"{plat} EP{ep} needs internode (not validated)"
    pc = (b.get("phase_constraints") or {}).get(mode)
    if pc and pc.get("phases") and phase not in pc["phases"]:
        return False, f"{beng} mode={mode} is {pc['phases']}-only (got {phase})"
    if contract == "cached-layout-comm-only-v1" and mode == "ll":
        return False, "cached-layout meaningless for LL"
    return True, "ok"
generate(suite_name): + suites = _load("suites.yaml")["suites"] + platforms = _load("platforms.yaml") + backends = _load("backends.yaml") + workloads = _load("workloads.yaml") + if suite_name not in suites: + raise SystemExit(f"unknown suite {suite_name}; have {sorted(suites)}") + s = suites[suite_name] + phases = s.get("phases", ["decode"]) + routings = s.get("routings", ["uniform"]) + resource_modes = s.get("resource_modes", ["tuned"]) + cases, omitted = [], [] + for plat in s["platforms"]: + bset = [] + for bspec in s["backends"]: + bset += expand_backends(bspec, plat, platforms, backends) + for beng in sorted(set(bset)): + eps = s.get("ep_degrees") or platforms["platforms"][plat]["validated"]["ep_degrees"] + for wl, mode, dtype, contract, routing, ep, phase, rmode in itertools.product( + s["workloads"], s["modes"], s.get("dtypes", ["bf16"]), s["contracts"], + routings, eps, phases, resource_modes): + ok, reason = resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase, + platforms, backends) + rec = {"workload": wl, "platform": plat, "backend": beng, "mode": mode, + "dtype": dtype, "contract": contract, "routing": routing, "ep": ep, + "phase": phase, "resource_mode": rmode} + (cases if ok else omitted).append({**rec, **({} if ok else {"reason": reason})}) + # SHARDS: one allocation per (platform, backend, mode, resource, image) runs many points. + shards = {} + for c in cases: + img = backends["backends"][c["backend"]].get("required_image", "?") + key = (c["platform"], c["backend"], c["mode"], c["resource_mode"], img) + shards.setdefault(key, []).append(c) + shard_list = [{"platform": k[0], "backend": k[1], "mode": k[2], "resource_mode": k[3], + "image": k[4], "cases": v} for k, v in shards.items()] + # CANARY: one representative (smallest) case per (platform, backend, mode, contract). 
+ canary = {} + for c in cases: + ck = (c["platform"], c["backend"], c["mode"], c["contract"]) + canary.setdefault(ck, c) + return {"suite": suite_name, "required_publication": s.get("required_publication"), + "n_cases": len(cases), "n_omitted": len(omitted), + "cases": cases, "omitted": omitted, "shards": shard_list, + "canaries": list(canary.values())} + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX matrix generator") + ap.add_argument("--suite", required=True) + ap.add_argument("--out") + a = ap.parse_args() + m = generate(a.suite) + print(f"suite={m['suite']} required={m['required_publication']}: " + f"{m['n_cases']} valid cases, {m['n_omitted']} omitted, " + f"{len(m['shards'])} shards, {len(m['canaries'])} canaries") + seen = set() + for o in m["omitted"]: + k = (o["platform"], o["backend"], o["mode"], o["dtype"], o["contract"], o["reason"]) + if k not in seen: + seen.add(k) + print(f" OMIT {o['platform']}/{o['backend']}/{o['mode']}/{o['dtype']}/{o['contract']}: {o['reason']}") + if a.out: + with open(a.out, "w") as fh: + json.dump(m, fh, indent=2) + print(f"wrote {a.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/launchers/_b300_investigate.sh b/experimental/CollectiveX/launchers/_b300_investigate.sh new file mode 100644 index 000000000..68cac0b95 --- /dev/null +++ b/experimental/CollectiveX/launchers/_b300_investigate.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# B300 DeepEP perf investigation (run via srun on an 8-GPU B300 node). +# (1) Diagnose the installed deep_ep build: file, version, and the CUDA archs its +# .so actually contains (sm_100 present? or only sm_90 -> JIT-from-PTX = slow). +# (2) Reproducibility: run the SAME decode config 3x back-to-back in one container +# (high warmup) and report T=64 dispatch p50 each time -> is variance < 10%, or +# is the noise a first-config cold-start artifact? 
+set -uo pipefail +cd /cx || exit 2 +mkdir -p results +NG="${NG:-8}"; RUNNER="${RUNNER:-b300-8x}"; TOPO="${TOPO:-b300-nvlink-island}" + +echo "=== GPU ==="; nvidia-smi --query-gpu=name --format=csv,noheader | head -1 +echo "=== deep_ep build diagnosis ===" +python3 - <<'PY' +import importlib.metadata as md, deep_ep, glob, os, subprocess +print("deep_ep:", md.version("deep_ep"), deep_ep.__file__) +d = os.path.dirname(deep_ep.__file__) +sos = glob.glob(os.path.join(d, "**", "*.so"), recursive=True) + glob.glob(os.path.join(d, "..", "deep_ep_cpp*.so")) +for so in sorted(set(sos)): + print("so:", so) + try: + out = subprocess.run(["cuobjdump", "--list-elf", so], capture_output=True, text=True, timeout=60).stdout + archs = sorted(set(p.split("sm_")[1][:2] for p in out.split() if "sm_" in p)) + print(" ELF archs (cubin):", archs or "") + ptx = subprocess.run(["cuobjdump", "--list-ptx", so], capture_output=True, text=True, timeout=60).stdout + parchs = sorted(set(p.split("sm_")[1][:2] for p in ptx.split() if "sm_" in p)) + print(" PTX archs:", parchs or "") + except Exception as e: + print(" cuobjdump failed:", repr(e)) +PY + +echo "=== reproducibility: decode bf16 x3 (warmup 30, iters 80) ===" +for i in 1 2 3; do + out="results/_repro_b300_decode_bf16_run${i}.json" + timeout -k 30 600 torchrun --nproc_per_node="$NG" tests/run_ep.py \ + --backend deepep --mode normal --dispatch-dtype bf16 --phase decode \ + --routing uniform --resource-mode tuned \ + --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \ + --tokens-ladder "64" --warmup 30 --iters 80 --out "$out" >/dev/null 2>&1 + python3 - "$out" "$i" <<'PY' +import json,sys +try: + d=json.load(open(sys.argv[1])); r=d["rows"][0] + print(f"run{sys.argv[2]}: T=64 dispatch_p50={r['dispatch_us_p50']:.1f} combine_p50={r['combine_us_p50']:.1f} " + f"dispatch_p99={r['dispatch_us_p99']:.1f} status={d['status']}") +except Exception as e: + print(f"run{sys.argv[2]}: FAILED {e!r}") +PY +done +echo "=== DONE ===" diff --git 
a/experimental/CollectiveX/launchers/_gb300_ep8.sh b/experimental/CollectiveX/launchers/_gb300_ep8.sh new file mode 100644 index 000000000..a0b50c543 --- /dev/null +++ b/experimental/CollectiveX/launchers/_gb300_ep8.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# GB300 EP8 sweep — 2 nodes x 4 GPU over the NVL72 MNNVL NVLink domain. Runs the SAME +# v3 DeepEP matrix as the EP4 run (normal: bf16/fp8 x {layout-and-dispatch, cached}, +# decode 1..128 + prefill 128..512) but at EP8, so the curves overlay the other EP8 SKUs +# (H100/H200/MI355X) at matched tokens/rank = same global batch. +# +# PROBE FINDING (2026-06-25): DeepEP 1.1.0+814e508 intranode Buffer(group, nvl, 0) works +# UNCHANGED across 2 NVL72 trays — the MNNVL fabric is one NVLink P2P domain (rdma_rank +# layout=None). So no internode/NVSHMEM/adapter change: just torchrun-free 8-rank srun. +# NCCL_MNNVL_ENABLE/CUMEM are required for the nccl process group + barriers across trays. +# +# Multi-node has no torchrun: each of the 8 srun tasks IS one rank and runs run_ep.py +# directly, taking RANK/WORLD_SIZE/LOCAL_RANK/MASTER_ADDR/MASTER_PORT from SLURM_* env. 
+set -uo pipefail +IMAGE="${CX_IMAGE:-/data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh}" +STAGE="${CX_STAGE:-/data/sa-shared/cx_stage}" +PART="${CX_PARTITION:-batch_1}"; ACCT="${CX_ACCOUNT:-benchmark}" +JOBNAME="${JOBNAME:-cx_gb300_ep8}"; MP="${MASTER_PORT:-29513}" +RUNNER="${RUNNER:-gb300-8x}"; TOPO="${TOPO:-gb300-nvl72-mnnvl}"; TRANSPORT="${TRANSPORT:-mnnvl}" +WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}" +DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}" +DO_LL="${DO_LL:-0}" # Blackwell aborts LL (B300/GB300); normal-only by default +EP_ENV="${CX_EP_ENV:-}" # extra --export csv (intranode needs none; reserved for internode) +export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}" + +echo "[orch] salloc 2x4 GPU partition=$PART acct=$ACCT runner=$RUNNER" +salloc --partition="$PART" --account="$ACCT" --nodes=2 --gres=gpu:4 \ + --ntasks-per-node=4 --exclusive --time="${CX_TIME:-90}" --no-shell --job-name="$JOBNAME" 2>&1 | tail -3 +JID="$(squeue --name="$JOBNAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; } +trap 'scancel "$JID" 2>/dev/null || true' EXIT +st="" +for i in $(seq 1 60); do + st="$(squeue -j "$JID" -h -o %T 2>/dev/null)" + echo "[orch] tick=$i state=$st nodes=$(squeue -j "$JID" -h -o %N 2>/dev/null)" + [ "$st" = "RUNNING" ] && break + [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; } + sleep 8 +done +[ "$st" = "RUNNING" ] || { echo "[orch] FATAL never started"; exit 1; } +NODELIST="$(squeue -j "$JID" -h -o %N)"; MA="$(scontrol show hostnames "$NODELIST" | head -1)" +echo "[orch] JOB_ID=$JID nodes=[$NODELIST] MASTER_ADDR=$MA MASTER_PORT=$MP" + +CMOUNT=(--container-image="$IMAGE" --container-mounts="$STAGE:/cx" + --no-container-mount-home --container-workdir=/cx --no-container-entrypoint) +WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"' + +run(){ # phase dtype 
mode contract ladder
+  local phase="$1" dt="$2" mode="$3" contract="$4" ladder="$5"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${mode}_${contract}.json"
+  echo "### $phase dtype=$dt mode=$mode contract=$contract -> $out"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JID" --nodes=2 --ntasks=8 --ntasks-per-node=4 \
+    "${CMOUNT[@]}" \
+    --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",COLLECTIVEX_IMAGE="$IMAGE",NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1${EP_ENV:+,$EP_ENV} \
+    bash -c "$WRAP" _ \
+    --backend deepep --phase "$phase" --dispatch-dtype "$dt" --mode "$mode" \
+    --measurement-contract "$contract" --routing uniform --resource-mode tuned \
+    --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" --out "$out" < /dev/null 2>&1 | tail -7  # stdin detached; only the last 7 log lines are shown
+  echo "### rc=${PIPESTATUS[0]} -> $out"  # PIPESTATUS[0] = timeout/srun status, not tail's
+}
+
+if [ "${CX_LL_ONLY:-0}" != "1" ]; then
+  # decode normal: both dtypes x both contracts (layout cost made explicit) — matches EP4
+  run decode bf16 normal layout-and-dispatch-v1 "$DEC"
+  run decode fp8 normal layout-and-dispatch-v1 "$DEC"
+  run decode bf16 normal cached-layout-comm-only-v1 "$DEC"
+  run decode fp8 normal cached-layout-comm-only-v1 "$DEC"
+  # prefill normal (cross-vendor contract)
+  run prefill bf16 normal layout-and-dispatch-v1 "$PRE"
+  run prefill fp8 normal layout-and-dispatch-v1 "$PRE"
+fi
+if [ "$DO_LL" = "1" ]; then
+  run decode bf16 ll layout-and-dispatch-v1 "$DEC"
+  run decode fp8 ll layout-and-dispatch-v1 "$DEC"
+fi
+
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_deepep_*.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{})
+print(f"{sys.argv[1].split('/')[-1]:64s} {d['status']:7s} routing_ok={ri.get('consistent_across_ranks')} "
+      f"contract={d['measurement_contract']:26s} T{m.get('headline_tokens_per_rank')} "
+      
f"disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}") +PY +done +scancel "$JID" 2>/dev/null || true +echo "=== GB300 EP8 DONE ===" diff --git a/experimental/CollectiveX/launchers/_gb300_probe.sh b/experimental/CollectiveX/launchers/_gb300_probe.sh new file mode 100644 index 000000000..0bbe564de --- /dev/null +++ b/experimental/CollectiveX/launchers/_gb300_probe.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# GB300 EP8 probe orchestrator — runs on im-gb300-login-02. Allocates 2 nodes (8 GPU, +# 4/node), then runs tests/_gb300_ep_probe.py across 8 ranks for each DeepEP path +# (intranode / internode / ll) to find which works across 2 NVL72 trays. Read-only. +set -uo pipefail +IMAGE="${CX_IMAGE:-/data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh}" +STAGE="${CX_STAGE:-/data/sa-shared/cx_stage}" +PART="${CX_PARTITION:-batch_1}" +ACCT="${CX_ACCOUNT:-benchmark}" +JOBNAME="${JOBNAME:-cx_gb300_probe}" +MP="${MASTER_PORT:-29512}" +export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}" + +echo "[orch] salloc 2x4 GPU partition=$PART acct=$ACCT image=$IMAGE" +salloc --partition="$PART" --account="$ACCT" --nodes=2 --gres=gpu:4 \ + --ntasks-per-node=4 --exclusive --time=30 --no-shell --job-name="$JOBNAME" 2>&1 | tail -3 +JID="$(squeue --name="$JOBNAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; } +trap 'scancel "$JID" 2>/dev/null || true' EXIT + +st="" +for i in $(seq 1 60); do + st="$(squeue -j "$JID" -h -o %T 2>/dev/null)" + echo "[orch] tick=$i state=$st nodes=$(squeue -j "$JID" -h -o %N 2>/dev/null)" + [ "$st" = "RUNNING" ] && break + [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; } + sleep 8 +done +[ "$st" = "RUNNING" ] || { echo "[orch] FATAL never started"; exit 1; } + +NODELIST="$(squeue -j "$JID" -h -o %N)" +MA="$(scontrol show hostnames "$NODELIST" | head -1)" +echo "[orch] JOB_ID=$JID nodes=[$NODELIST] MASTER_ADDR=$MA MASTER_PORT=$MP" + 
+CMOUNT=(--container-image="$IMAGE" --container-mounts="$STAGE:/cx"
+  --no-container-mount-home --container-workdir=/cx
+  --no-container-entrypoint)
+WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/_gb300_ep_probe.py'
+
+for path in intranode internode ll; do  # one 8-rank srun per DeepEP path; CX_PROBE_PATH selects it in the probe
+  echo "=== PROBE path=$path (8 ranks / 2 nodes) ==="
+  srun --jobid="$JID" --nodes=2 --ntasks=8 --ntasks-per-node=4 "${CMOUNT[@]}" \
+    --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",CX_PROBE_PATH="$path",COLLECTIVEX_IMAGE="$IMAGE",NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1 \
+    bash -c "$WRAP" < /dev/null 2>&1 | grep -E 'RESULT|deep_ep=|Buffer.__init__|caps:|world=|FAIL|\| ' || echo "[orch] path=$path produced no RESULT line (rc=${PIPESTATUS[0]})"
+  echo "=== end $path ==="
+done
+
+scancel "$JID" 2>/dev/null || true
+echo "=== GB300 PROBE DONE ==="
diff --git a/experimental/CollectiveX/launchers/_gb300_routing.sh b/experimental/CollectiveX/launchers/_gb300_routing.sh
new file mode 100644
index 000000000..6ba9c412c
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_gb300_routing.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# GB300 EP8 routing-axis sweep — 2 nodes x 4 GPU over NVL72 MNNVL. Headline config
+# (bf16/normal/layout-and-dispatch-v1) under balanced / zipf / zipf+EPLB, routing-tagged
+# filenames. Same srun-8-ranks-no-torchrun harness as _gb300_ep8.sh.
+set -uo pipefail +IMAGE="${CX_IMAGE:-/data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh}" +STAGE="${CX_STAGE:-/data/sa-shared/cx_stage}" +PART="${CX_PARTITION:-batch_1}"; ACCT="${CX_ACCOUNT:-benchmark}" +JOBNAME="${JOBNAME:-cx_gb300_rt}"; MP="${MASTER_PORT:-29517}" +RUNNER="${RUNNER:-gb300-8x}"; TOPO="${TOPO:-gb300-nvl72-mnnvl}"; TRANSPORT="${TRANSPORT:-mnnvl}" +WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}" +DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}"; DO_EPLB="${DO_EPLB:-1}" +export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}" + +echo "[orch] salloc 2x4 GPU partition=$PART runner=$RUNNER (routing sweep)" +salloc --partition="$PART" --account="$ACCT" --nodes=2 --gres=gpu:4 \ + --ntasks-per-node=4 --exclusive --time="${CX_TIME:-90}" --no-shell --job-name="$JOBNAME" 2>&1 | tail -3 +JID="$(squeue --name="$JOBNAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; } +trap 'scancel "$JID" 2>/dev/null || true' EXIT +st="" +for i in $(seq 1 60); do + st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"; echo "[orch] tick=$i state=$st" + [ "$st" = "RUNNING" ] && break + [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; } + sleep 8 +done +[ "$st" = "RUNNING" ] || { echo "[orch] FATAL never started"; exit 1; } +MA="$(scontrol show hostnames "$(squeue -j "$JID" -h -o %N)" | head -1)" +echo "[orch] JOB_ID=$JID MASTER_ADDR=$MA" +CMOUNT=(--container-image="$IMAGE" --container-mounts="$STAGE:/cx" + --no-container-mount-home --container-workdir=/cx --no-container-entrypoint) +WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"' + +run(){ # phase routing eplbflag tag ladder + local phase="$1" routing="$2" eplb="$3" tag="$4" ladder="$5" + local out="results/${RUNNER}_deepep_${phase}_bf16_normal_layout-and-dispatch-v1_${tag}.json" + echo "### $phase routing=$routing eplb='${eplb}' -> $out" + # 
shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JID" --nodes=2 --ntasks=8 --ntasks-per-node=4 \
+    "${CMOUNT[@]}" \
+    --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",COLLECTIVEX_IMAGE="$IMAGE",NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1 \
+    bash -c "$WRAP" _ \
+    --backend deepep --phase "$phase" --dispatch-dtype bf16 --mode normal \
+    --measurement-contract layout-and-dispatch-v1 --routing "$routing" $eplb --resource-mode tuned \
+    --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" --out "$out" < /dev/null 2>&1 | tail -7  # $eplb is unquoted on purpose: empty -> no argument
+  echo "### rc=${PIPESTATUS[0]} -> $out"  # PIPESTATUS[0] = timeout/srun status, not tail's
+}
+
+for ph in decode prefill; do
+  L="$DEC"; [ "$ph" = prefill ] && L="$PRE"
+  run "$ph" balanced "" balanced "$L"
+  run "$ph" zipf "" zipf "$L"
+  [ "$DO_EPLB" = 1 ] && run "$ph" zipf "--eplb" zipf+eplb "$L"
+done
+scancel "$JID" 2>/dev/null || true
+echo "=== GB300 ROUTING DONE ==="
diff --git a/experimental/CollectiveX/launchers/_mi355x_canon.sh b/experimental/CollectiveX/launchers/_mi355x_canon.sh
new file mode 100644
index 000000000..3ffa101d2
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_mi355x_canon.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# MI355X cross-vendor canonical-workload consume (goal DoD 183): MoRI consumes the SAME serialized
+# trace bytes that H100 (NVIDIA) consumed (copied into /cx/cx_workloads), so the workload_id +
+# checksums in this AMD doc MATCH the NVIDIA doc -> "same trace on NVIDIA and AMD" is proven by
+# byte-identity, not by trusting two RNGs. MoRI-safe: bf16/normal, gradual ramp, low iters, bounded.
+set -uo pipefail
+cd /cx || exit 2; mkdir -p results  # abort if the stage mount is missing instead of running from the wrong cwd
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+python3 -c "import mori;print('mori OK')" 2>&1 | tail -1
+echo "### canonical traces available:"; ls /cx/cx_workloads/*.manifest.json 2>/dev/null | wc -l
+out=results/mi355x-8x_mori_decode_bf16_normal_layout-and-dispatch-v1_canon.json
+timeout -k 30 "${CX_RUN_TIMEOUT:-400}" torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \
+  --phase decode --tokens-ladder "${LADDER:-1 2 4 8 16 32 64}" --dispatch-dtype bf16 --mode normal \
+  --measurement-contract layout-and-dispatch-v1 --routing uniform --resource-mode tuned \
+  --workload-dir /cx/cx_workloads --warmup 8 --iters "${ITERS:-20}" --trials "${TRIALS:-1}" \
+  --runner mi355x-8x --topology-class mi355x-xgmi --transport xgmi --out "$out" 2>&1 | tail -14
+echo "### rc=${PIPESTATUS[0]} -> $out"  # PIPESTATUS[0] = timeout/torchrun status, not tail's
+[ -f "$out" ] && python3 - "$out" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); w=d.get("workload",{}); v=d.get("validity",{})
+print(f"workload_source={v.get('workload_source')} pub={d.get('publication_status')} "
+      f"workload_id={w.get('workload_id')} correct_all={all(r['correct'] for r in d['rows'])}")
+print("checksums:", json.dumps(w.get("manifest_checksums") or {})[:300])
+PY
+echo "=== MI355X CANON DONE ==="
diff --git a/experimental/CollectiveX/launchers/_mi355x_orchestrate.sh b/experimental/CollectiveX/launchers/_mi355x_orchestrate.sh
new file mode 100644
index 000000000..3bb91e155
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_mi355x_orchestrate.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Submit-host orchestrator for an MI355X MoRI validation run (contended cluster).
+# salloc (queues behind serving sweeps) -> wait RUNNING -> node-local enroot import
+# -> srun the in-container MoRI driver -> scancel. Logs to ~/cx_stage/mori_orch.out.
+# Always &1 | tail -2 +JID="$(squeue --name="$JOBNAME" -h -o %A | head -n1)" +[ -n "$JID" ] || { echo "[orch] FATAL: no JOB_ID"; exit 1; } +echo "[orch] JOB_ID=$JID" +trap 'scancel "$JID" 2>/dev/null || true' EXIT + +st="" +for i in $(seq 1 "$WAIT_TICKS"); do + st="$(squeue -j "$JID" -h -o %T 2>/dev/null)" + node="$(squeue -j "$JID" -h -o %N 2>/dev/null)" + echo "[orch] tick=$i state=$st node=$node" + [ "$st" = "RUNNING" ] && break + [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; } + sleep 12 +done +[ "$st" = "RUNNING" ] || { echo "[orch] FATAL: never started (state=$st)"; exit 1; } +echo "[orch] RUNNING on $(squeue -j "$JID" -h -o %N)" + +echo "[orch] enroot import to NFS (cache redirected to writable node-local /tmp)" +# Default ENROOT_CACHE_PATH=/var/lib/enroot/cache is root-only here ("Permission denied", +# exit 9). Redirect cache/data/temp to node-local /tmp (writable, fast); the OUTPUT squash +# (-o $SQ) still lands on NFS so it persists + is visible on every node next time. 
+srun --jobid="$JID" bash -c " + export ENROOT_CACHE_PATH=/tmp/enroot_cache_\$USER ENROOT_DATA_PATH=/tmp/enroot_data_\$USER ENROOT_TEMP_PATH=/tmp/enroot_tmp_\$USER + mkdir -p \"\$ENROOT_CACHE_PATH\" \"\$ENROOT_DATA_PATH\" \"\$ENROOT_TEMP_PATH\" + exec 9>\"$LOCK\" || exit 1 + flock -w 1200 9 || { echo 'lock timeout'; exit 1; } + if unsquashfs -l \"$SQ\" >/dev/null 2>&1; then echo 'squash present: $SQ'; + else echo 'importing $IMAGE'; rm -f \"$SQ\"; enroot import -o \"$SQ\" \"docker://$IMAGE\" &1 | tail -20 + +echo "[orch] === srun MoRI driver ===" +srun --jobid="$JID" \ + --container-image="$SQ" --container-mounts="$STAGE:/cx" \ + --container-writable --container-remap-root --no-container-mount-home \ + --container-workdir=/cx --no-container-entrypoint --export=ALL \ + bash /cx/launchers/_validate_mori.sh &1 + +echo "[orch] scancel $JID" +scancel "$JID" 2>/dev/null || true +echo "=== ORCH DONE ===" diff --git a/experimental/CollectiveX/launchers/_mi355x_repro_orchestrate.sh b/experimental/CollectiveX/launchers/_mi355x_repro_orchestrate.sh new file mode 100644 index 000000000..ecf3bc0c2 --- /dev/null +++ b/experimental/CollectiveX/launchers/_mi355x_repro_orchestrate.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Submit-host orchestrator for MI355X MoRI 3-run reproducibility. salloc -> (squash +# already on NFS) -> srun _repro.sh (BACKEND=mori). Logs to ~/cx_stage/mori_repro.out. 
+set -uo pipefail +IMAGE="${CX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}" +SQKEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')" +SQDIR="${CX_SQUASH_DIR:-$HOME/cx_squash}" +SQ="$SQDIR/${SQKEY}.sqsh" +STAGE="$HOME/cx_stage" +JOBNAME="${JOBNAME:-cx_mrepro}" + +EXCLUDE="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}" +echo "[orch] salloc partition=compute exclude=$EXCLUDE gpu:8" +salloc --partition=compute --exclude="$EXCLUDE" --gres=gpu:8 \ + --exclusive --cpus-per-task=128 --time=30 --no-shell --job-name="$JOBNAME" 2>&1 | tail -2 +JID="$(squeue --name="$JOBNAME" -h -o %A | head -n1)" +[ -n "$JID" ] || { echo "[orch] FATAL: no JOB_ID"; exit 1; } +echo "[orch] JOB_ID=$JID" +trap 'scancel "$JID" 2>/dev/null || true' EXIT + +st="" +for i in $(seq 1 150); do + st="$(squeue -j "$JID" -h -o %T 2>/dev/null)" + echo "[orch] tick=$i state=$st node=$(squeue -j "$JID" -h -o %N 2>/dev/null)" + [ "$st" = "RUNNING" ] && break + [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; } + sleep 12 +done +[ "$st" = "RUNNING" ] || { echo "[orch] FATAL: never started"; exit 1; } + +unsquashfs -l "$SQ" >/dev/null 2>&1 || { echo "[orch] FATAL: squash missing $SQ"; exit 1; } +echo "[orch] === srun _repro.sh (mori) ===" +srun --jobid="$JID" \ + --container-image="$SQ" --container-mounts="$STAGE:/cx" \ + --container-writable --container-remap-root --no-container-mount-home \ + --container-workdir=/cx --no-container-entrypoint --export=ALL \ + env COLLECTIVEX_IMAGE="$IMAGE" RUNNER=mi355x-8x TOPO=mi355x-xgmi \ + bash "/cx/launchers/${CX_DRIVER:-_v3_mori.sh}" &1 +scancel "$JID" 2>/dev/null || true +echo "=== ORCH DONE ===" diff --git a/experimental/CollectiveX/launchers/_mori_repro.sh b/experimental/CollectiveX/launchers/_mori_repro.sh new file mode 100644 index 000000000..8f98f8ce9 --- /dev/null +++ b/experimental/CollectiveX/launchers/_mori_repro.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# MoRI 3-run reproducibility using the EXACT invocation _validate_mori.sh proved +# works 
(full ladders, warmup 8, iters 40) — the single-point _repro.sh path wedges +# MoRI mid-ramp on this contended cluster. Each run writes run-tagged decode+prefill +# JSONs; we extract T=64 (decode) and T=512 (prefill) and report the spread. Short +# per-run timeout so a wedge fails fast instead of burning the allocation. +set -uo pipefail +cd /cx || exit 2 +mkdir -p results +NG="${NG:-8}"; RUNNER="${RUNNER:-mi355x-8x}"; TOPO="${TOPO:-mi355x-xgmi}" +export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}" +TMO="${CX_RUN_TIMEOUT:-220}" + +one() { # $1=phase $2=ladder $3=run + local phase="$1" ladder="$2" i="$3" + local out="results/_morirepro_${phase}_run${i}.json" + # iters 100 (was 40): MoRI decode is ~44us, so a 40-sample p50 jitters ~10% run-to-run; + # a 100-sample median is tighter. Still below the sustained-iter count that wedges MoRI. + timeout -k 20 "$TMO" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend mori \ + --mode normal --dispatch-dtype bf16 --phase "$phase" --routing uniform \ + --resource-mode tuned --tokens-ladder "$ladder" --warmup 8 --iters "${MORI_ITERS:-100}" \ + --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi \ + --out "$out" >"$out.log" 2>&1 + local rc=$? 
+ if [ $rc -ne 0 ]; then echo " run$i $phase rc=$rc (see $out.log)"; return; fi +} + +for i in 1 2 3; do + echo "## run $i" + one decode "1 2 4 8 16 32 64 128" "$i" + one prefill "128 256 512" "$i" +done + +echo "=== SPREAD (dispatch p50) ===" +python3 - <<'PY' +import json, glob +def at(phase, T): + vals = [] + for f in sorted(glob.glob(f"results/_morirepro_{phase}_run*.json")): + try: + d = json.load(open(f)) + r = next(r for r in d["rows"] if r["tokens_per_rank"] == T) + vals.append(round(r["dispatch_us_p50"], 1)) + except Exception: + pass + if len(vals) >= 2: + sp = (max(vals) - min(vals)) / min(vals) * 100 + print(f" {phase} T={T}: dispatch_p50 {vals} spread={sp:.1f}% [{'OK <=10%' if sp<=10 else 'OVER'}]") + else: + print(f" {phase} T={T}: insufficient ({len(vals)})") +at("decode", 64) +at("prefill", 512) +PY +echo "=== REPRO DONE ===" diff --git a/experimental/CollectiveX/launchers/_repro.sh b/experimental/CollectiveX/launchers/_repro.sh new file mode 100644 index 000000000..641852d18 --- /dev/null +++ b/experimental/CollectiveX/launchers/_repro.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# 3-run p50 reproducibility driver (run via srun on an 8-GPU node, in one allocation +# so all three runs share the exact environment). Runs the acceptance points — +# decode T=64 and prefill T=512 — three times each and prints dispatch/serial p50 per +# run so the <=10% spread is checkable. Backend/precision/mode via env. 
+set -uo pipefail +cd /cx || exit 2 +mkdir -p results +NG="${NG:-8}" +BACKEND="${BACKEND:-deepep}" +RUNNER="${RUNNER:-x-8x}" +TOPO="${TOPO:-x}" +TRANSPORT="${TRANSPORT:-nvlink}" +DT="${DT:-bf16}"; MODE="${MODE:-normal}"; RM="${RM:-tuned}" + +echo "=== repro: backend=$BACKEND dtype=$DT mode=$MODE resource=$RM runner=$RUNNER ===" +repro() { # $1=phase $2=T + local phase="$1" T="$2" i out + echo "## $phase T=$T x3" + for i in 1 2 3; do + out="results/_repro_${RUNNER}_${BACKEND}_${phase}_T${T}_${DT}_${MODE}_run${i}.json" + timeout -k 30 700 torchrun --nproc_per_node="$NG" tests/run_ep.py --backend "$BACKEND" \ + --phase "$phase" --tokens-ladder "$T" --dispatch-dtype "$DT" --mode "$MODE" \ + --resource-mode "$RM" --routing uniform --runner "$RUNNER" --topology-class "$TOPO" \ + --transport "$TRANSPORT" --warmup "${WARMUP:-32}" --iters "${ITERS:-200}" \ + --out "$out" >"$out.log" 2>&1 || tail -6 "$out.log" + python3 - "$out" "$i" "$T" <<'PY' +import json,sys +try: + d=json.load(open(sys.argv[1])); T=int(sys.argv[3]) + # MoRI's gradual ramp expands the ladder ([1..T]); pick the row that IS T, not rows[0]. 
+ r=next(r for r in d["rows"] if r["tokens_per_rank"]==T) + print(f" run{sys.argv[2]} T={sys.argv[3]} dispatch_p50={r['dispatch_us_p50']:.1f} " + f"combine_p50={r['combine_us_p50']:.1f} serial_p50={r['serial_us_p50']:.1f} status={d['status']}") +except Exception as e: + print(f" run{sys.argv[2]} T={sys.argv[3]} FAILED {e!r}") +PY + done +} + +repro decode 64 +repro prefill 512 + +echo "=== SPREAD (max-min)/min at each point ===" +python3 - "$RUNNER" "$BACKEND" "$DT" "$MODE" <<'PY' +import json, glob, sys +runner, backend, dt, mode = sys.argv[1:5] +for phase, T in (("decode", 64), ("prefill", 512)): + vals = [] + for f in sorted(glob.glob(f"results/_repro_{runner}_{backend}_{phase}_T{T}_{dt}_{mode}_run*.json")): + try: + d = json.load(open(f)) + r = next(r for r in d["rows"] if r["tokens_per_rank"] == T) # T row (ramp-safe) + vals.append(r["dispatch_us_p50"]) + except Exception: + pass + if len(vals) >= 2: + spread = (max(vals) - min(vals)) / min(vals) * 100 + ok = "OK <=10%" if spread <= 10 else "OVER 10%" + print(f" {phase} T={T}: dispatch_p50 runs={[round(v,1) for v in vals]} spread={spread:.1f}% [{ok}]") + else: + print(f" {phase} T={T}: insufficient runs ({len(vals)})") +PY +echo "=== REPRO DONE ===" diff --git a/experimental/CollectiveX/launchers/_routing_mori.sh b/experimental/CollectiveX/launchers/_routing_mori.sh new file mode 100644 index 000000000..739a5299b --- /dev/null +++ b/experimental/CollectiveX/launchers/_routing_mori.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# MoRI (MI355X) routing-axis sweep — balanced + zipf for the headline config (bf16/normal/ +# layout-and-dispatch-v1), the AMD unbalanced-vs-balanced datapoint. MoRI-safe params baked in +# (gradual ramp via the harness, low iters, no warm-burst). No EPLB (kept to DeepEP — MoRI is +# fragile and the 288-physical-expert set is extra risk). Routing-tagged filenames. 
+set -uo pipefail +cd /cx || exit 2 +mkdir -p results +NG="${NG:-8}"; RUNNER="${RUNNER:-mi355x-8x}"; TOPO="${TOPO:-mi355x-xgmi}" +export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}" +ITERS="${ITERS:-40}"; TRIALS="${TRIALS:-2}" + +run(){ # phase routing tag ladder + local phase="$1" routing="$2" tag="$3" ladder="$4" + local out="results/${RUNNER}_mori_${phase}_bf16_normal_layout-and-dispatch-v1_${tag}.json" + echo "### mori $phase routing=$routing -> $out" + timeout -k 30 "${CX_RUN_TIMEOUT:-1100}" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend mori \ + --phase "$phase" --dispatch-dtype bf16 --mode normal --measurement-contract layout-and-dispatch-v1 \ + --routing "$routing" --resource-mode tuned --tokens-ladder "$ladder" \ + --warmup 8 --iters "$ITERS" --trials "$TRIALS" \ + --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi --out "$out" 2>&1 | tail -8 + echo "### rc=${PIPESTATUS[0]} -> $out" +} +python3 -c "import mori;print('mori OK')" 2>&1 | tail -1 +run decode balanced balanced "1 2 4 8 16 32 64 128" +run decode zipf zipf "1 2 4 8 16 32 64 128" +run prefill balanced balanced "128 256 512" +run prefill zipf zipf "128 256 512" +echo "=== SUMMARY ===" +for f in results/${RUNNER}_mori_*_{balanced,zipf}.json; do + [ -f "$f" ] || continue + python3 - "$f" <<'PY' +import json,sys +d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{}); sh=d.get("shape",{}) +print(f"{sys.argv[1].split('/')[-1]:60s} {d['status']:7s} rt={sh.get('routing'):9s} ok={ri.get('consistent_across_ranks')} " + f"T{m.get('headline_tokens_per_rank')} disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}") +PY +done +echo "=== MORI ROUTING DONE ===" diff --git a/experimental/CollectiveX/launchers/_routing_rerun.sh b/experimental/CollectiveX/launchers/_routing_rerun.sh new file mode 100644 index 000000000..3776774cd --- /dev/null +++ 
b/experimental/CollectiveX/launchers/_routing_rerun.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Routing-axis sweep (single-node torchrun): the headline config (bf16 / normal /
+# layout-and-dispatch-v1) under balanced / zipf / zipf+EPLB, so the plot's Routing selector
+# compares balanced vs unbalanced vs EPLB. Filenames carry the routing tag so they never
+# overwrite the uniform v3 results. Reusable across NVIDIA (deepep) + AMD (mori) via env.
+# BACKEND=deepep|mori NG RUNNER TOPO TRANSPORT DEC/PRE ladders DO_EPLB(1) ITERS/TRIALS
+set -uo pipefail
+cd /cx 2>/dev/null || cd /ix/experimental/CollectiveX 2>/dev/null || { echo "no cx dir"; exit 2; }
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-x-8x}"; TOPO="${TOPO:-x}"; TRANSPORT="${TRANSPORT:-nvlink}"
+BACKEND="${BACKEND:-deepep}"; WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}"
+DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}"
+DO_EPLB="${DO_EPLB:-1}" # mori: set 0 (skip EPLB, just balanced+zipf)
+PHASES="${PHASES:-decode prefill}"
+
+run(){ # phase routing eplbflag tag ladder — eplbflag is "" or "--eplb"; tag goes into the filename
+  local phase="$1" routing="$2" eplb="$3" tag="$4" ladder="$5"
+  local out="results/${RUNNER}_${BACKEND}_${phase}_bf16_normal_layout-and-dispatch-v1_${tag}.json"
+  echo "### $phase routing=$routing eplb='${eplb}' -> $out"
+  # shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend "$BACKEND" \
+    --phase "$phase" --dispatch-dtype bf16 --mode normal --measurement-contract layout-and-dispatch-v1 \
+    --routing "$routing" $eplb --resource-mode tuned --tokens-ladder "$ladder" \
+    --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" --out "$out" 2>&1 | tail -7
+  echo "### rc=${PIPESTATUS[0]} -> $out"  # PIPESTATUS[0] = timeout/torchrun status, not tail's
+}
+
+for ph in $PHASES; do
+  L="$DEC"; [ "$ph" = prefill ] && L="$PRE"
+  run "$ph" balanced "" balanced "$L"
+  run "$ph" zipf "" zipf "$L"
+  [ "$DO_EPLB" = 1 ] && run "$ph" zipf "--eplb" zipf+eplb "$L"
+done
+
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_${BACKEND}_*_{balanced,zipf,zipf+eplb}.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{}); e=d.get("eplb",{})
+sh=d.get("shape",{}); tag=str(sh.get("routing"))+("+eplb" if e.get("enabled") else "")  # str(): tolerate a doc without shape.routing
+imb=f" imb {e.get('imbalance_before'):.1f}->{e.get('imbalance_after'):.1f}x" if e.get("enabled") else ""
+print(f"{sys.argv[1].split('/')[-1]:62s} {d['status']:7s} rt={tag:11s} ok={ri.get('consistent_across_ranks')} "
+      f"T{m.get('headline_tokens_per_rank')} disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}{imb}")
+PY
+done
+echo "=== ROUTING SWEEP DONE ==="
diff --git a/experimental/CollectiveX/launchers/_singlenode_orchestrate.sh b/experimental/CollectiveX/launchers/_singlenode_orchestrate.sh
new file mode 100644
index 000000000..093c3b5f5
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_singlenode_orchestrate.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Generic single-node orchestrator (H100/H200/MI355X): salloc 1 node (NG GPU) -> srun the
+# in-container driver (default _routing_rerun.sh). Mirrors the GB300 orchestrator but single
+# node (driver uses torchrun internally). Env: CX_IMAGE CX_STAGE CX_PARTITION CX_ACCOUNT
+# RUNNER TOPO TRANSPORT BACKEND NG CX_DRIVER + sweep knobs (DEC PRE ITERS TRIALS DO_EPLB PHASES).
+set -uo pipefail +IMAGE="${CX_IMAGE:?CX_IMAGE}"; STAGE="${CX_STAGE:?CX_STAGE}"; PART="${CX_PARTITION:?CX_PARTITION}" +JOBNAME="${JOBNAME:-cx_rt}"; NG="${NG:-8}"; DRIVER="${CX_DRIVER:-_routing_rerun.sh}" +ACCT=(); [ -n "${CX_ACCOUNT:-}" ] && ACCT=(--account="$CX_ACCOUNT") +EXTRA=(); [ -n "${CX_EXCLUDE:-}" ] && EXTRA=(--exclude="$CX_EXCLUDE") +[ -n "${CX_CPUS:-}" ] && EXTRA+=(--cpus-per-task="$CX_CPUS") + +echo "[orch] salloc $NG GPU partition=$PART driver=$DRIVER runner=${RUNNER:-?}" +salloc --partition="$PART" "${ACCT[@]}" "${EXTRA[@]}" --gres=gpu:"$NG" --exclusive \ + --time="${CX_TIME:-60}" --no-shell --job-name="$JOBNAME" 2>&1 | tail -2 +JID="$(squeue --name="$JOBNAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; } +trap 'scancel "$JID" 2>/dev/null || true' EXIT +st="" +for i in $(seq 1 60); do + st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"; echo "[orch] tick=$i state=$st node=$(squeue -j "$JID" -h -o %N 2>/dev/null)" + [ "$st" = RUNNING ] && break + [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; } + sleep 8 +done +[ "$st" = RUNNING ] || { echo "[orch] FATAL never started"; exit 1; } + +# Single quoted --export string so ladder values with spaces (DEC/PRE) survive as ONE value +# each (srun splits the list on commas, not spaces). 
+EXP="ALL,COLLECTIVEX_IMAGE=$IMAGE,NG=$NG,RUNNER=${RUNNER:?},TOPO=${TOPO:?},TRANSPORT=${TRANSPORT:-nvlink}"
+EXP+=",BACKEND=${BACKEND:-deepep},DEC=${DEC:-1 2 4 8 16 32 64 128},PRE=${PRE:-128 256 512}"
+EXP+=",ITERS=${ITERS:-200},TRIALS=${TRIALS:-3},DO_EPLB=${DO_EPLB:-1},PHASES=${PHASES:-decode prefill}"
+EXP+=",WARMUP=${WARMUP:-32},CX_RUN_TIMEOUT=${CX_RUN_TIMEOUT:-900},DO_LL=${DO_LL:-1}"
+[ -n "${MORI_COMMIT:-}" ] && EXP+=",MORI_COMMIT=$MORI_COMMIT"
+
+srun --jobid="$JID" --container-image="$IMAGE" --container-mounts="$STAGE:/cx" \
+  --no-container-mount-home --container-workdir=/cx --no-container-entrypoint \
+  --export="$EXP" bash "/cx/launchers/$DRIVER" < /dev/null 2>&1
+scancel "$JID" 2>/dev/null || true
+echo "=== ORCH DONE ==="
diff --git a/experimental/CollectiveX/launchers/_v3_mori.sh b/experimental/CollectiveX/launchers/_v3_mori.sh
new file mode 100644
index 000000000..f26d9045c
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_v3_mori.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# MoRI v3 re-run driver (run via srun on 8-GPU MI355X). v3 harness: trials + p99 +
+# routing-identity + layout-and-dispatch-v1 (MoRI's only contract). iters capped (MoRI
+# wedges >=~200 sustained at T>=32); defaults are 2 trials x 40 iters = 80 pooled samples.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-mi355x-8x}"; TOPO="${TOPO:-mi355x-xgmi}"
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+
+run(){ # phase ladder
+  local phase="$1" ladder="$2"
+  local out="results/${RUNNER}_mori_${phase}_bf16_normal_layout-and-dispatch-v1.json"
+  echo "### mori $phase ladder=[$ladder]"
+  # MoRI is slow (combine re-dispatches each iter) + ramps the whole ladder; trials=3 x
+  # iters=50 over [1..128] blew past 700s. 2 trials x 40 iters = 80 pooled samples, fits.
+ timeout -k 30 "${CX_RUN_TIMEOUT:-1100}" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend mori \ + --phase "$phase" --dispatch-dtype bf16 --mode normal \ + --measurement-contract layout-and-dispatch-v1 --routing uniform --resource-mode tuned \ + --tokens-ladder "$ladder" --warmup 8 --iters "${ITERS:-40}" --trials "${TRIALS:-2}" \ + --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi --out "$out" 2>&1 | tail -8 + echo "### rc=${PIPESTATUS[0]} -> $out" +} +python3 -c "import mori;print('mori OK')" 2>&1 | tail -1 +run decode "1 2 4 8 16 32 64 128" +run prefill "128 256 512" +echo "=== SUMMARY ===" +for f in results/${RUNNER}_mori_*layout-and-dispatch-v1.json; do + [ -f "$f" ] || continue + python3 - "$f" <<'PY' +import json,sys +d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{}) +print(f"{sys.argv[1].split('/')[-1]:58s} {d['status']:7s} routing_ok={ri.get('consistent_across_ranks')} " + f"T{m.get('headline_tokens_per_rank')} disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}") +PY +done +echo "=== V3 MORI DONE ===" diff --git a/experimental/CollectiveX/launchers/_v3_rerun.sh b/experimental/CollectiveX/launchers/_v3_rerun.sh new file mode 100644 index 000000000..c9fedc718 --- /dev/null +++ b/experimental/CollectiveX/launchers/_v3_rerun.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# v3 re-run driver (DeepEP): headline matrix with the v3 harness — trials, p50/p90/p99, +# explicit contracts, routing-identity proof. Reusable across NVIDIA SKUs via env. 
+# No `-e`: a failing benchmark must not abort the rest of the matrix; each run
+# reports its own exit status instead.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-x-8x}"; TOPO="${TOPO:-x}"; TRANSPORT="${TRANSPORT:-nvlink}"
+WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}"
+DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}"
+DO_LL="${DO_LL:-1}" # B300-class fabrics that abort LL set DO_LL=0
+
+# run <phase> <dtype> <mode> <contract> <ladder>
+# Launches one DeepEP torchrun (uniform routing, tuned resources) and writes
+# results/${RUNNER}_deepep_<phase>_<dtype>_<mode>_<contract>.json. Only the last 6
+# lines of combined stdout/stderr are shown. The rc echo must stay directly after
+# the pipeline: PIPESTATUS[0] is the status of `timeout ... torchrun`, not `tail`.
+run(){ # phase dtype mode contract ladder
+  local phase="$1" dt="$2" mode="$3" contract="$4" ladder="$5"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${mode}_${contract}.json"
+  echo "### $phase dtype=$dt mode=$mode contract=$contract"
+  timeout -k 30 700 torchrun --nproc_per_node="$NG" tests/run_ep.py --backend deepep \
+    --phase "$phase" --dispatch-dtype "$dt" --mode "$mode" --measurement-contract "$contract" \
+    --routing uniform --resource-mode tuned --tokens-ladder "$ladder" \
+    --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" \
+    --out "$out" 2>&1 | tail -6
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+python3 -c "import deep_ep,importlib.metadata as m;print('deep_ep',m.version('deep_ep'))" 2>&1 | tail -1
+# decode normal: both dtypes x both contracts (layout cost made explicit)
+run decode bf16 normal layout-and-dispatch-v1 "$DEC"
+run decode fp8 normal layout-and-dispatch-v1 "$DEC"
+run decode bf16 normal cached-layout-comm-only-v1 "$DEC"
+run decode fp8 normal cached-layout-comm-only-v1 "$DEC"
+# decode LL (decode-only optimized path) where the fabric supports it
+if [ "$DO_LL" = "1" ]; then
+  run decode bf16 ll layout-and-dispatch-v1 "$DEC"
+  run decode fp8 ll layout-and-dispatch-v1 "$DEC"
+fi
+# prefill normal (cross-vendor contract = layout-and-dispatch-v1)
+run prefill bf16 normal layout-and-dispatch-v1 "$PRE"
+run prefill fp8 normal layout-and-dispatch-v1 "$PRE"
+
+# One summary line per result JSON of this runner. The `[ -f ]` guard skips the
+# literal, unexpanded glob when no file matches.
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_deepep_*.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{})
+print(f"{sys.argv[1].split('/')[-1]:62s} {d['status']:7s} routing_ok={ri.get('consistent_across_ranks')} "
+      f"contract={d['measurement_contract']:26s} T{m.get('headline_tokens_per_rank')} "
+      f"disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}")
+PY
+done
+echo "=== V3 RERUN DONE ==="
diff --git a/experimental/CollectiveX/launchers/_v3_smoke.sh b/experimental/CollectiveX/launchers/_v3_smoke.sh
new file mode 100644
index 000000000..fd2852fba
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_v3_smoke.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# v3 harness smoke (run via srun on 8 GPUs): validates the NEW code paths on real
+# hardware — pooled trials + p50/p90/p99, routing-identity cross-rank proof, BOTH
+# measurement contracts (incl. DeepEP cached-layout), separated logical bytes, schema 3.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-h100-8x}"; TOPO="${TOPO:-h100-nvlink-island}"
+
+# run <contract> <dtype>
+# Runs a short decode ladder (1 4 16 64) for one measurement contract and dtype,
+# then prints selected fields of the T=64 row from the result JSON. Any error
+# while reading or indexing the JSON is reported as "PARSE FAIL" rather than
+# aborting the smoke run.
+run() { # $1=contract $2=dtype
+  local contract="$1" dt="$2"
+  local out="results/_v3smoke_${dt}_${contract}.json"
+  echo "### contract=$contract dtype=$dt"
+  timeout -k 30 400 torchrun --nproc_per_node="$NG" tests/run_ep.py --backend deepep \
+    --mode normal --dispatch-dtype "$dt" --phase decode --routing uniform \
+    --resource-mode tuned --measurement-contract "$contract" \
+    --tokens-ladder "1 4 16 64" --warmup 16 --iters 60 --trials 2 \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \
+    --out "$out" 2>&1 | tail -8
+  echo "### rc=${PIPESTATUS[0]}"
+  python3 - "$out" <<'PY'
+import json,sys
+try:
+    d=json.load(open(sys.argv[1])); r=next(x for x in d["rows"] if x["tokens_per_rank"]==64)
+    ri=d["routing_identity"]; rp=d["reproduction"]
+    print(f" schema={d['schema_version']} contract={d['measurement_contract']} status={d['status']}")
+    print(f" routing_consistent={ri['consistent_across_ranks']} trace_sig={ri['trace_signature']}")
+    print(f" T64 disp p50/p90/p99={r['dispatch_us_p50']:.1f}/{r['dispatch_us_p90']:.1f}/{r['dispatch_us_p99']:.1f} "
+          f"samples={r['samples_pooled']} trials={r['trials']}")
+    print(f" dispatch_logical_bytes={r['dispatch_logical_bytes']} combine_logical_bytes={r['combine_logical_bytes']} "
+          f"byte_contract={r['byte_contract']}")
+    print(f" idx_hash={r['routing_hash']} samples_per_point={rp['samples_per_point']}")
+except Exception as e:
+    print(" PARSE FAIL", repr(e))
+PY
+}
+
+python3 -c "import deep_ep,importlib.metadata as m;print('deep_ep',m.version('deep_ep'))" 2>&1 | tail -1
+run layout-and-dispatch-v1 bf16
+run cached-layout-comm-only-v1 bf16
+run layout-and-dispatch-v1 fp8
+echo "=== V3 SMOKE DONE ==="
diff --git a/experimental/CollectiveX/launchers/_v4_all.sh b/experimental/CollectiveX/launchers/_v4_all.sh
new file mode 100644
index 000000000..f2934794d
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_v4_all.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# v4 full re-run for one (single-node) SKU under one allocation: the headline matrix
+# (_v3_rerun.sh: bf16/fp8 x normal{layout,cached}/LL, decode+prefill) followed by the routing
+# sweep (_routing_rerun.sh: balanced/zipf/zipf+eplb). Both invoke the CURRENT v4 harness, so
+# every JSON carries publication_status/validity/measured-roundtrip — overwriting the legacy v3
+# files of the same name. Env (RUNNER/TOPO/TRANSPORT/DEC/PRE/DO_LL/DO_EPLB/ITERS/TRIALS/WARMUP)
+# is provided by _singlenode_orchestrate.sh.
+set -uo pipefail
+# Run both stages regardless of the first one's outcome; a nonzero exit from a
+# stage is only reported as a WARN line.
+echo "=== V4 HEADLINE (_v3_rerun.sh) ==="
+bash /cx/launchers/_v3_rerun.sh || echo "WARN headline returned nonzero"
+echo "=== V4 ROUTING (_routing_rerun.sh) ==="
+bash /cx/launchers/_routing_rerun.sh || echo "WARN routing returned nonzero"
+echo "=== V4 ALL DONE ==="
diff --git a/experimental/CollectiveX/launchers/_validate_deepep.sh b/experimental/CollectiveX/launchers/_validate_deepep.sh
new file mode 100644
index 000000000..4743e1850
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_validate_deepep.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# In-container DeepEP validation driver (run via srun on an 8-GPU node).
+# Exercises the reference (bf16) + optimized (fp8) NORMAL-mode paths on decode and
+# prefill ladders with reduced iters for a fast correctness/artifact gate. Each
+# torchrun writes one provenance-tagged JSON; we grep status=valid at the end.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"
+RUNNER="${RUNNER:-h100-8x}"
+TOPO="${TOPO:-h100-nvlink-island}"
+WARMUP="${WARMUP:-32}" # B300/Blackwell needs ~30 to reach steady-state clocks
+ITERS="${ITERS:-50}"
+DEC_LADDER="${DEC_LADDER:-1 2 4 8 16 32 64 128}"
+PRE_LADDER="${PRE_LADDER:-128 256 512}"
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-lmsysorg/sglang:v0.5.11-cu130}"
+
+echo "=== nvidia-smi ==="; nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | head -1
+echo "=== deep_ep ==="; python3 -c "import deep_ep,importlib.metadata as m;print('deep_ep',m.version('deep_ep'))" 2>&1 | tail -1
+
+# run <phase> <dtype> <ladder> <resource_mode>
+# NORMAL-mode DeepEP run; writes results/${RUNNER}_deepep_<phase>_<dtype>_<resource_mode>.json.
+# The rc echo reads PIPESTATUS[0] (the timeout/torchrun status, not tail's), so it
+# must stay immediately after the pipeline.
+run() { # $1=phase $2=dtype $3=ladder $4=resource_mode
+  local phase="$1" dt="$2" ladder="$3" rm="$4"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${rm}.json"
+  echo "### RUN phase=$phase dtype=$dt resource=$rm ladder=[$ladder]"
+  timeout -k 30 600 torchrun --nproc_per_node="$NG" tests/run_ep.py \
+    --backend deepep --mode normal --dispatch-dtype "$dt" --phase "$phase" \
+    --routing uniform --resource-mode "$rm" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \
+    --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" \
+    --out "$out" 2>&1 | tail -25
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+# run_mode <phase> <dtype> <ladder> <resource_mode> <mode>
+# Same as run() but with an explicit --mode; the mode is appended to the output
+# file name, so its results never overwrite run()'s files.
+run_mode() { # $1=phase $2=dtype $3=ladder $4=resource_mode $5=mode
+  local phase="$1" dt="$2" ladder="$3" rm="$4" mode="$5"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${rm}_${mode}.json"
+  echo "### RUN phase=$phase dtype=$dt resource=$rm mode=$mode ladder=[$ladder]"
+  timeout -k 30 600 torchrun --nproc_per_node="$NG" tests/run_ep.py \
+    --backend deepep --mode "$mode" --dispatch-dtype "$dt" --phase "$phase" \
+    --routing uniform --resource-mode "$rm" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \
+    --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" \
+    --out "$out" 2>&1 | tail -25
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+# Each stage below can be disabled from the environment (DO_NORMAL / DO_LL / DO_NORM = 0).
+if [ "${DO_NORMAL:-1}" = "1" ]; then
+  run decode bf16 "$DEC_LADDER" tuned
+  run decode fp8 "$DEC_LADDER" tuned
+  run prefill bf16 "$PRE_LADDER" tuned
+  run prefill fp8 "$PRE_LADDER" tuned
+fi
+# Optimized decode path = low-latency (LL). bf16 + fp8 (fp8 cast is in-kernel/timed).
+# Full decode ladder incl. T=128 settles whether num_tokens < or <= num_max.
+if [ "${DO_LL:-1}" = "1" ]; then
+  run_mode decode bf16 "$DEC_LADDER" tuned ll
+  run_mode decode fp8 "$DEC_LADDER" tuned ll
+fi
+# A normalized-regime sample (both resource regimes are required by the goal).
+if [ "${DO_NORM:-1}" = "1" ]; then
+  run_mode decode fp8 "$DEC_LADDER" normalized normal
+fi
+
+# One summary line per result JSON of this runner. The `[ -f ]` guard skips the
+# literal, unexpanded glob when no file matches. A result without a
+# dispatch_us_p50 metric (e.g. a run that produced no metrics) is printed as 0.0:
+# formatting None with `:.1f` raises TypeError and would replace the summary line
+# with a traceback.
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_deepep_*.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1]))
+m=d.get("metrics",{}); r=d.get("reproduction",{})
+print(f"{sys.argv[1].split('/')[-1]:52s} status={d['status']:7s} mode={d['mode']:6s} "
+      f"dtype={d['shape']['dispatch_dtype']:4s} fp8_in_timing={str(r.get('fp8_quant_in_timing')):5s} "
+      f"tol={d['correctness']['tolerance']} maxrelerr={d['correctness']['max_rel_error']:.4f} "
+      f"hT={m.get('headline_tokens_per_rank')} disp={(m.get('dispatch_us_p50') or 0):.1f}")
+PY
+done
+echo "=== DONE ==="
diff --git a/experimental/CollectiveX/launchers/_validate_mori.sh b/experimental/CollectiveX/launchers/_validate_mori.sh
new file mode 100644
index 000000000..347dc728c
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_validate_mori.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# In-container MoRI validation driver (run via srun on an 8-GPU MI355X node).
+# Re-validates the reference (bf16/normal) decode+prefill with the current harness,
+# then runs the fp8 capability probe (decides whether MoRI gets fp8 caps). LL is not
+# probed (MoRI has no low-latency entrypoint). Each torchrun writes one JSON.
+set -uo pipefail +cd /cx || exit 2 +mkdir -p results +NG="${NG:-8}" +RUNNER="${RUNNER:-mi355x-8x}" +TOPO="${TOPO:-mi355x-xgmi}" +export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}" + +echo "=== device ==="; rocm-smi --showproductname 2>/dev/null | head -3 || true +python3 -c "import mori; print('mori import OK')" 2>&1 | tail -2 + +run() { # $1=phase $2=ladder + local phase="$1" ladder="$2" + local out="results/${RUNNER}_mori_${phase}_bf16_tuned_normal.json" + echo "### RUN mori phase=$phase ladder=[$ladder]" + timeout -k 30 700 torchrun --nproc_per_node="$NG" tests/run_ep.py \ + --backend mori --mode normal --dispatch-dtype bf16 --phase "$phase" \ + --routing uniform --resource-mode tuned \ + --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi \ + --tokens-ladder "$ladder" --warmup 8 --iters 40 --out "$out" 2>&1 | tail -25 + echo "### rc=${PIPESTATUS[0]} -> $out" +} + +run decode "1 2 4 8 16 32 64 128" +run prefill "128 256 512" + +echo "### MoRI fp8 capability probe" +timeout -k 20 300 torchrun --nproc_per_node="$NG" tests/probe_mori_caps.py 2>&1 | tail -35 + +echo "=== SUMMARY ===" +for f in results/${RUNNER}_mori_*.json; do + [ -f "$f" ] || continue + python3 - "$f" <<'PY' +import json,sys +d=json.load(open(sys.argv[1])); m=d.get("metrics",{}) +print(f"{sys.argv[1].split('/')[-1]:46s} status={d['status']:7s} mode={d['mode']:6s} " + f"dtype={d['shape']['dispatch_dtype']:4s} maxrelerr={d['correctness']['max_rel_error']:.4f} " + f"hT={m.get('headline_tokens_per_rank')} disp={m.get('dispatch_us_p50'):.1f} " + f"blocks={d['backend_provenance'].get('block_num')}") +PY +done +echo "=== DONE ===" diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh new file mode 100644 index 000000000..e560fc987 --- /dev/null +++ b/experimental/CollectiveX/launchers/common.sh @@ -0,0 +1,168 @@ +# shellcheck shell=bash +# CollectiveX — shared launcher helpers (sourced, not 
executed). +# +# Cluster-generic scaffolding only (Slurm/container/build/staging); no +# model-serving. Logging goes to stderr so functions can `echo` a single +# result on stdout. + +cx_log() { printf '[collectivex] %s\n' "$*" >&2; } +cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; } + +# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI +# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import +# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.) +# IMPORT BY TAG, not by digest: enroot's anonymous Docker Hub token scope is built +# from the tag; a bare `repo@sha256:` ref makes enroot prompt for a password and +# HANG in non-interactive CI (and a combined `tag@sha256` ref 400s). The expected +# multi-arch index digest is recorded for provenance/verification: +CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975" +# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based +# squash creation on these nodes — "failed to mount overlay ... Invalid argument". +# v0.5.11-cu130 imports cleanly and is pre-staged on GB200.) +# DeepEP is NOT bundled here -> run_in_container.sh builds it via rebuild-deepep. +# (The arch-specific deepseek-v4-{blackwell,grace-blackwell} images DO bundle +# DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.) +CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130" + +# AMD (ROCm/CDNA): the multi-arch NVIDIA image above is x86_64+aarch64 CUDA and +# cannot run on MI355X. AMD uses a separate ROCm image that bundles MoRI (the +# AMD EP library). Single-arch (linux/amd64 host, ROCm runtime); not digest- +# pinned yet — pin once validated on the runner. See CONTAINERS.md. 
+CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2" + +cx_default_image() { + case "$1" in + mi355x*|mi350x*|mi325x*|mi300x*) echo "$CX_IMAGE_AMD_MORI" ;; + b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;; + *) cx_die "no default image for runner prefix: $1" ;; + esac +} + +# cx_ensure_squash -> echoes the squash file path. +# Imports via enroot only if a valid squash is not already present (flock-guarded, +# mirroring runners/launch_b200-dgxc.sh). +cx_ensure_squash() { + local squash_dir="$1" image="$2" + mkdir -p "$squash_dir" 2>/dev/null || true + local key sq locks + key="$(printf '%s' "$image" | sed 's#[/:@#]#_#g')" + sq="$squash_dir/${key}.sqsh" + locks="$squash_dir/.locks"; mkdir -p "$locks" 2>/dev/null || true + ( + flock -w 900 9 || cx_die "lock timeout for $sq" + if unsquashfs -l "$sq" >/dev/null 2>&1; then + cx_log "squash present: $sq" + else + cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)" + rm -f "$sq" + # &2 \ + || cx_die "enroot import failed for $image (anonymous auth needs a TAG ref, not a bare digest; or pre-stage the squash)" + unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq" + fi + ) 9>"$locks/${key}.lock" + echo "$sq" +} + +# cx_stage_repo -> echoes the mount-source root. +# Some clusters (e.g. GB200/watchtower) do not cross-mount the runner workspace +# to compute nodes. If CX_STAGE_DIR is set, rsync the CollectiveX tree onto that +# compute-visible shared FS and mount from there. No-op (echo repo_root) when +# stage_dir is empty or equals repo_root. 
+cx_stage_repo() {
+  local repo_root="$1" stage_dir="${2:-}"
+  if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then
+    echo "$repo_root"; return 0
+  fi
+  mkdir -p "$stage_dir/experimental" || cx_die "cannot create stage dir $stage_dir"
+  cx_log "staging experimental/CollectiveX -> $stage_dir (compute-visible)"
+  # rsync output goes to stderr so stdout carries only the echoed stage dir.
+  rsync -a --delete \
+    --exclude='.nccl-tests/' --exclude='__pycache__/' --exclude='results/plots/' \
+    "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" >&2 \
+    || cx_die "rsync to stage dir failed"
+  echo "$stage_dir"
+}
+
+# cx_collect_results <mount_src> <repo_root>
+# When the run used a staged (compute-visible) mount, copy result JSONs back to
+# the original checkout's results/ so the workflow's upload-artifact (which reads
+# the checkout, not the stage dir) finds them. No-op when no staging was used.
+# NOTE(review): the copy is best-effort (`|| true`), so the "copied results" line
+# is logged even when no JSON matched.
+cx_collect_results() {
+  local mount_src="$1" repo_root="$2" dst
+  [ "$mount_src" = "$repo_root" ] && return 0
+  dst="$repo_root/experimental/CollectiveX/results"
+  mkdir -p "$dst"
+  cp "$mount_src/experimental/CollectiveX/results/"*.json "$dst/" 2>/dev/null || true
+  cx_log "copied results from stage dir -> $dst (for artifact upload)"
+}
+
+# cx_build_nccl_tests <parent_dir> [mpi=0] -> echoes the build/ dir.
+# Runs IN-CONTAINER (login nodes have no nvcc). Cached: skips if already built.
+# CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang
+# cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed.
+cx_build_nccl_tests() {
+  local parent="$1" mpi="${2:-0}" dir bin sfx=""
+  # Cache MPI=0 and MPI=1 builds in SEPARATE dirs. A single-node (MPI=0) binary
+  # reused under `srun --mpi=pmix` runs as N standalone world=1 procs (busbw=0);
+  # keying the cache by flavor prevents that cross-contamination.
+  [ "$mpi" = "1" ] && sfx="-mpi"
+  dir="$parent/nccl-tests$sfx"
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ]; then
+    cx_log "nccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning nccl-tests -> $dir"
+    git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \
+      || cx_die "git clone nccl-tests failed"
+  fi
+  # MPI=1 needs MPI_HOME. On Debian/Ubuntu OpenMPI the headers live under
+  # /usr/lib/<multiarch-triplet>/openmpi/include (NOT /usr/include), so MPI_HOME=/usr fails;
+  # point it at that openmpi dir (libmpi resolves via the default linker path).
+  # Works for both x86_64 (B200) and aarch64 (GB200). Override with CX_MPI_HOME.
+  local mpi_home="${CX_MPI_HOME:-}"
+  if [ "$mpi" = "1" ] && [ -z "$mpi_home" ]; then
+    mpi_home="$(ls -d /usr/lib/*/openmpi 2>/dev/null | head -n1)"
+  fi
+  cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr}${mpi_home:+, MPI_HOME=$mpi_home})"
+  make -C "$dir" -j MPI="$mpi" \
+    CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \
+    NCCL_HOME="${CX_NCCL_HOME:-/usr}" \
+    ${mpi_home:+MPI_HOME="$mpi_home"} >&2 \
+    || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME/CX_MPI_HOME; need nccl.h + libnccl)"
+  [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin"
+  echo "$dir/build"
+}
+
+# cx_build_rccl_tests <parent_dir> [mpi=0] -> echoes the build/ dir.
+# AMD/ROCm counterpart of cx_build_nccl_tests: ROCm/rccl-tests is a fork of
+# nccl-tests producing the SAME binary names (<op>_perf) and output format, so
+# run_nccl.py parses it unchanged. `make` defaults to ROCm at /opt/rocm
+# (amdclang++ + librccl); validated building in-container on MI355X. Override
+# CX_ROCM_HOME / CX_RCCL_HOME / CX_MPI_HOME if the toolchain lives elsewhere.
+# NOTE(review): unlike cx_build_nccl_tests, the cache dir here is NOT keyed by
+# MPI flavor, so an existing MPI=0 build is returned for an MPI=1 request (and
+# vice versa) — confirm whether that is intended.
+cx_build_rccl_tests() {
+  local parent="$1" mpi="${2:-0}" dir bin
+  dir="$parent/rccl-tests"
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ]; then
+    cx_log "rccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning rccl-tests -> $dir"
+    git clone --depth 1 https://github.com/ROCm/rccl-tests.git "$dir" >&2 \
+      || cx_die "git clone rccl-tests failed"
+  fi
+  cx_log "building rccl-tests (MPI=$mpi, ROCm ${CX_ROCM_HOME:-/opt/rocm})"
+  make -C "$dir" -j MPI="$mpi" \
+    ${CX_ROCM_HOME:+HIP_HOME="$CX_ROCM_HOME"} \
+    ${CX_RCCL_HOME:+RCCL_HOME="$CX_RCCL_HOME"} \
+    ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \
+    || cx_die "rccl-tests build failed (need ROCm + librccl; try CX_ROCM_HOME)"
+  [ -x "$bin" ] || cx_die "rccl-tests build produced no binary at $bin"
+  echo "$dir/build"
+}
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
new file mode 100644
index 000000000..b7a03b2c1
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# CollectiveX — 2-node B200 SKU adapter (cross CX-7 InfiniBand spine), x86_64.
+#
+# The other half of the headline: the same primitives as single-node B200, but
+# spanning two nodes so the transport is InfiniBand rather than NVLink. Contrast
+# with GB200, where the 2-node-equivalent stays on NVL72 NVLink (MNNVL).
+#
+# Multi-node orchestration differs from single-node, so this adapter does NOT
+# use run_in_container.sh: it builds nccl-tests (MPI=1), runs each op across all
+# ranks (raw capture), then parses on the login node. Currently CX_BENCH=nccl
+# only (multi-node DeepEP/MNNVL is the srt-slurm follow-up).
+#
+# SPIKE CAVEATS: needs `srun --mpi=pmix` wired for pyxis and a compute-visible
+# checkout — set CX_STAGE_DIR to a shared FS (e.g. /home/sa-shared/cx-stage) if
+# the runner workspace is not cross-mounted to compute.
+#
+# Run: bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+CX_BENCH="${CX_BENCH:-nccl}"
+[ "$CX_BENCH" = "nccl" ] || cx_die "launch_b200-dgxc-slurm.sh supports CX_BENCH=nccl only (got '$CX_BENCH'); multi-node DeepEP is a follow-up"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc-slurm}"
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-8}"
+NODES="${CX_NODES:-2}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+TOPO="b200-nvlink-island+cx7-ib"
+WORLD=$((NODES * GPUS_PER_NODE))
+MPI_FLAG="${CX_SRUN_MPI:-pmix}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance (propagated via --export=ALL).
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+
+# Op name -> nccl-tests binary name.
+declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
+                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
+
+cx_log "runner=$RUNNER_NAME nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# Allocate without a shell, then look the job id up by job name.
+# NOTE(review): the lookup takes the first job of this user with this name —
+# presumably only one such job exists at a time; confirm for concurrent runs.
+salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+  --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \
+  --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+# Release the allocation however the script exits.
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+  --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
+  --no-container-entrypoint)
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"
+
+# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node).
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source launchers/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+
+# In-container path of the MPI-flavored build dir that step 1 passed to cx_build_nccl_tests.
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+# 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS.
+# A failing op or parse is logged as WARN and the loop moves on to the next op.
+for op in $OPS; do
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG) -> $raw"
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+    --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+    --export=ALL,NCCL_CUMEM_ENABLE=1 \
+    "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-8G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+    > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+
+  # 3) Parse on the login node (pure stdlib python; no container needed).
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
+    --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$TOPO" --transport ib \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
+
+cx_log "done — JSON artifacts under $CX_DIR/results/"
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
new file mode 100644
index 000000000..42d860975
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# CollectiveX — B200 single-node SKU adapter (8x B200, NVLink island, x86_64).
+#
+# Thin adapter: handles B200-specific allocation/container, then hands off to
+# launchers/run_in_container.sh which runs whichever benchmark CX_BENCH selects
+# (nccl | deepep | all). Mirrors runners/launch_b200-dgxc.sh (salloc + enroot
+# squash + srun --container) with all model-serving stripped.
+#
+# Run from inside the InferenceX checkout on the B200 login node:
+#   bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh # nccl (default)
+#   CX_BENCH=deepep bash .../launch_b200-dgxc.sh # DeepEP (rebuild)
+#
+# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(30)
+# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES
+# CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc}"
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# Everything exported below reaches the container through `srun --export=ALL`.
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance.
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \
+  --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+# Release the allocation however the script exits.
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_b300-nv.sh b/experimental/CollectiveX/launchers/launch_b300-nv.sh
new file mode 100644
index 000000000..7f485480a
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b300-nv.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# CollectiveX — B300 (b300-nv GH runner) adapter. The self-hosted runner is named
+# `b300-nv_NN`, so runner.name's prefix resolves to this file via
+# launch_${RUNNER_NAME%%_*}.sh. Identical B300 settings to launch_b300.sh (the
+# canonical/manual entry point) — delegate so there is a single source of truth.
+set -euo pipefail
+exec bash "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/launch_b300.sh" "$@"
diff --git a/experimental/CollectiveX/launchers/launch_b300.sh b/experimental/CollectiveX/launchers/launch_b300.sh
new file mode 100644
index 000000000..6085165d9
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b300.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# CollectiveX — B300 single-node SKU adapter (8x B300 SXM6, NVLink island, x86_64, SM100).
+#
+# Thin adapter: B300-specific allocation/container, then hands off to
+# launchers/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors
+# launch_h200.sh; B300 differs in: partition `batch_1` with a REQUIRED account
+# (`benchmark`), and the compute-visible share is /data (10.3.26.100:/data) — NOT
+# /home and NOT the node-local /scratch, both invisible to compute nodes here. Both
+# the squash AND the staged repo MUST live on /data or pyxis fails "No such file".
+#
+# Run from inside the InferenceX checkout on the B300 login node:
+#   bash experimental/CollectiveX/launchers/launch_b300.sh # nccl (default)
+#   CX_BENCH=deepep CX_PHASE=both bash .../launch_b300.sh # DeepEP, decode+prefill
+#
+# Env knobs: CX_PARTITION(batch_1) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(45)
+# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b300}"
+PARTITION="${CX_PARTITION:-batch_1}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}" # B300 scheduler REQUIRES a valid account/partition combo
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-b300-018}" # known-bad node (per the serving launcher)
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b300)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/data/sa-shared/containers}"
+# CX_STAGE_DIR gets a default here, so the repo is staged to it unless the
+# caller overrides it.
+export CX_STAGE_DIR="${CX_STAGE_DIR:-/data/sa-shared/cx_stage}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# Everything exported below reaches the container through `srun --export=ALL`.
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b300-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# Allocate without a shell, then look the job id up by job name.
+salloc --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \
+  --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+# Release the allocation however the script exits.
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
new file mode 100644
index 000000000..4863b9c10
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray.
+#
+# Two paths, selected by CX_NODES:
+#  * CX_NODES=1 (default): single tray, 4 GPU, intra-tray MNNVL. Hands off to
+#    run_in_container.sh (CX_BENCH = nccl | deepep | all), -g 4.
+#  * CX_NODES>1: multi-node over the NVL72 NVLink fabric (MNNVL), e.g. CX_NODES=2
+#    = 8 GPU. nccl only — builds nccl-tests (MPI=1), runs each op across all ranks
+#    via `srun --mpi=pmix` (1 GPU/rank), parses on the login node. Same shape that
+#    runs single-node B200 (NVLink island) and multi-node B200 (CX-7 IB) — here it
+#    stays entirely on NVL72 NVLink. Validated 8-GPU (2 trays) on-node.
+# +# Run from inside the InferenceX checkout on the GB200 login node: +# bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # 4 GPU, nccl +# CX_NODES=2 bash .../launch_gb200-nv.sh # 8 GPU MNNVL +# CX_BENCH=deepep bash .../launch_gb200-nv.sh # 4 GPU, DeepEP +# +# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NODES(1) +# CX_GPUS_PER_NODE(4) CX_TIME(30) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH +# CX_OPS CX_MIN_BYTES CX_MAX_BYTES CX_SRUN_MPI(pmix) CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-gb200-nv}" +PARTITION="${CX_PARTITION:-batch}" +ACCOUNT="${CX_ACCOUNT:-benchmark}" +GPUS_PER_NODE="${CX_GPUS_PER_NODE:-4}" # NVL72 compute tray = 4 GPU/node +NODES="${CX_NODES:-1}" +TIME_MIN="${CX_TIME:-30}" +IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +WORLD=$((NODES * GPUS_PER_NODE)) + +export CX_RUNNER="$RUNNER_NAME" CX_TS="$TS" +export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" +# Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded. 
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD bench=$CX_BENCH (aarch64)" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +# ---------------------------------------------------------------------------- +if [ "$NODES" -le 1 ]; then + # Single tray (4 GPU): generic dispatcher, -g N single process. + export CX_NGPUS="$GPUS_PER_NODE" + salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPUS_PER_NODE" \ + --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" + JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" + [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" + cx_log "JOB_ID=$JOB_ID" + trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" + cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" + exit 0 +fi + +# ---------------------------------------------------------------------------- +# Multi-node MNNVL (nccl only): mirrors launch_b200-dgxc-slurm but stays on the +# NVL72 NVLink fabric. Build nccl-tests MPI=1, run each op across WORLD ranks +# (1 GPU/rank) via srun --mpi=pmix, parse on the login node. 
+[ "$CX_BENCH" = "nccl" ] || cx_die "GB200 multi-node supports CX_BENCH=nccl only (got '$CX_BENCH')" +MPI_FLAG="${CX_SRUN_MPI:-pmix}" +declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf + [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf ) + +salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \ + --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \ + --no-shell --job-name="$RUNNER_NAME" +JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" + --no-container-entrypoint) +ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json" + +# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node). +srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \ + --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" /dev/null + python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS" + ' + +BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build" +OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" + +# 2) Per op: run across all ranks (1 GPU/rank), tee raw output to the shared FS. 
+for op in $OPS; do + raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt" + cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG, MNNVL) -> $raw" + srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \ + --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \ + --export=ALL,NCCL_CUMEM_ENABLE=1,NCCL_MNNVL_ENABLE=1,MC_FORCE_MNNVL=1 "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)" + + # 3) Parse on the login node (pure stdlib; no container needed). + python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \ + --world-size "$WORLD" --nodes "$NODES" \ + --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --env-json "$ENVJSON" \ + --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \ + --timestamp "$TS" || cx_log "WARN: parse $op failed" +done + +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" +cx_log "done — JSON artifacts under $CX_DIR/results/" diff --git a/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh new file mode 100644 index 000000000..590ea112d --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# CollectiveX — H100 (DGX Cloud Slurm) single-node SKU adapter (8x H100, NVLink +# island, x86_64, SM90). Matches the GH self-hosted runner name `h100-dgxc-slurm_NN` +# (runner.name prefix -> this script via launch_${RUNNER_NAME%%_*}.sh). +# +# Thin adapter mirroring launch_b200-dgxc.sh (same DGX Cloud tenancy/conventions: +# partition default gpu-2, account benchmark, compute-visible /home/sa-shared); +# allocates, then hands off to run_in_container.sh (CX_BENCH = nccl | deepep | all). +# The DeepEP path runs the full FP8 + low-latency matrix (validated on 8x H100). +# +# !!! First on-runner run = validation (no direct SSH to this cluster at authoring). 
+# If pyxis fails "No such file" the share is not compute-visible — set CX_SQUASH_DIR +# + CX_STAGE_DIR to a compute-visible FS (cf. hpc-gpu-1 needing /mnt/nfs). +# +# Env knobs: CX_PARTITION(hpc-gpu-1) CX_ACCOUNT(customer) CX_NGPUS(8) CX_TIME(45) +# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +# Cluster identity from runners/launch_h100-dgxc-slurm.sh (the serving launcher): +# partition hpc-gpu-1, account customer, known-bad node hpc-gpu-1-7 excluded. This +# is the SAME cluster validated over SSH. CRITICAL: /home is login-local (not +# compute-visible) — the squash MUST live on /mnt/nfs; the GH runner workspace is +# already on /mnt/nfs (compute-visible) so the checkout mounts directly (no staging). +RUNNER_NAME="${RUNNER_NAME:-h100-dgxc-slurm}" +PARTITION="${CX_PARTITION:-hpc-gpu-1}" +ACCOUNT="${CX_ACCOUNT:-customer}" +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-hpc-gpu-1-7}" +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-45}" +IMAGE="${CX_IMAGE:-$(cx_default_image h100)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/nfs/sa-shared/containers}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="h100-nvlink-island" CX_TRANSPORT="nvlink" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" +export NCCL_CUMEM_ENABLE=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log 
"CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +salloc --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \ + --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" + +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_h200.sh b/experimental/CollectiveX/launchers/launch_h200.sh new file mode 100644 index 000000000..82bdaccdd --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_h200.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# CollectiveX — H200 single-node SKU adapter (8x H200, NVLink island, x86_64, SM90). +# +# Thin adapter: H200-specific allocation/container, then hands off to +# launchers/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors +# launch_b200-dgxc.sh; H200 differs in: partition `hpc-gpu-1` (20x 8-GPU nodes), +# NO account (open scheduler), home is shared NFS (compute-visible, so no +# CX_STAGE_DIR), and the sglang image is imported on first use (not pre-staged). 
+# +# Run from inside the InferenceX checkout on the H200 login node: +# bash experimental/CollectiveX/launchers/launch_h200.sh # nccl (default) +# CX_BENCH=deepep CX_PHASE=both bash .../launch_h200.sh # DeepEP, decode+prefill +# +# Env knobs: CX_PARTITION(hpc-gpu-1) CX_ACCOUNT() CX_NGPUS(8) CX_TIME(45) CX_IMAGE +# CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-h200}" +PARTITION="${CX_PARTITION:-hpc-gpu-1}" +ACCOUNT="${CX_ACCOUNT:-}" # H200 scheduler is open; no account needed +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-45}" # generous: first-use enroot import of the image +IMAGE="${CX_IMAGE:-$(cx_default_image h200)}" +# CRITICAL: on this cluster /home is LOGIN-LOCAL (/dev/sdc) — invisible to compute +# nodes. The compute-visible share is /mnt/nfs (10.0.0.130:/nfs). Both the squash +# AND the staged repo MUST live there or pyxis fails "No such file or directory". 
+SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/nfs/sa-shared/containers}" +export CX_STAGE_DIR="${CX_STAGE_DIR:-/mnt/nfs/sa-shared/cx_stage}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="h200-nvlink-island" CX_TRANSPORT="nvlink" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" +export NCCL_CUMEM_ENABLE=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION ${ACCOUNT:+account=$ACCOUNT }ngpus=$NGPUS bench=$CX_BENCH" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +salloc --partition="$PARTITION" ${ACCOUNT:+--account="$ACCOUNT"} --gres=gpu:"$NGPUS" \ + --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" + +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh new file mode 100644 index 000000000..3a7ceccb3 
--- /dev/null +++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# CollectiveX — MI355X (AMD CDNA4, 8 GPU/node) SKU adapter: MoRI dispatch/combine. +# +# AMD counterpart to the NVIDIA adapters. Differs from them in ways taken from +# the real runners/launch_mi355x-amds.sh: +# * partition `compute`, no --account (cluster default), --cpus-per-task=128, +# and known-bad nodes excluded; +# * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on +# the allocated node (not on the login node like the shared-FS NVIDIA path); +# * pyxis flags --container-writable --container-remap-root for the ROCm image. +# AMD backends: CX_BENCH=mori (MoRI EP dispatch/combine, default) or nccl +# (collective primitives via rccl-tests, the ROCm nccl-tests fork). +# +# !!! NOT yet validated on hardware (no MI355X cluster access at authoring time). +# Treat the first on-runner run as validation — like the DeepEP path was on GB200. +# +# Run from inside the InferenceX checkout on the MI355X login node: +# bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh +# +# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(60) CX_IMAGE +# CX_SQUASH_DIR(/var/lib/squash) CX_EXCLUDE_NODES CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}" +PARTITION="${CX_PARTITION:-compute}" +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-60}" # generous: a cold enroot import of the large ROCm image +IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}" # node-local on MI355X +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}" +# Optional node pin. The node-local squash is only staged on some nodes, and on +# others /var/lib/squash isn't writable (cold-import fails). 
Pin CI to nodes that +# already hold the squash via CX_NODELIST (overrides the exclude list). +NODELIST="${CX_NODELIST:-}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +# AMD backends wired: mori (MoRI EP dispatch/combine) and nccl (collective +# primitives via rccl-tests). Default mori; honor an explicit CX_BENCH. +export CX_BENCH="${CX_BENCH:-mori}" +case "$CX_BENCH" in + mori|nccl) ;; + *) cx_log "mi355x: CX_BENCH='$CX_BENCH' unsupported on AMD (want mori|nccl); using mori"; export CX_BENCH=mori ;; +esac +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" + +cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH image=$IMAGE" +# AMD workspace is compute-visible (the serving launcher bind-mounts it directly), +# so no staging; the node-local squash is handled via srun below. +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +SQUASH_KEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')" +SQUASH_FILE="$SQUASH_DIR/${SQUASH_KEY}.sqsh" +# Lock in a guaranteed-writable per-node dir, NOT next to the squash: on some +# nodes /var/lib/squash is root/admin-owned, so even a world-readable squash +# can't get a sibling .lock created (flock -> "Bad file descriptor"). CX_LOCK_DIR +# overrides. The lock only serializes concurrent imports on the same node. +LOCK_FILE="${CX_LOCK_DIR:-/tmp}/${SQUASH_KEY}.sqsh.lock" +cx_log "squash(node-local)=$SQUASH_FILE lock=$LOCK_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +# Pin to specific nodes (CX_NODELIST) when set, else exclude the known-bad ones. 
+if [ -n "$NODELIST" ]; then + cx_log "node pin: --nodelist=$NODELIST" + salloc --partition="$PARTITION" --nodelist="$NODELIST" --gres=gpu:"$NGPUS" \ + --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +else + salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \ + --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +fi +JOB_ID="$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +# Clear stray containers, then enroot-import to the node-local squash (flock, +# /dev/null || true' || true +srun --jobid="$JOB_ID" bash -c " + mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true + exec 9>\"$LOCK_FILE\" || { echo 'cannot open lock $LOCK_FILE' >&2; exit 1; } + flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then + echo 'squash present: $SQUASH_FILE' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" /dev/null || true +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh new file mode 100644 index 000000000..bfbbba845 --- /dev/null +++ b/experimental/CollectiveX/launchers/run_in_container.sh @@ -0,0 +1,139 @@ +#!/usr/bin/env bash +# CollectiveX — generic in-container benchmark dispatcher (single-node). +# +# Runs INSIDE the container under `srun`, invoked by every per-SKU adapter +# (launch_.sh). The SKU adapter handles allocation/container/transport-env; +# this script decides WHICH benchmark to run from CX_BENCH, so any benchmark can +# be driven through any SKU's launch script. Writes provenance-tagged JSON to +# results/. 
+# +# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO +# Selector: CX_BENCH = nccl | deepep | mori | all (default nccl) +# (mori = AMD ROCm EP; nccl/deepep = NVIDIA. `all` = nccl+deepep.) +# NCCL knobs: CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME +# EP knobs (DeepEP/MoRI), all -> tests/run_ep.py: +# CX_PHASE = decode | prefill | both (default decode) <- picks the token sweep +# CX_TOKENS_LADDER (space/comma sep; blank = phase default), CX_TOKENS_PER_RANK (legacy single point) +# CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE CX_ROUTING CX_MODE(normal|ll) +# CX_NUM_SMS (DeepEP comm SMs) CX_SEED CX_ITERS +set -euo pipefail + +cd /ix/experimental/CollectiveX +# shellcheck source=common.sh +source launchers/common.sh +mkdir -p results + +: "${CX_RUNNER:?CX_RUNNER not set}" +: "${CX_NGPUS:?CX_NGPUS not set}" +: "${CX_TS:?CX_TS not set}" +: "${CX_TOPO:?CX_TOPO not set}" +CX_BENCH="${CX_BENCH:-nccl}" +CX_TRANSPORT="${CX_TRANSPORT:-}" +ENVJSON="results/env_${CX_RUNNER}_${CX_TS}.json" + +cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO" +python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS" + +run_nccl_suite() { + local build ops op sfail=0 impl=nccl + # AMD/ROCm -> rccl-tests (fork; same binaries + output, parsed by run_nccl.py); + # NVIDIA/CUDA -> nccl-tests. Both single-node: MPI=0, -g N. + if [ -d /opt/rocm ] || command -v hipcc >/dev/null 2>&1; then + impl=rccl + build="$(cx_build_rccl_tests "$PWD/.nccl-tests" 0)" || return 1 + else + build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1 + fi + cx_log "collective impl=$impl build=$build" + ops="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" + for op in $ops; do + if ! 
python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \ + --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \ + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \ + --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1; then + cx_log "WARN: $impl $op failed or invalid"; sfail=1 + fi + done + return "$sfail" +} + +# Resolve the source-tokens-per-rank sweep: explicit CX_TOKENS_LADDER wins; else +# the legacy single-point CX_TOKENS_PER_RANK becomes a one-point ladder; else +# blank => tests/run_ep.py picks the phase default (decode small / prefill large). +cx_ep_ladder() { + if [ -n "${CX_TOKENS_LADDER:-}" ]; then printf '%s' "$CX_TOKENS_LADDER" + elif [ -n "${CX_TOKENS_PER_RANK:-}" ]; then printf '%s' "$CX_TOKENS_PER_RANK" + else printf ''; fi +} + +# run_ep_suite +# One tests/run_ep.py invocation per phase (decode/prefill/both); dispatch and +# combine are timed separately inside it. One JSON per (backend, phase). +run_ep_suite() { + local backend="$1" phase phases ladder rc=0 + ladder="$(cx_ep_ladder)" + phases="${CX_PHASE:-decode}" + [ "$phases" = "both" ] && phases="decode prefill" + for phase in $phases; do + cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-}'" + # Hard wall-clock guard: a wedged collective (e.g. a backend that hangs at a shape) + # must FAIL FAST, never burn the whole job timeout. timeout -k sends SIGKILL after + # a grace period. Override with CX_RUN_TIMEOUT (seconds). + if ! 
timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \ + torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py --backend "$backend" \ + --phase "$phase" --tokens-ladder "$ladder" --mode "${CX_MODE:-normal}" \ + --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ + --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" --routing "${CX_ROUTING:-uniform}" \ + ${CX_EPLB:+--eplb} ${CX_WORKLOAD_DIR:+--workload-dir "$CX_WORKLOAD_DIR"} \ + --num-sms "${CX_NUM_SMS:-24}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-200}" \ + --trials "${CX_TRIALS:-3}" \ + --measurement-contract "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" \ + --resource-mode "${CX_RESOURCE_MODE:-normalized}" --sm-fraction "${CX_SM_FRACTION:-0.18}" \ + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"; then + cx_log "WARN: $backend $phase run failed/timed out (CX_RUN_TIMEOUT=${CX_RUN_TIMEOUT:-900}s)"; rc=1 + fi + done + return "$rc" +} + +run_deepep_suite() { + # DeepEP is not bundled in the multi-arch image. Try to import; if absent, + # attempt rebuild-deepep (srt-slurm setup script). Inability to run is a + # failure, not a silent skip — the caller asked for deepep. + if ! python3 -c "import deep_ep" 2>/dev/null; then + if command -v rebuild-deepep.sh >/dev/null 2>&1; then + cx_log "building DeepEP via rebuild-deepep.sh" + rebuild-deepep.sh >&2 || { cx_log "WARN: rebuild-deepep.sh failed"; return 1; } + else + cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; cannot run deepep" + return 1 + fi + fi + run_ep_suite deepep +} + +run_mori_suite() { + # MoRI (AMD ROCm EP), bundled in the AMD MoRI image. If absent this is a + # failure (MoRI is not rebuildable here), not a silent skip. Single-node + # 8x MI355X over XGMI; torch.cuda maps onto ROCm/HIP. + if ! 
python3 -c "import mori" 2>/dev/null; then + cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori" + return 1 + fi + run_ep_suite mori +} + +rc=0 +case "$CX_BENCH" in + nccl) run_nccl_suite || rc=1 ;; + deepep) run_deepep_suite || rc=1 ;; + mori) run_mori_suite || rc=1 ;; + all) run_nccl_suite || rc=1; run_deepep_suite || rc=1 ;; + *) cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|mori|all)" ;; +esac + +# Summary table for the log; also fails the job if no valid results were produced. +python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1 +exit "$rc" diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md new file mode 100644 index 000000000..d62bb7746 --- /dev/null +++ b/experimental/CollectiveX/plan.md @@ -0,0 +1,940 @@ +# CollectiveX — Plan + +> **How to read this.** This is the single canonical plan. It is **spike-first** and **scoped to `experimental/CollectiveX/`** on a branch — nothing in the production serving path changes until a promotion decision is made later. Part 1 is background (what CollectiveX is, reconstructed from team discussion). Part 2 is the implementation plan. Where this plan says "now," it means the Milestone 0 spike; "later" items (GitHub workflow, database, app frontend) are deliberately deferred. All repository references (runners, launchers, workflows, matrix logic, the `experimental/` charter) were verified against the live InferenceX repo — see References. + +--- + +# Part 1 — Background + +## What it is + +CollectiveX is a benchmarking workstream under the InferenceX umbrella. It measures **collective communication** and **MoE dispatch/combine**, and performs **apples-to-apples, cross-vendor comparison of expert-parallel (EP) libraries** across NVIDIA and AMD (TPU later). 
The intended deliverables are an **OSS benchmark project** and a **public explainer article** — a credible cross-vendor collective benchmark plus the story around it. + +## Why + +Existing public benchmarks don't offer trustworthy, like-for-like collective/EP comparison across vendors. CollectiveX fills that gap by reusing InferenceX's runner and cluster infrastructure to produce reproducible, provenance-tagged results. + +## Current state + +- An initial MVP exists: it collected collective and kernel shapes and produced MoE dispatch/combine results on NVIDIA. +- **Normal mode works; low-latency (LL) mode is blocked** on IBGDA enablement — a direct GPU↔NIC data-and-control path over PCIe that removes CPU coordination and simplifies MoE dispatch/combine collectives — which depends on cluster-networking work outside this project. +- The main near-term enabler is NVIDIA networking / IBGDA; the AMD EP stack and AMD networking (Ultra Ethernet) are the cross-vendor counterpart. + +--- + +# Part 2 — Implementation plan + +## Implementation status (built) + +The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that: + +- **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`. +- **Per-SKU launch adapters** (`launchers/launch_.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`. 
+- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. +- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `tests/ep_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `tests/ep_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`). + +This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental). + +## Scope and placement + +CollectiveX starts as an **experimental project on its own branch**, fully contained under `experimental/CollectiveX/`: + +```bash +git switch main +git pull --ff-only +git switch -c collectivex +mkdir -p experimental/CollectiveX +``` + +This matches the repository's intent: `experimental/` is explicitly non-core ("experimental WIP code that is mostly Claude Code generated… not intended for production use or as part of the official InferenceMAX results"). + +For the experimental phase, **everything stays inside `experimental/CollectiveX/**`**. 
Do **not** modify: + +```text +benchmarks/ +runners/ +utils/ +.github/configs/ +perf-changelog.yaml +InferenceX-app +``` + +The only eventual exception is a minimal workflow dispatcher under `.github/workflows/` (because executable workflows must live there); all real CollectiveX logic, schemas, launchers, and processing stay under `experimental/CollectiveX/`. + +**This supersedes any notion of CollectiveX becoming a top-level InferenceX subsystem or extending the production serving matrix up front.** Promotion — into core InferenceX, into a dedicated repo, or into InferenceX-app's database/frontend — is an explicit *later* decision (Milestone 4), made only after the benchmark contract has stabilized on real hardware. + +### What InferenceX already gives us + +InferenceX's existing execution model is almost exactly the control plane CollectiveX needs: + +1. Generate and strictly validate a matrix on a GitHub-hosted runner. +2. Fan jobs out to named or labelled self-hosted runners. +3. Those listeners submit work to Slurm (or launch Docker locally). +4. Normalize outputs. +5. Upload artifacts. +6. Aggregate and dispatch ingestion to the dashboard. + +`e2e-tests.yml` already divides generated configs into job families and invokes reusable single-node and multi-node workflows; `benchmark-tmpl.yml` cleans up resources, checks out the selected ref, **derives the launcher from the runner name**, launches the job, validates outputs, and uploads normalized results. Runner listeners live on cluster login/controller nodes while jobs run on compute nodes via Slurm; runner names/labels are load-bearing — the name prefix selects the launcher and exact names/SKU labels control scheduling. + +CollectiveX reuses all of this, but enters through **CollectiveX-specific launchers** rather than threading fake models through the serving launchers (see Cluster reuse). 
+ +## Architecture + +Four planes, cleanly separated: + +- **Control plane:** scheduling, runners, cleanup, artifact movement, workflow metadata (reused from InferenceX). +- **Benchmark plane:** collective semantics, backend invocation, correctness, timing. +- **Data plane:** canonical result records, raw per-rank samples, topology and provenance. +- **Presentation plane:** comparable subsets, charts, history, diagnostics. + +Data flow within the experimental directory: + +```text +Portable shape definitions + + +Backend definitions + + +Target/cluster definitions + ↓ +CollectiveX matrix resolver + ↓ +Resolved shards + ↓ +Existing InferenceX self-hosted runner + ↓ +experimental/CollectiveX/launchers/* + ↓ +Backend adapter (NCCL / RCCL / DeepEP / AITER / MoRI / …) + ↓ +Versioned result bundle + ↓ +Aggregator + regression checker + ↓ +Static experimental report → (later) InferenceX-app ingestion → Postgres → /collectives +``` + +### Target structure at promotion (Milestone 4) + +This packaged layout is the **promotion target**, not the spike. Milestone 0 uses the light layout in the rollout section below (`run_nccl.py` / `run_deepep.py` / `env_capture.py` / `plot.py` + flat `results/`); the structure here is what CollectiveX grows into *if* it is promoted out of `experimental/`. 
+ +```text +InferenceX/ +├── experimental/ +│ ├── README.md +│ └── CollectiveX/ +│ ├── README.md +│ ├── DESIGN.md +│ ├── ROADMAP.md +│ ├── pyproject.toml +│ ├── Makefile +│ │ +│ ├── src/ +│ │ └── collectivex/ +│ │ ├── __init__.py +│ │ ├── cli.py +│ │ ├── config/ +│ │ │ ├── models.py +│ │ │ ├── loader.py +│ │ │ ├── resolver.py +│ │ │ └── matrix.py +│ │ ├── benchmark/ +│ │ │ ├── harness.py +│ │ │ ├── timing.py +│ │ │ ├── correctness.py +│ │ │ ├── routing.py +│ │ │ └── metrics.py +│ │ ├── backends/ +│ │ │ ├── base.py +│ │ │ ├── fake.py +│ │ │ ├── nccl_tests.py +│ │ │ ├── rccl_tests.py +│ │ │ ├── deepep.py +│ │ │ └── framework_ep.py +│ │ ├── cluster/ +│ │ │ ├── inventory.py +│ │ │ ├── capabilities.py +│ │ │ ├── environment.py +│ │ │ └── launcher.py +│ │ ├── results/ +│ │ │ ├── models.py +│ │ │ ├── writer.py +│ │ │ ├── aggregate.py +│ │ │ ├── compare.py +│ │ │ └── redact.py +│ │ └── report/ +│ │ ├── build.py +│ │ └── templates/ +│ │ +│ ├── configs/ +│ │ ├── suites/ +│ │ │ ├── smoke.yaml +│ │ │ ├── primitives.yaml +│ │ │ ├── moe-decode.yaml +│ │ │ ├── moe-prefill.yaml +│ │ │ └── full.yaml +│ │ ├── shapes/ +│ │ │ ├── synthetic/ +│ │ │ └── traced/ +│ │ ├── backends/ +│ │ ├── targets/ +│ │ └── clusters.yaml +│ │ +│ ├── launchers/ +│ │ ├── common.sh +│ │ ├── launch_b200-dgxc.sh # B200 single node +│ │ ├── launch_b200-dgxc-slurm.sh # B200 multinode +│ │ └── launch_gb200-nv.sh # GB200 NVL72 +│ │ +│ ├── schemas/ +│ │ ├── case-v1.schema.json +│ │ ├── result-v1.schema.json +│ │ ├── manifest-v1.schema.json +│ │ └── environment-v1.schema.json +│ │ +│ ├── scripts/ +│ │ ├── bootstrap.sh +│ │ ├── run_suite.sh +│ │ ├── run_shard.sh +│ │ └── build_report.sh +│ │ +│ ├── tests/ +│ │ ├── fixtures/ +│ │ ├── test_config.py +│ │ ├── test_matrix.py +│ │ ├── test_parsers.py +│ │ ├── test_correctness.py +│ │ └── test_comparability.py +│ │ +│ └── docs/ +│ ├── BENCHMARK_CONTRACT.md +│ ├── BACKEND_ADAPTER.md +│ ├── SHAPE_REGISTRY.md +│ ├── RESULT_FORMAT.md +│ ├── FRONTEND.md +│ └── 
PROMOTION_CRITERIA.md +│ +└── .github/workflows/ + └── collectivex-experimental.yml # Added only when cluster CI begins (Milestone 2) +``` + +> Note: launcher names mirror the real runner-name prefixes. The spike adds the three NVIDIA launchers above; AMD (`launch_mi355x-amds.sh`) and others follow. + +## Benchmark model — keep four concepts separate + +CollectiveX needs its **own** schema. Do **not** reuse or extend the serving matrix, which is built around model / ISL / OSL / framework / TP / EP / concurrency and lives in `utils/matrix_logic/generate_sweep_configs.py`. Representing collectives with fake model names, `ISL=0`, or overloaded concurrency fields would create permanent technical debt. CollectiveX gets its own matrix logic (in the packaged layout, `src/collectivex/config/matrix.py`) — introduced with the workflow at Milestone 2, not the spike — rather than touching `utils/matrix_logic/generate_sweep_configs.py`. + +The model keeps four concepts independent: + +**Shape** — the logical communication workload: + +```text +operation, message size, tokens per rank, hidden size, top-k, +expert count, routing distribution, dtype, phase +``` + +**Backend** — the implementation under test: + +```text +NCCL, RCCL, DeepEP, AITER, MoRI, framework-native EP, reference implementation +``` + +**Target** — where and how it runs: + +```text +runner type, cluster, nodes, GPUs per node, rank placement, +fabric, container image, transport capabilities +``` + +**Suite** — a curated selection of shape × backend × target combinations. Keeping these separate prevents copying the same DeepSeek/MiniMax shape into every NVIDIA and AMD configuration. 
+ +### Portable definitions + +Shape: + +```yaml +schema-version: 1 +shape-id: moe.decode.h7168.top8.e256.t64.uniform.v1 + +kind: moe +phase: decode +operation: dispatch-combine + +shape: + tokens-per-rank: 64 + hidden-size: 7168 + top-k: 8 + num-experts: 256 + dispatch-dtype: fp8 + combine-dtype: bf16 + routing: + distribution: uniform + seed: 67 + expert-alignment: 16 +``` + +Backend: + +```yaml +backend-id: deepep-normal +backend: deepep +mode: normal + +source: + repository: deepseek-ai/DeepEP + ref: pinned-commit + +settings: + async-overlap: false + num-comm-sms: standardized + qp-count: auto +``` + +Target: + +```yaml +target-id: b200-dgxc-4n +runner-type: b200-multinode +cluster-id: b200-dgxc + +resources: + nodes: 4 + gpus-per-node: 8 + exclusive: true + +placement: + ranks-per-node: 8 + rank-order: contiguous + +capabilities: + rdma: true + ibgda: experimental + nvshmem: true +``` + +Suite: + +```yaml +suite-id: moe-decode-smoke + +shapes: + - moe.decode.h7168.top8.e256.t64.uniform.v1 + +backends: + - deepep-normal + - deepep-low-latency + +targets: + - b200-dgxc-2n + +measurement: + warmup-iterations: 20 + measured-iterations: 200 + trials: 3 + correctness: full +``` + +### Case identity + +A **case** is one immutable, versioned point: the natural key composes three of the four concepts (backend, shape, target) — + +```text +case-id = <backend-id>__<shape-id>__<target-id> +e.g. deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n + nccl__allreduce.fp16.logsweep.v1__b200-dgxc-2n +``` + +A shape must never silently change; a newly extracted distribution gets a new versioned `shape-id`. + +**Required shape fields — primitives:** operation; logical element count; datatype; input/output bytes; in-place vs out-of-place; reduction op (where applicable); world size; rank placement; host-driven vs device-driven launch; blocking/synchronization semantics. 
+ +**Required shape fields — MoE (additional):** tokens per rank; hidden size; top-k; number of experts; EP size; dispatch and combine dtypes; routing distribution; expert alignment/padding; capacity constraints; quantization scale representation; cached vs recomputed routing layout; communication-SM count; async-overlap mode. DeepEP shows why these must be first-class — its interface takes tokens/rank, hidden size, top-k, expert count, FP8 mode and comm-SM settings, and exposes async dispatch/combine. + +### Shape registry + +Two independent shape sources: + +**Synthetic** — for continuous curves and hardware characterization (logarithmic byte sweep for primitives; token-count sweep for MoE; EP-scaling sweep; uniform and controlled-skew routing; intranode and internode placements; decode-oriented and prefill-oriented regimes). Don't build every Cartesian combination; define named suites (`primitive-latency-v1`, `primitive-bandwidth-v1`, `moe-decode-v1`, `moe-prefill-v1`, `moe-skew-v1`, `scaleout-v1`). + +**Trace-derived** — extracted from real InferenceX runs/profiles: + +```text +models/deepseek-v4/decode/ +models/minimax-m3/decode/ +models/kimi-k2.7/prefill/ +``` + +Each traced shape retains: source workflow run; model/config; phase; layer/layer-group; observed token histogram; routing skew; concurrent collective count; framework version; extraction-tool version. InferenceX already has a targeted profiling workflow (`profile.yml`) with optional MoE debug output and a separate trace-storage path — a natural source for real shapes rather than only guessed synthetic inputs. 
+ +## Benchmark layers and comparison classes + +| Layer | Purpose | Examples | +|---|---|---| +| **L0 Environment** | Prove the cluster is benchmarkable | topology, NIC/GPU state, peer access, RDMA, IBGDA capability, version capture | +| **L1 Primitive collectives** | Characterize the raw communication substrate | send/recv, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv | +| **L2 MoE communication** | Compare real EP libraries | dispatch, combine, dispatch+combine round trip, normal and low-latency modes | +| **L3 Integrated pipelines** | Communication in realistic operator sequences | route → permute → dispatch → grouped GEMM → combine → unpermute | +| **L4 E2E correlation** | Explain InferenceX serving performance | isolated CollectiveX result linked to the corresponding InferenceX run/profile | + +The MVP concentrates on **L1 and L2**. L3 overlaps OperatorX and comes after the contracts are stable; L4 is the eventual tie-back to serving. + +**L0 — Environment validation** (before measuring anything): GPU count/identity; GPU/NIC topology; CUDA/ROCm version; driver version; NCCL/RCCL version; RDMA device visibility; peer-access matrix; IBGDA/SHMEM capability; container digest; clock/power state; selected network interfaces. A failed probe yields one clear `environment-invalid` result, not dozens of misleading backend failures. + +**L1 — Primitives:** send/receive, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv. Use vendor test programs where possible rather than rewriting primitives. Measure two regions separately: latency (bytes→low KiB) and bandwidth (MiB→GiB). + +**L2 — MoE collectives:** dispatch, combine, dispatch+combine. Dimensions: tokens/rank, hidden size, top-k, expert count, EP size, dispatch dtype, combine dtype, routing skew, normal vs low-latency, comm-SM count, node count. 
+ +### Three comparison classes + +Every result is tagged with exactly one, and they must never be silently mixed on one chart: + +| Class | Meaning | +|---|---| +| `standardized` | Matched logical shape **and** fixed resource budget — same shape, topology, dtype, correctness contract, allowed comm-SMs, and timing boundaries. The main apples-to-apples comparison. | +| `backend-optimized` | Same logical output, but each library uses its recommended comm-SMs / protocols / QP count / buffer sizing / graph capture / tuning. Answers "what is the best each stack can do?" | +| `framework-integrated` | The actual path used by SGLang / vLLM / TensorRT-LLM / Dynamo. Connects to InferenceX; not a pure microbenchmark. | + +### Comparability key + +Every result gets a machine-generated comparison key; rows with different keys are not connected on the same curve by default: + +```text +operation, shape ID, dtype, world size, node count, rank placement, +routing distribution, comparison class, measurement contract version, topology class +``` + +## Measurement and correctness + +### Timing boundaries + +Record separately — never report one latency that sometimes includes JIT and sometimes doesn't: + +```text +1. communicator creation +2. buffer allocation and registration +3. first invocation / JIT +4. warmed steady-state invocation +5. host launch time +6. GPU completion time +7. optional end-to-end framework-visible time +``` + +Per measured iteration: synchronize before starting (unless explicitly testing queued execution); use GPU events for device duration and host monotonic time for API/launch duration; retain per-rank measurements; aggregate only after rank-level data is stored; report the **slowest rank** as well as the average. + +### Correctness as a hard gate + +A result is `valid` only after correctness passes. A fast result that fails correctness stays visible as `invalid` — never silently dropped. 
+ +Primitive checks: deterministic input; expected reduction result; guard regions around buffers; in-place and out-of-place checks; dtype-specific tolerances. + +MoE checks: token conservation; correct expert assignment; correct routing weights; valid permutation metadata; dispatch output vs reference; combine output vs reference; no padded-token leakage; deterministic routing hash. + +Failed results remain in artifacts, e.g.: + +```json +{ + "status": "invalid", + "correctness_passed": false, + "error": "combine result exceeded bf16 tolerance" +} +``` + +### Routing distributions + +At minimum: uniform; single-hot/worst-case concentration; Zipf-like skew; bounded imbalance; replayed real histogram. Store the routing seed and the generated assignment hash. + +### Metrics + +| Category | Metrics | +|---|---| +| Latency | p50, p90, p95, p99, min, max | +| Rank behavior | slowest-rank latency, rank spread, coefficient of variation | +| Primitive throughput | algorithm bandwidth, bus bandwidth, effective bytes/s | +| MoE throughput | tokens/s, logical payload GB/s, dispatch and combine separately | +| Efficiency | bandwidth relative to declared topology bottleneck | +| Host overhead | API launch time, CPU utilization where available | +| GPU overhead | communication SM count, GPU active time, optional power | +| Memory | persistent buffer bytes, peak temporary bytes | +| Overlap | standalone comm, standalone compute, overlapped duration, overlap efficiency | +| Reliability | initialization failures, hangs, retries, correctness failures | +| Provenance | all software, image, driver, firmware and topology identifiers | + +### Bandwidth definitions + +NCCL `algbw`/`busbw` are stored but not treated as universal (NCCL applies operation-specific correction factors). MoE libraries often report **logical bottleneck bandwidth** (may include local-rank traffic or exclude metadata/padding; DeepEP explicitly publishes logical bandwidth). 
Store separate fields, and use `null` rather than a deceptive inference when a backend can't expose physical bytes: + +```text +logical_payload_bytes +allocated_payload_bytes +estimated_link_bytes +metadata_bytes +padding_bytes +``` + +## Result and artifact format + +Each shard emits a versioned bundle: + +```text +output/ +├── manifest.json +├── cases.json +├── results.jsonl +├── rank-samples.jsonl.gz +├── summary.json +├── environment/ +│ ├── gpu.json +│ ├── network.json +│ ├── topology.json +│ └── software.json +├── raw/ +│ ├── stdout.log +│ ├── stderr.log +│ └── backend-output/ +├── commands/ +│ └── reproduce.sh +└── profiles/ +``` + +**Manifest** (invariant run-level metadata): schema version; workflow run + attempt; source SHA/ref; cluster ID; runner; Slurm job ID; node count; topology fingerprint; image digest; backend commit/build; start/end timestamps; redaction version. + +**Result row:** + +```json +{ + "schema_version": 1, + "case_id": "deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n", + "status": "valid", + "trial": 1, + "backend": "deepep", + "mode": "normal", + "comparison_class": "standardized", + "metrics": { + "latency_us_p50": 0, + "latency_us_p99": 0, + "slowest_rank_us_p50": 0, + "logical_bandwidth_gbps": 0, + "tokens_per_second": 0, + "rank_spread_pct": 0, + "persistent_buffer_bytes": 0 + }, + "correctness": { "passed": true, "max_abs_error": 0, "max_rel_error": 0 } +} +``` + +Use an explicit `schema_version` from the beginning — do not repeat the app's historical need to infer schema version from whether a field happens to exist. + +## Backend adapters + +Each adapter implements a small contract: + +```python +class CollectiveBackend: + def probe(self, environment) -> CapabilityReport: ... + def prepare(self, case, workdir) -> PreparedCommand: ... + def run(self, prepared, launcher) -> RawRun: ... + def parse(self, raw_run) -> list[RankSample]: ... + def validate(self, case, raw_run) -> CorrectnessReport: ... 
+ def describe(self) -> BackendProvenance: ... +``` + +**Tier 0 — communication baselines:** NVIDIA `nccl-tests`, ROCm `rccl-tests`, optionally PyTorch distributed as a common-API baseline. Don't rewrite primitives from scratch — `nccl-tests` already supports multi-node, warmups, correctness checking (`-c 1`), per-rank aggregation, device-driven implementations, and separate CPU-time reporting. *(Confirm whether the installed build emits JSON; if not, parse the text table.)* + +**Tier 1 — MoE dispatch/combine:** upstream DeepEP, ROCm DeepEP, and the NVIDIA/AMD EP paths already used by the InferenceX serving stacks. **Version pins are first-class.** Upstream DeepEP V2 changed NVSHMEM→NCCL, unified high-throughput and low-latency APIs, changed buffer behavior, and removed a previous zero-SM LL mode; ROCm's port has different maturity, NIC variants, rocSHMEM dependencies. DeepEP is **built at job setup** (via `rebuild-deepep.sh`, resolved by srt-slurm), not shipped in the image — its build time and `aarch64` (GB200) feasibility are tracked spike risks. A chart labelled only "DeepEP" is therefore ambiguous — store: + +```text +backend name, upstream/fork, git commit, API generation, +transport backend, build flags, runtime library versions, container digest +``` + +**Tier 2 — additional optimized stacks (later):** MSCCL++, AITER comm/fusion paths, MoRI/Pollara, NVSHMEM/rocSHMEM microbenchmarks, framework-native fused collectives. + +## Rollout — spike-first + +**Spike-first.** No schema, Pydantic model, or comparison contract is frozen until one real, correctness-gated number exists on real hardware. The first milestone is a single end-to-end spike on **two NVIDIA topologies, B200 and GB200**, chosen because they exercise the two transport regimes that matter: B200 is an 8-GPU NVLink island with CX-7 InfiniBand between nodes; GB200 is an NVL72 multi-node-NVLink (MNNVL) domain. 
Running the same collective across both is itself the first headline result, and it forces the provenance and comparison-class machinery to be real from line one. The schema is the spike's *output*, extracted from the artifacts it produces — not its input. AMD and all platform work (workflow, DB, frontend) follow. + +### Milestone 0 — NVIDIA B200 + GB200 spike + +One milestone, NVIDIA-only, end to end. This collapses the former "design contract," "CPU framework," "primitive NVIDIA baseline," and the NVIDIA half of "MoE MVP" into a single vertical slice that produces real numbers on real fabric. + +Scaffolding — deliberately light, matching `experimental/` convention (bare scripts + flat JSON + a plot; no package / Pydantic / JSON-schemas yet — those arrive at the contract freeze): + +```text +experimental/CollectiveX/ + README.md + run_nccl.py # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON) + tests/run_ep.py # EP dispatch/combine sweep (DeepEP/MoRI); dispatch & combine timed separately + env_capture.py # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json + plot.py # matplotlib, like token_position_decode_slo/*/plot_*.py + launchers/ + common.sh + launch_b200-dgxc.sh # B200 single node (b200-dgxc runner → 8-GPU NVLink island, x86_64) + launch_b200-dgxc-slurm.sh # B200 multinode (b200-multinode runner → CX-7 IB spine) + launch_gb200-nv.sh # GB200 (gb200 runner → NVL72 MNNVL, aarch64, 4 GPU/node) + results/*.json # flat, hand-verifiable +``` + +Reuse existing patterns rather than reinventing: `experimental/dsv32/bench.py` for `torch.cuda.Event` timing and stdout environment capture, and `experimental/token_position_decode_slo/glm-5/{bmk_*_sbatch.sh,plot_sla_frontier.py}` for Slurm orchestration + plotting. Mirror the runner→launcher routing convention (`bash ./launchers/launch_${RUNNER_NAME%%_*}.sh`) so the runner name selects the CollectiveX launcher as the serving path does. 
+ +**DeepEP is not prebuilt in any image.** The serving recipes build it at job setup via `setup_script: rebuild-deepep.sh` (resolved by srt-slurm; see `benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`). The spike reuses that same rebuild path — on B200 (x86_64) first. Pin images by digest from `.github/configs/nvidia-master.yaml`: B200 `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b`; GB200 `lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc` (an unpinned nightly today — capture its digest before relying on it). + +What it measures: + +```text +Primitives (stock nccl-tests, -c 1 for correctness) — on BOTH B200 and GB200: + all-reduce, all-gather, reduce-scatter, all-to-all + latency regime (bytes→KiB) and bandwidth regime (MiB→GiB) + B200 : 8 GPU/node (x86_64); 1 node (NVLink island) and 2 nodes (cross CX-7 IB) + GB200 : 4 GPU/node (aarch64); 1 node and 2+ nodes — all still inside the NVL72 NVLink (MNNVL) domain + +MoE (DeepEP, normal mode only — LL mode is the known-broken/blocked path, out of scope): + one decode-shaped dispatch+combine: tokens-per-rank=64, hidden=7168, + top-k=8, experts=256, dispatch fp8 + correctness: token conservation + combine vs a reference implementation + B200 (x86_64) first; GB200 DeepEP is a fast-follow once the aarch64 rebuild-deepep path is proven +``` + +The headline is the **same NCCL primitive shape on both topologies**: B200's 2-node path crosses CX-7 InfiniBand, while GB200's stays on NVL72 NVLink (MNNVL). That IB-vs-MNNVL contrast at a matched logical shape is the result worth publishing. (nccl-tests and DeepEP must be built for `aarch64` on GB200 — the reason DeepEP is B200-first.) 
+ +Provenance captured on every row from the first run — non-negotiable even in a spike, because it is what makes the B200-vs-GB200 number defensible: + +```text +topology-class b200-nvlink-island(+cx7-ib) | gb200-nvl72-mnnvl +transport actually used (NVLink / IB / NVSHMEM-IBGDA), derived from flags + measured behavior +transport env set/recorded: + B200 : NCCL_CUMEM_ENABLE=1 + GB200 : NCCL_CUMEM_ENABLE=1, NCCL_MNNVL_ENABLE=1, MC_FORCE_MNNVL=1 + (also seen in serving: NCCL_P2P_LEVEL=NVL, SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK) +comm-SM count, QP count where applicable +backend commit + API generation + build flags +container digest, CUDA / driver / NCCL versions +comparison-class tag (standardized where shape, dtype and SM budget match) +``` + +These flags come from validated GB200 serving recipes (`…/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`); MNNVL is GB200/GB300-only, which is exactly what makes the transport differ from B200. + +Output: a result bundle on disk (`manifest.json`, `results.jsonl`, `environment/`, `raw/`, `commands/reproduce.sh`). Hand-verify the first rows; do not build a generated Pydantic contract yet. + +Exit criteria: + +* real NCCL latency + bandwidth curves on **both** B200 and GB200, correctness-passed (the headline) +* one DeepEP dispatch+combine number (normal mode) on **B200**, correctness-passed; GB200 DeepEP as the immediate fast-follow +* every row carries topology-class, transport, comparison-class and full provenance +* a B200-vs-GB200 side-by-side that the comparison key permits **and labels as topology-class-differing** — that labeled comparison is the intended result, not an accident +* **only now** freeze the schema (`CollectiveCase` / `CollectiveResult` / manifest), extracted from these artifacts + +Explicitly out of scope for the spike: AMD, IBGDA low-latency mode, GitHub Actions, database, frontend, trace-derived shapes, and the fake backend as a deliverable (keep a trivial one only if it speeds offline tests). 
+ +### Milestone 1 — AMD parity + +Bring the AMD side up against the schema the spike froze — not in parallel with it: + +```text +RCCL-tests adapter (mirror the nccl-tests text-table parser) +one AMD launcher (launch_mi355x-amds.sh) +one AMD MoE dispatch/combine backend (DeepEP ROCm / AITER / MoRI) +equivalent shapes + identical result contract +first cross-vendor (NVIDIA vs AMD) comparison +``` + +Record the AMD transport stack (rocSHMEM, MoRI-IO / Pollara, NIC variant) with the same provenance rigor the spike established. An unlabeled "DeepEP" row compared across vendors is meaningless. + +### Milestone 2 — GitHub workflow + +Add (orchestration only; see GitHub workflow design below): + +```text +collectivex-experimental.yml +preflight +canary +matrix sharding +artifact collection +regression comparison +static report artifact +``` + +Do not connect it to `perf-changelog.yaml`. + +### Milestone 3 — Trace-derived shapes + +Extract representative shapes from InferenceX profiles (DeepSeek V4, MiniMax M3, Kimi). Every traced shape must retain: source workflow run; source configuration; framework version; model phase; extraction-tool version; routing-histogram hash. + +### Milestone 4 — Promotion decision + +Only then decide whether to: keep CollectiveX permanently experimental; move it into core InferenceX; extract it into a dedicated repository; or integrate its data into InferenceX-app (database + `/collectives` frontend). + +### First PRs (the spike) + +The spike lands as a few small PRs, each producing something runnable — not a docs-and-schema PR: + +```text +1. Scaffold + NCCL on B200 single node + run_nccl.py (text-table parser), env_capture.py, plot.py, + launchers/launch_b200-dgxc.sh, results/*.json + → lands when it emits a real all-reduce curve with provenance from an 8-GPU B200 + +2. 
B200 multinode + GB200 + launchers/launch_b200-dgxc-slurm.sh, launchers/launch_gb200-nv.sh + → lands when the same primitive runs on 2-node B200 (cross-IB) and on GB200 NVL72 (MNNVL), + each tagged with topology-class and transport (aarch64 build for GB200) + +3. DeepEP dispatch+combine — B200 first + tests/ep_deepep.py, routing generator + reference combine for correctness, + reusing rebuild-deepep at job setup + → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow + +4. Freeze the contract + extract the case / result / manifest schema from the bundles produced in 1–3; + add fixtures captured from real output — this is where the packaged structure begins +``` + +The first objective is a real, provenance-tagged, correctness-gated number on two NVIDIA topologies — the contract is the spike's output, not its foundation. + +## Cluster reuse and capability inventory + +### What to reuse + +Existing self-hosted runner registrations; exact runner labels; Slurm access from runner hosts; checkout and artifact patterns; resource-cleanup strategy; repository secrets; container caches where appropriate. The runner inventory (`.github/configs/runners.yaml`) already enumerates H100, H200, B200, B300, GB200, GB300, MI300X, MI325X, MI355X fleets and groups such as `h200-multinode`, `b200-multinode`, individual nodes, etc. CollectiveX **reads** this file rather than duplicating runner names. + +### What not to reuse directly + +Do not call the serving launchers (`runners/launch_${RUNNER_NAME%%_*}.sh`) — they carry model-serving assumptions (model paths, framework setup, result naming). Mirror the **selection convention** with CollectiveX launchers instead: + +```bash +bash experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh +``` + +Each CollectiveX launcher handles only: Slurm allocation; container image; mounts; network environment; rank launch; result copy-back; cleanup. 
There are **two launch paths**, mirroring the serving side: **single-node** B200 mirrors the `salloc … --gres=gpu:N --exclusive … && srun --container-image=<image>` pattern in `runners/launch_b200-dgxc.sh`; **multi-node** B200/GB200 drives **srt-slurm** (`srtctl apply -f <recipe>`), which already knows how to rebuild DeepEP and set the MNNVL env — so the CollectiveX GB200 launcher is a thin wrapper handing srt-slurm a CollectiveX recipe, not a from-scratch sbatch. (Later, common Slurm/container functions can be factored into a shared lib used by both systems.) + +> Runner-name subtlety to handle in `inventory.py`: one physical cluster can appear under multiple prefixes — `b200-dgxc_NN` routes to `launch_b200-dgxc.sh` (single-node) while `b200-dgxc-slurm_N` (label `b200-multinode`) routes to `launch_b200-dgxc-slurm.sh`. One fabric domain can therefore span several runner labels. + +### Capability overlay + +`inventory.py` loads `../../../.github/configs/runners.yaml` and combines it with a CollectiveX capability overlay — one source of truth for runner names, CollectiveX metadata kept isolated: + +```yaml +b200-multinode: + launcher: b200-dgxc-slurm + vendor: nvidia + hardware: b200 + topology-class: b200-nvlink-cx7 + fabric-domain: b200-dgxc-main + gpus-per-node: 8 + arch: x86_64 + max-nodes: 16 + scheduler: slurm + container-runtime: enroot-pyxis + capabilities: + nccl: true + deepep: true # built at job setup via rebuild-deepep, not prebuilt + rdma: true + nvshmem: true + ibgda: experimental # capability present ≠ currently validated + scheduling: + exclusive-nodes: true + max-parallel-shards: 1 + +gb200: + launcher: gb200-nv + vendor: nvidia + hardware: gb200 + topology-class: gb200-nvl72-mnnvl + gpus-per-node: 4 # NVL72 compute tray + arch: aarch64 # nccl-tests + DeepEP must build for aarch64 + scheduler: srt-slurm + transport-env: { NCCL_CUMEM_ENABLE: 1, NCCL_MNNVL_ENABLE: 1, MC_FORCE_MNNVL: 1 } + capabilities: + nccl: true + deepep: true # rebuilt at setup; aarch64 path is 
a tracked risk + mnnvl: true # GB200/GB300 only + ibgda: experimental +``` + +`fabric-domain` is essential: two jobs on separate compute nodes may still contend for the same leaf/spine network, so **GitHub concurrency is keyed by fabric domain, not GPU SKU**. The inventory distinguishes hardware capability, software currently installed, and feature state (known-good vs experimental vs temporarily broken) — IBGDA support and "IBGDA low-latency currently validated" are different properties. + +**Operational coexistence with the serving sweep.** `b200-multinode` is only three runners (`b200-dgxc-slurm_7/8/9`), **shared with the production serving sweeps**, and srt-slurm allocations are long. Exclusive nodes + `max-parallel-shards: 1` + fabric-domain serialization means CollectiveX and the serving sweep contend for the same scarce runners. Decide the scheduling/coexistence policy (off-hours windows? a dedicated runner?) before enabling any recurring CollectiveX suite, rather than discovering the contention in CI. + +## GitHub workflow design (Milestone 2) + +When cluster CI begins, add one small orchestration-only file — `.github/workflows/collectivex-experimental.yml` — with no benchmarking logic: + +```text +validate → resolve matrix → preflight canaries → benchmark shards +→ aggregate → compare against baseline → build static report → upload artifacts +``` + +Triggers while on the branch: + +```yaml +on: + push: + branches: [ collectivex ] + paths: + - experimental/CollectiveX/** + - .github/workflows/collectivex-experimental.yml + pull_request: + paths: + - experimental/CollectiveX/** + - .github/workflows/collectivex-experimental.yml +``` + +Later, after a minimal dispatcher exists on `main`, add `workflow_dispatch` with inputs: `ref, suite, target, backend, shape, profile` (and comparison class / normal-LL-both / dry-run). + +Jobs: + +1. 
**Validate** — install the package; validate all suite/shape/backend/cluster YAML; confirm runner references exist in `runners.yaml`; reject unknown fields; emit the resolved run plan as an artifact. (Match InferenceX's strict Pydantic practice — models reject extra fields.) +2. **Compile and shard** — **do not** generate one job per benchmark point. Group cases by `cluster, node count, GPU placement, container image, backend build, transport mode, fabric domain, profiler requirement`. A shard runs many compatible points under one Slurm allocation (avoids thousands of matrix jobs, repeated communicator init, queue latency, repeated container import). Bounded runtime; record per-case failures unless the cluster itself is unhealthy. +3. **Preflight** — confirm GPU count; validate peer access; enumerate NICs; test RDMA/device visibility; verify backend libraries; run a tiny correctness case; capture topology/software. A failed preflight marks the whole shard `environment-invalid` rather than manufacturing dozens of backend failures. +4. **Canary** — for each `(cluster, backend, mode)` group, run one small representative case; launch the larger matrix only after it passes (mirrors InferenceX's canary-before-full-sweep). +5. **Benchmark** (`collectivex-benchmark-tmpl.yml`) — run on the resolved runner label; unique Slurm job name from workflow/attempt/shard; exclusive nodes; serialize/limit by `fabric-domain`; call the CollectiveX launcher; upload results even on partial failure; always upload environment+logs; fail the job only after artifact creation. +6. **Aggregate and regress** — validate every result against JSON schema; reject duplicate natural keys; merge rank samples and summaries; compute trial aggregates; compare against the most recent compatible baseline; publish a step summary; upload one `results_collectivex` bundle. +7. 
**Dispatch ingestion** (only once promoted to feed the app) — repository-dispatch the InferenceX-app repo with `{ "benchmark-family": "collectivex", "run-id": "...", "run-attempt": "..." }`. + +Use a separate `collectivex-changelog.yaml`: a CollectiveX backend change must not trigger the expensive serving sweep through `perf-changelog.yaml`, and a serving change must not launch every collective suite. + +## Regression policy (Milestone 2+) + +A compatible baseline requires exact matches on: case ID; cluster ID; topology fingerprint (or approved topology class); backend; comparison class; normal/LL mode; node and rank placement; dtype and shape; measurement-contract version. **Do not compare "same GPU SKU" across materially different fabrics.** + +```text +regression if: + correctness changed pass → fail + OR median latency degradation exceeds max(fixed floor, cluster noise threshold) + OR bandwidth degradation exceeds max(fixed floor, cluster noise threshold) +``` + +Derive each cluster's noise threshold from repeated baseline measurements via median absolute deviation — don't hard-code a universal 3% before knowing each fabric's noise. Retain failed, timed-out, and invalid results; reliability is part of the benchmark. + +## Reporting, database, and frontend + +**Now (spike / Milestone 2): a static, artifact-driven report.** Do not begin by changing InferenceX-app. 
+ +```bash +python -m collectivex.report --results output/aggregate.json --output output/report/ +``` + +```text +report/ +├── index.html +├── data.json +├── assets/ +└── runs/ + └── <run-id>.html +``` + +Report views: **Overview** (supported clusters/backends, latest run, correctness failures, recent regressions, coverage matrix); **Primitive explorer** (latency / algbw / busbw / rank-spread vs payload size; single-node vs multinode); **MoE explorer** (dispatch & combine latency vs tokens/rank; tokens/s vs EP size; uniform vs skewed; normal vs LL; comm-SMs vs performance); **Case details** (exact shape, backend commit, container digest, topology fingerprint, environment, command, correctness report, rank-level distribution, raw logs). A **comparison warning** must visibly reject invalid comparisons: + +```text +Not directly comparable: +- different routing distribution +- different topology class +- different communication-SM budget +- standardized versus backend-optimized mode +``` + +**Later (Milestone 4 / promotion into InferenceX-app):** add `/collectives` to the app (Next.js, React Query, raw API rows, client-side transforms, D3 charts; tab metadata/routing are centralized). Avoid a single global "CollectiveX score" at launch. Port the report views, plus Library Comparison, Scale-and-topology, and Historical-regression views, and a run-detail drawer. The frontend computes the `comparison-key` and refuses to connect rows with differing keys by default — **this guard matters more than any individual chart.** + +API routes (app): + +```text +/api/v1/collectives +/api/v1/collectives/availability +/api/v1/collectives/history +/api/v1/collectives/runs/:id +/api/v1/collectives/artifacts/:id +``` + +Continue the app convention: API returns raw DB rows; the frontend does chart-specific transforms. + +**Database (app, later).** Do not put CollectiveX rows in `benchmark_results` (its identity is serving configs + ISL/OSL/concurrency). 
Reuse `workflow_runs`, then add: + +```sql +collective_workloads(id, case_id, schema_version, family, operation, shape jsonb) +collective_environments(id, cluster_id, hardware, topology_class, topology_hash, software jsonb, capabilities jsonb) +collective_configs(id, workload_id, environment_id, backend, backend_version, comparison_class, mode, nodes, gpus_per_node, world_size, settings jsonb) +collective_results(id, workflow_run_id, config_id, trial, date, status, metrics jsonb, + latency_p50_us, latency_p99_us, logical_bandwidth_gbps, bus_bandwidth_gbps, + tokens_per_second, rank_skew_pct, error) +collective_artifacts(result_id, artifact_type, storage_url, metadata jsonb) +collective_availability(date, hardware, cluster_id, backend, family, operation, mode) +``` + +Follow the app's hybrid design (JSONB for evolving metrics; indexed "hot" columns for common filters; idempotent ingestion; natural unique keys; denormalized date; latest-results materialized view). Keep raw per-rank samples in artifacts/object storage, not in Postgres. + +## Future expansions + +The spike de-risks the path to the actual deliverable — a public OSS collective benchmark and an explainer article. Expansion axes, roughly near → far, with dependencies: + +**Hardware breadth.** B300 / GB300 next (GB300 is also MNNVL, with known disagg KV-transfer wins) → H100 / H200 as a cheaper, more-available **InfiniBand baseline** ideal for characterizing per-fabric noise → AMD MI300X / MI325X / MI355X (this is Milestone 1) → TPU (far; a separate stack and toolchain). + +**Backend breadth.** Framework-native EP (the `framework-integrated` class — ties numbers back to the SGLang/vLLM serving paths) → MSCCL++, NVSHMEM / rocSHMEM microbenchmarks, AITER comm/fusion, MoRI / Pollara (AMD). + +**IBGDA low-latency mode.** The recurring strategic blocker and the original "LL is broken" story; gated on the NVIDIA SRE maintenance window for B200/B300. 
Highest narrative value — add as an experimental suite the moment it unblocks. + +**Scale-out.** 2 → 4 → 8 → 16 nodes; on GB200, intra-NVL72 vs cross-rack scaling-efficiency curves (where MNNVL ends and the inter-rack fabric begins). + +**L3 integrated operator path.** route → permute → dispatch → grouped-GEMM → combine → unpermute — the bridge to OperatorX. + +**L4 e2e correlation.** Link an isolated dispatch/combine number to the same shape's cost inside a real serving run via `profile.yml` traces — the "explain serving performance" payoff and the tie-back to the core product. + +**Trace-derived shapes (Milestone 3).** DeepSeek V4 / MiniMax M3 / Kimi token-histogram and routing-skew extraction, so the synthetic shapes are anchored to real workloads. + +**AMD Ultra Ethernet (UEC).** The AMD networking path; pairs with the MoRI / Pollara backends. + +**Productization (north star).** Static report → public OSS benchmark site + the explainer article; promotion into InferenceX-app (`/collectives` + Postgres + nightly suite + regression alerts) at Milestone 2 / 4. + +## Continuous benchmark — vision & scope + +Goal: a continuous benchmark that reproduces the spike automatically and grows into a credible cross-vendor EP/collective comparison. **Start with balanced DeepSeek shapes, intranode EP**, then venture to advanced cases. Target **≥1 EP library per platform** first — DeepEP on NVIDIA, MoRI on AMD. 
+ +### EP library landscape +- MoRI (AMD) — https://github.com/ROCm/mori +- DeepEP / DeepEPv2 / Hybrid-EP — https://github.com/deepseek-ai/DeepEP (hybrid: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) +- NVIDIA NCCL EP — https://github.com/NVIDIA/nccl/tree/master/contrib/nccl_ep +- UCCL — https://github.com/uccl-project/uccl +- NVLink One-Sided AllToAll EP (mainly NVL72) — TensorRT-LLM blog18 (Optimizing MoE Communication with One-Sided AllToAll over NVLink) +- NIXL EP — https://github.com/ai-dynamo/nixl/tree/main/examples/device/ep + +### Shapes & axes +- **Classic DeepSeek V3:** hidden 7168, top-8, 256 routable experts. +- **Prefill vs decode** (# tokens). +- **Normal EP vs low-latency (LL) EP.** +- **Dispatch precision:** NVFP4, MXFP4, MXFP8, BF16. +- **Combine precision:** MXFP8, direct-cast FP8, BF16, NVFP4 — see MoRI #311, flashinfer #3643 / #3376. +- **Balanced vs unbalanced vs EPLB.** +- **Realistic shapes from InferenceX models** — collect hidden sizes / routing (Qwen3.5 has an unusual top-k). + +### Other inference collectives (later) +- KV-cache transfer: MoRI-IO, NIXL, Mooncake; CPU↔GPU offload — `experimental/kvcache_transfer_DtoH_HtoD/benchmark.py`. +- Low-latency one-shot / two-shot all-reduce (SGLang & vLLM in-tree kernels + AITER / FlashInfer variants) — e.g. sglang `sgl-kernel/csrc/allreduce/quick_all_reduce.cuh`. + +### Reference benchmark scripts to draw from +- flashinfer PR #3000; ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`. + +### Learning resources +- arXiv 2511.15076, 2603.13606, 2512.19849, 2412.19437. + +## Things not to do + +* Do not add collective fields to the existing serving matrix. +* Do not make one GitHub Actions job per payload size. +* Do not call all logical-bandwidth figures "bus bandwidth." +* Do not compare different topology fingerprints as though GPU SKU were sufficient. +* Do not silently discard failed or incorrect results. 
+* Do not let a backend choose undocumented tuning parameters (in `standardized` mode). +* Do not make low-latency mode the only reported result. +* Do not publish one overall ranking before coverage and comparison contracts are stable. +* Do not start with every EP library, TPU, UEC, and every model shape. +* Do not store full raw rank samples indefinitely in Postgres. +* Do not expose internal hostnames, paths, NIC GUIDs, IP addresses, or private image references in public artifacts. +* Do not freeze the schema before the spike has produced a real artifact to freeze it from. + +## References (verified against the live InferenceX repo) + +- `experimental/README.md` — the non-core / "not official results" charter this project lives under. +- `.github/configs/runners.yaml` — runner labels and exact names (H100…GB300, AMD MI3xx). +- `.github/workflows/benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`, `profile.yml`, `speedbench-al.yml` — the `bash ./runners/launch_${RUNNER_NAME%%_*}.sh` selection convention. +- `runners/launch_*.sh` — existing per-cluster launchers (`launch_b200-dgxc.sh`, `launch_b200-dgxc-slurm.sh`, `launch_gb200-nv.sh`, `launch_mi355x-amds.sh`, …). +- `utils/matrix_logic/generate_sweep_configs.py`, `validation.py` — the serving matrix CollectiveX must **not** extend. +- `.github/workflows/e2e-tests.yml`, `collect-results.yml` — the validate → fan-out → collect control plane being reused. +- `perf-changelog.yaml` — the additions-only serving gate CollectiveX must **not** trigger. 
+- NVIDIA Magnum IO NVSHMEM + GPUDirect Async (IBGDA): `https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/` diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py new file mode 100644 index 000000000..c24136ebc --- /dev/null +++ b/experimental/CollectiveX/plot.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — plot NCCL primitive curves, B200 vs GB200. + +Loads run_nccl.py result JSONs from results/, and for each operation draws two +panels: latency-vs-size and bus-bandwidth-vs-size, overlaying one curve per +(runner, topology-class, world-size). The B200(IB)-vs-GB200(MNNVL) contrast at +a matched shape is the intended overlay and the spike's headline. + +Comparison guard (plan §Comparability): curves are only overlaid when they +share op + dtype + comparison-class + measurement-contract. Anything else is +reported as "not directly comparable" and skipped rather than silently mixed. + + python plot.py --results-dir results --out-dir results/plots + +matplotlib + (optional) numpy. Run on a workstation/laptop over the JSON +artifacts; no GPU needed. 
def _human(nbytes: int) -> str:
    """Format a byte count with a binary unit suffix (B, KiB, MiB, GiB).

    The value is divided by 1024 until it drops below 1024 or reaches GiB,
    then rendered with no decimals (e.g. ``1536`` -> ``"2KiB"``).
    """
    value = float(nbytes)
    for unit in ("B", "KiB", "MiB"):
        if value < 1024:
            return f"{value:.0f}{unit}"
        value /= 1024
    # GiB is the largest unit: anything that got this far is reported in GiB.
    return f"{value:.0f}GiB"


def load_nccl_results(results_dir: str) -> list[dict]:
    """Load every usable NCCL result document found directly in *results_dir*.

    A file is kept only when it parses as a JSON object whose ``family`` is
    ``"nccl"`` and whose ``rows`` is non-empty. Unreadable, undecodable or
    malformed files, and JSON whose top level is not an object, are skipped
    instead of aborting the whole plotting run. Each kept document gets a
    ``_path`` entry holding the file it was read from.

    Returns the kept documents in sorted-path order.
    """
    docs = []
    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
        try:
            with open(path, encoding="utf-8") as fh:
                doc = json.load(fh)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError):
            continue
        # A top-level array/scalar has no .get(); treat it as "not a result".
        if not isinstance(doc, dict):
            continue
        if doc.get("family") == "nccl" and doc.get("rows"):
            doc["_path"] = path
            docs.append(doc)
    return docs


def curve_label(d: dict) -> str:
    """Return the legend label for one result: runner, topology class, world size."""
    return f"{d['runner']} · {d['topology_class']} · ws{d['world_size']}"


def overlay_signature(d: dict) -> tuple:
    """Fields that must match for two curves to share a chart (topology and
    world-size are deliberately NOT here — they are the comparison axis)."""
    return (d["op"], d.get("dtype"), d.get("comparison_class"), d.get("measurement_contract"))


def _partition_by_signature(docs: list[dict]) -> tuple:
    """Group *docs* by overlay signature and pick the dominant group.

    Returns ``(main_sig, kept, groups)`` where ``groups`` maps each signature
    to its documents, ``main_sig`` is the signature with the most documents
    (first seen wins a tie) and ``kept`` is that group's document list.
    *docs* must be non-empty.
    """
    groups = defaultdict(list)
    for d in docs:
        groups[overlay_signature(d)].append(d)
    main_sig = max(groups, key=lambda s: len(groups[s]))
    return main_sig, groups[main_sig], groups


def plot_op(op: str, docs: list[dict], out_dir: str) -> "str | None":
    """Draw the latency and bus-bandwidth panels for one operation.

    Only the documents sharing the dominant overlay signature are plotted;
    every other document is reported on stdout as not directly comparable and
    left out. The figure is written to ``<out_dir>/nccl_<op>.png`` (the
    directory is created if needed).

    Returns the output path, or ``None`` when *docs* is empty.
    """
    if not docs:
        return None
    # Comparison guard: keep the dominant signature, warn on the rest.
    main_sig, keep, sigs = _partition_by_signature(docs)
    for sig, ds in sigs.items():
        if sig == main_sig:
            continue
        for d in ds:
            print(f"  [guard] skipping {curve_label(d)} for op={op}: not directly "
                  f"comparable (dtype/class/contract differs: {sig} vs {main_sig})")

    fig, (ax_lat, ax_bw) = plt.subplots(1, 2, figsize=(14, 5))
    for d in sorted(keep, key=curve_label):
        rows = sorted(d["rows"], key=lambda r: r["size_bytes"])
        sizes = [r["size_bytes"] for r in rows]
        lat = [r["out_of_place"]["time_us"] for r in rows]
        bw = [r["busbw_gbps"] for r in rows]
        label = curve_label(d)
        ax_lat.plot(sizes, lat, "o-", linewidth=2, markersize=4, label=label)
        ax_bw.plot(sizes, bw, "o-", linewidth=2, markersize=4, label=label)

    for ax in (ax_lat, ax_bw):
        ax.set_xscale("log", base=2)
        ax.set_xlabel("Message size (bytes)")
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=9)
    ax_lat.set_yscale("log")
    ax_lat.set_ylabel("Latency (µs, out-of-place)")
    ax_lat.set_title(f"{op}: latency vs size")
    ax_bw.set_ylabel("Bus bandwidth (GB/s)")
    ax_bw.set_title(f"{op}: bus bandwidth vs size")
    fig.suptitle(
        f"CollectiveX · {op} · dtype={main_sig[1]} · class={main_sig[2]} "
        f"(topology is the comparison axis)",
        fontsize=11,
    )
    fig.tight_layout()
    os.makedirs(out_dir, exist_ok=True)
    out = os.path.join(out_dir, f"nccl_{op}.png")
    fig.savefig(out, dpi=150, bbox_inches="tight")
    plt.close(fig)
    return out


def main() -> int:
    """Command-line entry point: plot every op (or just ``--op``) found in ``--results-dir``.

    Returns 0 when at least one figure was written, 1 when no NCCL results
    were found or nothing could be plotted.
    """
    ap = argparse.ArgumentParser(description="CollectiveX primitive plots")
    ap.add_argument("--results-dir", default="results")
    ap.add_argument("--out-dir", default="results/plots")
    ap.add_argument("--op", help="only plot this op")
    args = ap.parse_args()

    docs = load_nccl_results(args.results_dir)
    if not docs:
        print(f"no nccl result JSONs found in {args.results_dir}/")
        return 1

    by_op = defaultdict(list)
    for d in docs:
        by_op[d["op"]].append(d)

    ops = [args.op] if args.op else sorted(by_op)
    made = []
    for op in ops:
        op_docs = by_op.get(op, [])
        out = plot_op(op, op_docs, args.out_dir)
        if out:
            made.append(out)
            # Count only the curves that were actually drawn, not the ones
            # the comparison guard skipped.
            _, kept, _ = _partition_by_signature(op_docs)
            print(f"wrote {out} ({len(kept)} curve(s))")
    if not made:
        print("nothing plotted")
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
# Base color per SKU; used as a provisional series color before the per-config
# shade is assigned at the end of load_series().
COLORS = {"b200": "#1f77b4", "gb200": "#2ca02c", "mi355x": "#d62728",
          "b300": "#9467bd", "gb300": "#8c564b", "h100": "#ff7f0e", "h200": "#e377c2"}

# Per-SKU color FAMILIES: every (sku,backend,dtype,mode,resource) config gets its own
# shade within its SKU's hue family, so lines are individually identifiable AND the SKU
# is still readable at a glance (SKU-only coloring collided same-SKU configs into one).
SKU_FAMILY = {
    "h100":   ["#ff7f0e", "#d6a72b", "#ffbb78", "#8c6d1f", "#e8a33d"],   # oranges / golds
    "h200":   ["#e377c2", "#b04a8f", "#f4b6df"],                         # pinks
    "b200":   ["#1f77b4", "#0d3d66", "#4a90d9", "#7fb2e0"],              # blues
    "b300":   ["#9467bd", "#6b3fa0", "#c5b0d5", "#7b4fa0"],              # purples
    "gb200":  ["#2ca02c", "#1a661a", "#7bc77b"],                         # greens
    "gb300":  ["#8c564b", "#5e372f", "#c49c94"],                         # browns
    "mi355x": ["#d62728", "#a30000", "#ff9896", "#e34a4a"],              # reds
}
PALETTE = ["#17becf", "#bcbd22", "#7f7f7f", "#393b79", "#637939"]  # fallback for unknown SKUs


def _percentiles(row: dict, key: str, flat: str) -> dict:
    """Return the ``{p50, p90, p95, p99}`` latency dict for one operation of *row*.

    v4 rows carry a nested dict under *key*; it is copied and any missing (or
    null) p90/p95/p99 is back-filled the same way the legacy branch does, so
    downstream consumers can rely on all four keys. v3 rows carry flat
    ``<flat>_us_p*`` fields instead; p50 may be ``None`` when the row has no
    measurement for this operation.
    """
    nested = row.get(key)
    if isinstance(nested, dict) and nested.get("p50") is not None:
        out = dict(nested)
        p50 = out["p50"]
        if out.get("p90") is None:
            out["p90"] = p50
        if out.get("p95") is None:
            out["p95"] = out["p90"]
        if out.get("p99") is None:
            out["p99"] = p50
        return out
    p50 = row.get(f"{flat}_us_p50")
    p90 = row.get(f"{flat}_us_p90")
    return {"p50": p50, "p90": p90 or p50,
            "p95": row.get(f"{flat}_us_p95") or p90 or p50,
            "p99": row.get(f"{flat}_us_p99") or p50}


def _plot_point(row: dict) -> "dict | None":
    """Convert one result row into the compact point dict embedded in the page.

    Returns ``None`` when the row lacks a dispatch or combine p50 (nothing to
    plot). A row without a measured round trip falls back to the isolated sum.
    """
    dop = _percentiles(row, "dispatch", "dispatch")
    cop = _percentiles(row, "combine", "combine")
    iso = _percentiles(row, "isolated_sum", "serial")   # renamed from "serial"
    rtp = _percentiles(row, "roundtrip", "roundtrip")   # MEASURED round trip (v4)
    if not (dop["p50"] and cop["p50"]):
        return None
    if rtp["p50"] is None:                              # legacy: no measured RT
        rtp = iso
    return {
        "t": row["tokens_per_rank"], "gt": row.get("global_tokens"),
        "dispatch": dop, "combine": cop, "roundtrip": rtp, "isolated_sum": iso,
        "fanout": row.get("fanout_mean"),
        "dbytes": row.get("dispatch_logical_bytes") or row.get("routed_bytes_total") or 0,
        "cbytes": row.get("combine_logical_bytes") or 0,
        "recv": row.get("recv_tokens_max") or row.get("recv_tokens") or 0,
        "straggler": (row.get("per_rank_dispatch_us") or {}).get("slowest_rank"),
        "correct": bool(row.get("correct")),
    }


def load_series(results_dir: str, legacy: str = "all") -> list[dict]:
    """Load every ``family == "moe"`` result under *results_dir* as a plot series.

    The directory is searched recursively for ``*.json``. Files that cannot be
    read, decoded or parsed, JSON whose top level is not an object, documents
    of another family, and documents with no plottable row are skipped.

    *legacy* selects on the presence of ``publication_status`` in a document:
    ``"exclude"`` drops documents without it, ``"only"`` keeps just those, and
    any other value (default ``"all"``) keeps both.

    Each returned series describes one result document: identity fields
    (sku, backend, ep, phase, mode, dtype, routing, ...), a display ``label``,
    a ``ckey`` config identity, provenance fields and the list of points in
    ``rows``. Series sharing a ``ckey`` share a ``color``, drawn from the SKU's
    family in ``SKU_FAMILY`` or from ``PALETTE`` for an unknown SKU.
    """
    series = []
    pattern = os.path.join(results_dir, "**", "*.json")
    for path in sorted(glob.glob(pattern, recursive=True)):
        try:
            # Context manager so the handle is closed even on a parse error.
            with open(path, encoding="utf-8") as fh:
                d = json.load(fh)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError):
            continue
        if not isinstance(d, dict):
            continue
        if d.get("family") != "moe" or not d.get("rows"):
            continue
        # legacy = a v3 doc with no machine-derived publication_status. exclude -> v4-only main
        # plot; only -> the legacy.html archive.
        is_legacy = "publication_status" not in d
        if (legacy == "exclude" and is_legacy) or (legacy == "only" and not is_legacy):
            continue
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        rows = [p for p in (_plot_point(r) for r in d["rows"]) if p is not None]
        if not rows:
            continue
        # `or {}` (not a .get default) so an explicit JSON null is tolerated too.
        sh = d.get("shape") or {}
        mode = d.get("mode", "normal")
        dtype = sh.get("dispatch_dtype", "?")
        rmode = d.get("resource_mode", "")
        ll = " LL" if mode == "ll" else ""
        # resource suffix: tuned is the default (omit); flag the others so a normalized
        # or default-budget line is never confused with the tuned one.
        rs = {"normalized": " (norm)", "default": " (def)"}.get(rmode, "")
        contract = d.get("measurement_contract", "?")
        cl = " [cl]" if contract == "cached-layout-comm-only-v1" else ""   # cached-layout flag
        backend = d.get("backend")
        ep = d.get("ep_size")
        # Routing axis: base distribution + EPLB. "zipf+eplb" is the balanced-by-replication
        # variant of zipf; uniform is the baseline (omitted from the label to keep it short).
        eplb_doc = d.get("eplb") or {}
        base_routing = sh.get("routing", "?")
        routing_disp = f"{base_routing}+eplb" if eplb_doc.get("enabled") else base_routing
        rt = "" if routing_disp == "uniform" else f" ·{routing_disp}"
        # FULL per-line label: SKU·EP·backend·dtype[·LL][·resource][·cached-layout][·routing].
        # EP and routing are explicit so different EP degrees / routings of one SKU
        # never collapse into the same label.
        label = f"{sku.upper()} EP{ep} · {backend} · {dtype}{ll}{rs}{cl}{rt}"
        repro = d.get("reproduction") or {}
        gr = repro.get("git_run") or {}
        rid = d.get("routing_identity") or {}
        wl = d.get("workload") or {}
        # publication status (v4) gates the default view; legacy v3 docs -> "legacy".
        pub = d.get("publication_status") or "legacy"
        # workload signature: prefer the v4 workload block, fall back to routing_identity (v3).
        wsig = wl.get("trace_signature") or rid.get("trace_signature")
        series.append({
            "sku": sku, "backend": backend, "ep": ep,
            "pub": pub, "wsig": wsig, "wid": wl.get("workload_id"),
            "phase": d.get("phase", "decode"), "mode": mode,
            "dtype": dtype, "resource": rmode or "tuned", "contract": contract,
            # comparison class: best-stack (tuned/default) vs resource-constrained
            # (normalized) — kept distinct so they're never read as one fair contest.
            "suite": "resource-constrained" if rmode == "normalized" else "backend-default",
            "routing": routing_disp,
            "eplb_before": eplb_doc.get("imbalance_before"),
            "eplb_after": eplb_doc.get("imbalance_after"),
            # ep + routing in the key so EP4/EP8 and uniform/balanced/zipf/zipf+eplb of one SKU
            # get distinct colors/lines (sku stays ckey.split("|")[0] for the family lookup).
            "ckey": f"{sku}|{backend}|{dtype}|{mode}|{rmode}|{contract}|ep{ep}|{routing_disp}",
            "label": label,
            "dash": "" if dtype == "bf16" else "6 4",    # bf16 solid, fp8 dashed (2nd cue)
            "color": COLORS.get(sku, "#555"),           # provisional; reassigned below
            "topo": d.get("topology_class"), "transport": d.get("transport"),
            "fp8_in_timing": repro.get("fp8_quant_in_timing"),
            "run_id": gr.get("run_id"), "source_sha": (gr.get("source_sha") or "")[:10],
            "repo": gr.get("repo"), "image_digest": (repro.get("image_digest") or "")[:19],
            "routing_consistent": rid.get("consistent_across_ranks"),
            "trace_sig": rid.get("trace_signature"),
            "samples": d["rows"][0].get("samples_pooled") or None,
            "prov": d.get("backend_provenance") or {},
            "shape": sh, "rows": rows,
        })
    # Raw series are never modified after loading: each phase plots only its own
    # measured points, and no observation is copied between series.

    # Assign a DISTINCT color per config key, grouped by SKU family (stable across the
    # decode/prefill panels so a line keeps its color everywhere).
    by_sku: dict[str, list[str]] = {}
    for ck in sorted({s["ckey"] for s in series}):
        by_sku.setdefault(ck.split("|")[0], []).append(ck)
    ckcolor: dict[str, str] = {}
    fallback_idx = 0
    for sku, cks in by_sku.items():
        fam = SKU_FAMILY.get(sku)
        for j, ck in enumerate(cks):
            if fam:
                ckcolor[ck] = fam[j % len(fam)]
            else:
                ckcolor[ck] = PALETTE[fallback_idx % len(PALETTE)]
                fallback_idx += 1
    for s in series:
        s["color"] = ckcolor[s["ckey"]]
    return series
+

CollectiveX — EP dispatch / combine

+

+""" + +TAIL = "
" + +JS = r""" +const SKUS = [...new Set(DATA.map(s=>s.sku))]; +// roundtrip = INDEPENDENTLY MEASURED chained latency (v4). isolated_sum = Σ of isolated +// dispatch+combine percentiles — NOT a measured op (no throughput/SLO use). serial(v3)->isolated_sum. +const OPS = {dispatch:"Dispatch", combine:"Combine", roundtrip:"Round trip (measured)", isolated_sum:"Isolated sum (Σp, not measured)"}; +// NOT algorithmic/bus bandwidth: logical routed payload (recv copies x hidden x dtype) +// over latency; dispatch & combine count their OWN bytes. Excludes scales/idx/meta/padding. +const YK = {lat:"Latency (µs)", tps:"Tokens / s", bw:"Logical routed payload rate (GB/s)"}; +const XK = {t:"Source tokens / rank", gt:"Global source tokens"}; +const PCT = {p50:"p50", p90:"p90", p99:"p99"}; +const SUITE = {all:"All", "backend-default":"Backend-default", "resource-constrained":"Resource-constrained"}; +// Routing distributions present in the data (+ "all"): uniform (baseline) / balanced / +// zipf (skewed) / zipf+eplb (skew rebalanced by EPLB replication). Default to uniform so the +// initial view matches the headline sweep; switch to compare zipf vs zipf+eplb. +const ROUTING = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.routing))].sort().forEach(r=>{o[r]=r;}); return o; })(); +// Prefill panels show only the real large-T prefill range. MoRI ramps its prefill sweep from 1 +// (cold-jump wedge) and records decode-scale points; the intended prefill floor is the DeepEP +// prefill ladder min. So every SKU's prefill panel starts there — the sub-floor MoRI points are +// ramp-warmup (same kernel as decode) and live in the decode panel, not fabricated/duplicated here. +const _dpf = DATA.filter(s=>s.phase==="prefill"&&s.backend==="deepep").flatMap(s=>s.rows.map(r=>r.t)); +const PREFILL_MIN = _dpf.length? 
Math.min(..._dpf) : 128; +// Publication-status filter (goal P1): default hides diagnostic/invalid/failed so the first +// view is publication-valid; "publishable" = official + comparable-experimental + legacy v3. +const PUB = {publishable:"Publishable", official:"Official only", all:"All (incl. diagnostic)"}; +function pubOk(s){ return ST.pub==="all" || (ST.pub==="official" ? s.pub==="official" + : !["diagnostic","invalid","failed"].includes(s.pub)); } +// Default to ONE suite (not all) + publishable results (goal P1). +const ST = {op:"dispatch", phase:"decode", x:"t", y:"lat", xlog:true, ylog:true, pct:"p50", + suite:"backend-default", routing:"uniform", pub:"publishable"}; + +function xval(r,xk){ return xk==="t"? r.t : r.gt; } +function metric(r,op,yk,pct){ + const us=(r[op] && r[op][pct]!=null)? r[op][pct] : (r[op]? r[op].p50 : 0); + if(yk==="lat") return us; + if(yk==="tps") return r.gt/(us*1e-6); + const b = op==="dispatch"? r.dbytes : op==="combine"? r.cbytes : (r.dbytes + r.cbytes); + return us>0 ? 
b/(us*1e3) : 0; // logical routed payload rate (GB/s), per-op bytes +} +function fmt(v){ + if(v>=1e9) return (v/1e9).toFixed(v<1e10?2:0)+"G"; + if(v>=1e6) return (v/1e6).toFixed(v<1e7?2:0)+"M"; + if(v>=1e3) return (v/1e3).toFixed(v<1e4?1:0)+"k"; + if(v>=10) return v.toFixed(0); + if(v>=1) return v.toFixed(v<3?1:0); + return v.toFixed(2); +} +function logTicks(mn,mx){ + const t=[]; let e=Math.floor(Math.log10(mn)); + for(;Math.pow(10,e)<=mx*1.0001;e++) for(const m of [1,2,5]){const v=m*Math.pow(10,e); if(v>=mn*0.999&&v<=mx*1.001)t.push(v);} + return t.length?t:[mn,mx]; +} +function linTicks(mn,mx){ + const span=mx-mn||1, step=Math.pow(10,Math.floor(Math.log10(span))); const t=[]; + let s=step; if(span/step>6)s=step*2; if(span/step<3)s=step/2; + for(let v=Math.ceil(mn/s)*s; v<=mx*1.0001; v+=s) t.push(+v.toFixed(6)); + return t.length?t:[mn,mx]; +} +const mapLog=(v,a,b,p,q)=>p+(Math.log(v)-Math.log(a))/(Math.log(b)-Math.log(a))*(q-p); +const mapLin=(v,a,b,p,q)=>p+(v-a)/(b-a)*(q-p); + +// Build one SVG chart. opts: {op,phase,x,y,ylog,title,legend,w,h} +function chart(o){ + const W=o.w||900, H=o.h||520, m={l:64,r:16,t:34,b:46}; + const pct=o.pct||"p99", suite=o.suite||"all", routing=o.routing||"all"; + const sl = DATA.filter(s=>s.phase===o.phase && (o.ep==null || s.ep===o.ep) + && (suite==="all" || s.suite===suite) + && (routing==="all" || s.routing===routing) && pubOk(s)); + const pts = sl.map(s=>({s, P:s.rows.map(r=>({x:xval(r,o.x), y:metric(r,o.op,o.y,pct), r})) + .filter(p=>p.x>0 && (o.ylog? 
p.y>0 : p.y>=0) + && (o.phase!=="prefill" || p.r.t>=PREFILL_MIN))})); + let xs=[], ys=[]; pts.forEach(g=>g.P.forEach(p=>{xs.push(p.x);ys.push(p.y);})); + if(!xs.length) return 'no data'; + const xmn=Math.min(...xs), xmx=Math.max(...xs); + let ymn=Math.min(...ys), ymx=Math.max(...ys); + if(o.ylog){ ymn=Math.min(...ys.filter(v=>v>0)); } else { ymn=Math.min(0,ymn); } + if(ymx===ymn) ymx=ymn+1; + const X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t; + const xlog = o.xlog!==false; // x defaults to log (geometric sweep) + const xv=v=>xlog?mapLog(v,xmn,xmx,X0,X1):mapLin(v,xmn,xmx,X0,X1); + const yv=v=>o.ylog?mapLog(Math.max(v,ymn),ymn,ymx,Y0,Y1):mapLin(v,ymn,ymx,Y0,Y1); + let s=''; + s+=''+o.title+''; + // y grid + ticks + const yt=o.ylog?logTicks(ymn,ymx):linTicks(ymn,ymx); + yt.forEach(v=>{const y=yv(v); s+=''+ + ''+fmt(v)+'';}); + // x grid + ticks (label the actual sweep points) + const xt=[...new Set(xs)].sort((a,b)=>a-b); + xt.forEach(v=>{const x=xv(v); s+=''+ + ''+fmt(v)+'';}); + // axes + s+=''; + s+=''+XK[o.x]+(xlog?' (log)':'')+''; + s+=''+YK[o.y]+(o.ylog?' (log)':'')+''; + // lines + points + pts.forEach(g=>{ if(!g.P.length) return; + const d=g.P.map((p,i)=>(i?'L':'M')+xv(p.x).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' '); + const dash=g.s.dash?' stroke-dasharray="'+g.s.dash+'"':''; + s+=''; + g.P.forEach(p=>{ const D=p.r.dispatch, C=p.r.combine, R=p.r.roundtrip; + // artifact links (goal P1): the workflow run + source SHA + image digest + workload id + // that produced this point. (Result JSON / manifest / raw-samples live alongside by name.) + const run=g.s.run_id? ('\nrun '+g.s.run_id+(g.s.source_sha?' @'+g.s.source_sha:'')) : ''; + const art='\nworkload='+(g.s.wid||g.s.wsig||'?')+(g.s.image_digest?' · image '+g.s.image_digest:'') + +(g.s.repo?' · '+g.s.repo:''); + s+=''+ + ''+g.s.label+' ['+pct+'] ('+g.s.pub+')'+ + '\nT/rank='+p.r.t+' · global='+p.r.gt+ + '\n'+YK[o.y]+' = '+fmt(p.y)+(o.y==='lat'?' µs':o.y==='bw'?' 
GB/s':'')+ + '\ndispatch µs p50/p90/p99 = '+D.p50.toFixed(1)+'/'+D.p90.toFixed(1)+'/'+D.p99.toFixed(1)+ + '\ncombine µs p50/p90/p99 = '+C.p50.toFixed(1)+'/'+C.p90.toFixed(1)+'/'+C.p99.toFixed(1)+ + '\nroundtrip µs p50/p90/p99 = '+R.p50.toFixed(1)+'/'+R.p90.toFixed(1)+'/'+R.p99.toFixed(1)+' (measured)'+ + '\nfan-out='+(p.r.fanout!=null?p.r.fanout.toFixed(2):'?')+' · recv(max)='+p.r.recv + +(p.r.straggler!=null?' · straggler=r'+p.r.straggler:'')+(p.r.correct?'':' ✗')+ + '\ncontract='+g.s.contract+' · suite='+g.s.suite+run+art+ + ''; }); + }); + s+=''; return s; +} +// Comparison guard (goal P1): flag when overlaid lines are NOT a direct comparison — +// differing topology at one EP, or differing realized workload signature within one routing. +function guardNote(vis){ + if(!vis.length) return ''; + const w=[]; + const topos=[...new Set(vis.map(s=>s.topo).filter(Boolean))]; + if(topos.length>1) w.push('mixed topology ('+topos.join(', ')+')'); + const byRt={}; vis.forEach(s=>{ (byRt[s.routing]=byRt[s.routing]||new Set()).add(s.wsig||'?'); }); + const split=Object.entries(byRt).filter(([k,v])=>v.size>1).map(([k])=>k); + if(split.length) w.push('different workload trace within routing ['+split.join(',')+'] — NOT identical workloads'); + const eps=[...new Set(vis.map(s=>s.ep))]; + if(eps.length>1) w.push('mixed EP degree '+eps.join('/')+' — compare only on the global-tokens x-axis'); + return w.length? '
⚠ not a direct comparison: '+w.join('; ')+'
' : ''; +} +function legend(phase, ep, suite, routing){ + return '
'+DATA.filter(s=>s.phase===phase && (ep==null||s.ep===ep) + && (!suite||suite==="all"||s.suite===suite) + && (!routing||routing==="all"||s.routing===routing) && pubOk(s)).map(s=>{ + const sw = s.dash ? 'background:repeating-linear-gradient(90deg,'+s.color+' 0 5px,transparent 5px 9px)' + : 'background:'+s.color; // dashed swatch = fp8 (matches the line) + return ''+s.label+''; + }).join('')+'
'; +} +function seg(name,opts,cur){ + return '
'+Object.entries(opts).map(([k,v])=> + '').join('')+'
'; +} +function renderControls(){ + document.getElementById('controls').innerHTML = + '
Operation'+seg('op',OPS,ST.op)+'
'+ + '
Phase'+seg('phase',{decode:"Decode",prefill:"Prefill"},ST.phase)+'
'+ + '
Percentile'+seg('pct',PCT,ST.pct)+'
'+ + '
Suite'+seg('suite',SUITE,ST.suite)+'
'+ + '
Routing'+seg('routing',ROUTING,ST.routing)+'
'+ + '
Publication'+seg('pub',PUB,ST.pub)+'
'+ + '
X-axis'+seg('x',XK,ST.x)+'
'+ + '
X scale'+seg('xlog',{true:"Log",false:"Linear"},String(ST.xlog))+'
'+ + '
Y-axis'+seg('y',YK,ST.y)+'
'+ + '
Y scale'+seg('ylog',{true:"Log",false:"Linear"},String(ST.ylog))+'
'; + document.querySelectorAll('#controls button').forEach(b=>b.onclick=()=>{ + const g=b.dataset.grp, v=b.dataset.val; ST[g]= (g==='ylog'||g==='xlog')? v==='true' : v; + renderControls(); renderMain(); renderGrid(); }); // grid also reflects pct/suite/scale toggles +} +function renderMain(){ + document.getElementById('chart').innerHTML = chart({op:ST.op,phase:ST.phase,x:ST.x,y:ST.y,xlog:ST.xlog,ylog:ST.ylog, + pct:ST.pct, suite:ST.suite, routing:ST.routing, + title:OPS[ST.op]+' — '+ST.phase+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+' ('+YK[ST.y].toLowerCase()+' vs '+XK[ST.x].toLowerCase()+')'}); + const vis=DATA.filter(s=>s.phase===ST.phase && (ST.suite==="all"||s.suite===ST.suite) + && (ST.routing==="all"||s.routing===ST.routing) && pubOk(s)); + document.getElementById('mlegend').innerHTML = guardNote(vis)+legend(ST.phase, null, ST.suite, ST.routing); +} +function renderGrid(){ + // SEPARATE panels per (phase, EP degree); within a panel, the SUITE selector keeps + // backend-default and resource-constrained lines from being read as one fair contest. + const phases=[...new Set(DATA.map(s=>s.phase))].sort(); + const eps=[...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b); + let h=''; + phases.forEach(ph=>{ eps.forEach(ep=>{ + const panelVis=DATA.filter(s=>s.phase===ph && s.ep===ep && (ST.suite==="all"||s.suite===ST.suite) + && (ST.routing==="all"||s.routing===ST.routing) && pubOk(s)); + if(!panelVis.length) return; + const scale=(ST.xlog?'log':'lin')+'–'+(ST.ylog?'log':'lin'); + h+='

'+ph[0].toUpperCase()+ph.slice(1)+' · EP'+ep+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+' — latency vs source tokens/rank (µs, '+scale+')

'+ + guardNote(panelVis)+legend(ph,ep,ST.suite,ST.routing)+'
'; + ['dispatch','combine','roundtrip'].forEach(op=>{ h+='
'+OPS[op]+'
'+ + chart({op,phase:ph,ep,x:'t',y:'lat',xlog:ST.xlog,ylog:ST.ylog,pct:ST.pct,suite:ST.suite,routing:ST.routing,title:'',w:340,h:260})+'
'; }); + h+='
'; }); }); + document.getElementById('grid').innerHTML=h; +} +// Coverage table (goal P2): publication status per measured config (validated=official, +// experimental=comparable/legacy, failed=invalid/failed). Supported/unsupported come from +// generate_matrix.py (capability), which records omissions with reasons. +function renderCoverage(){ + const cls={official:'#2ca02c','comparable-experimental':'#d6a72b',legacy:'#7f7f7f', + diagnostic:'#9467bd',invalid:'#d62728',failed:'#a30000'}; + const by={}; DATA.forEach(s=>{ (by[s.sku]=by[s.sku]||[]).push(s); }); + let h=''; + Object.keys(by).sort().forEach(sku=>{ + by[sku].sort((a,b)=>(a.ep-b.ep)||a.label.localeCompare(b.label)).forEach(s=>{ + const ok=s.rows.filter(r=>r.correct).length; + const cfg=(s.dtype||'?')+'/'+s.mode+'/'+(s.contract||'?').replace('-v1',''); + h+='' + +'' + +''; + }); + }); + document.getElementById('coverage').innerHTML=h+'
SKUEPconfigphaseroutingstatuscorrect pts
'+sku+''+s.ep+''+cfg+''+s.phase+''+s.routing+''+s.pub+''+ok+'/'+s.rows.length+'
'; +} +(function(){ + const sh=(DATA[0]||{shape:{}}).shape||{}; + const provs=[...new Set(DATA.map(s=>s.backend+' '+(s.prov.deepep_version||s.prov.mori_commit||'?')))]; + const fo=[...new Set(DATA.map(s=>(s.rows[0]&&s.rows[0].fanout!=null)?s.rows[0].fanout.toFixed(1):'?'))].join('/'); + const contracts=[...new Set(DATA.map(s=>s.contract))].join(' / '); + const dtypes=[...new Set(DATA.map(s=>s.dtype))].join('+'); + const suites=[...new Set(DATA.map(s=>s.suite))].join(' + '); + const samp=[...new Set(DATA.map(s=>s.samples).filter(Boolean))].join('/'); + const allconsistent=DATA.every(s=>s.routing_consistent!==false); + const routings=[...new Set(DATA.map(s=>s.routing))].sort().join(' / '); + const ez=DATA.find(s=>s.eplb_after!=null); + const eplbNote=ez? ' EPLB (routing=zipf+eplb) replicates hot experts to rebalance per-rank load — imbalance '+ez.eplb_before.toFixed(1)+'x→'+ez.eplb_after.toFixed(1)+'x (vs raw zipf).' : ''; + document.getElementById('prov').textContent= + 'Deterministic shared routing trace (seed-fixed; routings: '+routings+' — Routing selector; mean fan-out ≈'+fo+ + ' dest-ranks/token; cross-rank identity '+(allconsistent?'PROVEN (SHA-256 of topk_idx+weights agrees on every rank)':'NOT proven on some series')+ + '). Fixed: hidden='+(sh.hidden||'?')+', top-k='+(sh.topk||'?')+', experts='+(sh.experts||'?')+ + '. dtype/mode/resource/contract vary PER LINE — read the label (dtypes shown: '+dtypes+'). '+ + 'Contract(s): '+contracts+' (layout-and-dispatch times routing-layout INSIDE dispatch; cached-layout [cl] hoists it out). '+ + 'Latency = percentile (selector; p99 default) over POOLED per-iteration cross-rank-MAX samples'+(samp?(' (~'+samp+'/point)'):'')+ + '. ROUND TRIP is INDEPENDENTLY MEASURED (dispatch→sync→no-op expert→combine, raw per-iter samples); ISOLATED_SUM is Σ of isolated dispatch+combine percentiles, NOT a measured op (no throughput/SLO use). 
Publication filter defaults to publishable (diagnostic/invalid hidden); status is machine-derived from validity. The bandwidth axis is a LOGICAL routed-payload rate '+ + '(recv copies x hidden x dtype / latency; per-op bytes; excludes scales/idx/meta/padding) — NOT algBW/busBW/wire utilization. '+ + 'Suites ('+suites+') are kept distinct (Suite selector): backend-default = best stack; resource-constrained = ~fixed SM/CU fraction — '+ + 'do not read across suites as one contest. Correctness = round-trip reconstruction smoke check (NOT a full per-token routing proof).'+eplbNote+' '+ + 'Backends: '+provs.join(', ')+'. Hover a point for p50/p90/p99, contract, suite, and its workflow run.'; + renderControls(); renderMain(); renderGrid(); renderCoverage(); +})(); +""" + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX EP HTML plotter") + ap.add_argument("--results-dir", default="results") + ap.add_argument("--out", default="results/plots/collectivex_ep.html") + ap.add_argument("--legacy", choices=["all", "exclude", "only"], default="all", + help="exclude -> v4-only main plot; only -> the legacy v3 archive") + args = ap.parse_args() + + series = load_series(args.results_dir, args.legacy) + if not series: + print(f"no family=moe results with rows under {args.results_dir} (legacy={args.legacy})") + return 1 + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + html = HEAD + '
' \ + + '
' \ + + '
' \ + + '

Coverage

' \ + + '

Self-contained (inline SVG, no external scripts). Generated from ' \ + + f'{len(series)} EP sweeps. Latency (p50/p90/p99 selector) is the primary metric; the ' \ + + 'bandwidth axis is a LOGICAL routed-payload rate (per-op bytes ÷ latency), not bus/alg ' \ + + 'bandwidth. dtype/mode/resource/contract vary per line — see labels + provenance.

' \ + + "\n" + TAIL + with open(args.out, "w") as fh: + fh.write(html) + phases = sorted({s["phase"] for s in series}) + print(f"wrote {args.out} ({len(series)} series across SKUs={sorted({s['sku'] for s in series})}, phases={phases})") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt new file mode 100644 index 000000000..574afb1f0 --- /dev/null +++ b/experimental/CollectiveX/requirements.txt @@ -0,0 +1,9 @@ +# CollectiveX spike dependencies. +# +# run_nccl.py + env_capture.py : Python standard library only (run anywhere). +# run_deepep.py : torch + deep_ep — provided by the benchmark +# container; DeepEP is built at job setup +# (rebuild-deepep), NOT pinned here. +# plot.py : the only thing worth a local venv: +matplotlib +numpy diff --git a/experimental/CollectiveX/results/.gitkeep b/experimental/CollectiveX/results/.gitkeep new file mode 100644 index 000000000..8940934a2 --- /dev/null +++ b/experimental/CollectiveX/results/.gitkeep @@ -0,0 +1,3 @@ +# CollectiveX result bundles land here as flat *.json (one per runner×op), +# plus plots/ and raw_*.txt captures (gitignored). Keep this file so the dir +# exists before the first run. diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py new file mode 100644 index 000000000..c22654c59 --- /dev/null +++ b/experimental/CollectiveX/run_nccl.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — NCCL primitive benchmark wrapper. + +Runs stock `nccl-tests` binaries (built in-container at job time — the login +nodes have no nvcc), parses the text table (NOT JSON — we do not assume the +build emits JSON), and writes a flat, provenance-tagged JSON result the plot +script and the eventual schema-freeze can consume. + +Standard library only, so it runs in any minimal container. 
+ +Run (inside the container, after building nccl-tests): + python run_nccl.py --op all_reduce \\ + --nccl-tests-dir /tmp/nccl-tests/build \\ + --world-size 8 --min-bytes 8 --max-bytes 8G \\ + --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\ + --env-json results/env.json --out results/b200_all_reduce.json + +Verify the parser offline (no GPU needed): + python run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \\ + --world-size 8 --runner b200-dgxc --topology-class b200-nvlink-island \\ + --out /tmp/parsed.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import subprocess +import sys + +SCHEMA_VERSION = 1 +MEASUREMENT_CONTRACT = "nccl-tests-v1" + +# op -> nccl-tests binary name +OP_BINARY = { + "all_reduce": "all_reduce_perf", + "all_gather": "all_gather_perf", + "reduce_scatter": "reduce_scatter_perf", + "alltoall": "alltoall_perf", + "all_to_all": "alltoall_perf", + "broadcast": "broadcast_perf", + "sendrecv": "sendrecv_perf", +} + + +def _f(tok: str): + """Parse a numeric cell; nccl-tests prints 'N/A' for #wrong when -c 0.""" + if tok in ("N/A", "n/a", "-"): + return None + try: + return float(tok) + except ValueError: + return None + + +def parse_nccl_table(text: str) -> tuple[list[dict], dict]: + """Parse nccl-tests stdout into per-size rows + a run summary. + + Robust across ops: the column count varies (all_reduce/reduce_scatter carry + redop+root; all_gather/alltoall do not), but every op prints the same 8 + trailing numeric columns — out-of-place (time, algbw, busbw, #wrong) then + in-place (time, algbw, busbw, #wrong). `size` is always the first token and + `type` the third. So we key off the first token and the last 8 tokens. 
+ """ + rows: list[dict] = [] + summary: dict = {"avg_busbw_gbps": None, "out_of_bounds": None, "check_passed": None} + for line in text.splitlines(): + s = line.strip() + if not s: + continue + if s.startswith("#"): + if "Avg bus bandwidth" in s: + summary["avg_busbw_gbps"] = _f(s.split(":")[-1].strip()) + elif "Out of bounds values" in s: + tail = s.split(":")[-1].strip() + summary["out_of_bounds"] = tail + summary["check_passed"] = tail.endswith("OK") + continue + toks = s.split() + # Data line: first token is the byte size (all digits), and we need the + # 8 trailing metric columns plus size+count+type up front (>=11 tokens). + if len(toks) < 11 or not toks[0].isdigit(): + continue + tail = toks[-8:] + size = int(toks[0]) + dtype = toks[2] if len(toks) >= 3 else None + oop_wrong = _f(tail[3]) + ip_wrong = _f(tail[7]) + rows.append( + { + "size_bytes": size, + "dtype": dtype, + "out_of_place": { + "time_us": _f(tail[0]), + "algbw_gbps": _f(tail[1]), + "busbw_gbps": _f(tail[2]), + "wrong": oop_wrong, + }, + "in_place": { + "time_us": _f(tail[4]), + "algbw_gbps": _f(tail[5]), + "busbw_gbps": _f(tail[6]), + "wrong": ip_wrong, + }, + # convenience: best (max) busbw across the two placements + "busbw_gbps": max( + [b for b in (_f(tail[2]), _f(tail[6])) if b is not None], + default=None, + ), + "correct": ( + None + if oop_wrong is None and ip_wrong is None + else ((oop_wrong or 0) == 0 and (ip_wrong or 0) == 0) + ), + } + ) + return rows, summary + + +def comparison_key(meta: dict) -> str: + """Machine key gating which rows may share a curve (see plan §Comparability). 
+ Topology-class is intentionally part of the key, so B200(IB) and + GB200(MNNVL) are labelled distinct rather than silently overlaid.""" + parts = [ + meta["op"], + meta["dtype"], + str(meta["world_size"]), + str(meta["nodes"]), + meta["topology_class"], + meta["comparison_class"], + meta["measurement_contract"], + ] + digest = hashlib.sha256("|".join(parts).encode()).hexdigest()[:16] + return digest + + +def build_command(args, binary_path: str) -> list[str]: + cmd: list[str] = [] + if args.launch_prefix: + cmd += args.launch_prefix.split() + cmd += [ + binary_path, + "-b", str(args.min_bytes), + "-e", str(args.max_bytes), + "-f", str(args.factor), + "-g", str(args.gpus_per_proc), + "-c", str(args.check), + "-w", str(args.warmup), + "-n", str(args.iters), + ] + if args.extra_args: + cmd += args.extra_args.split() + return cmd + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX NCCL primitive runner") + ap.add_argument("--op", required=True, choices=sorted(OP_BINARY)) + ap.add_argument("--nccl-tests-dir", help="dir containing _perf binaries (build/)") + ap.add_argument("--parse-only", help="parse this captured stdout file instead of running") + # nccl-tests knobs + ap.add_argument("--min-bytes", default="8") + ap.add_argument("--max-bytes", default="8G") + ap.add_argument("--factor", type=int, default=2, help="size step factor") + ap.add_argument("--gpus-per-proc", type=int, default=8, + help="-g: GPUs per process (single-node multi-GPU). Use 1 under MPI.") + ap.add_argument("--check", type=int, default=1, help="-c: 1 enables correctness check") + ap.add_argument("--warmup", type=int, default=5) + ap.add_argument("--iters", type=int, default=20) + ap.add_argument("--extra-args", default="", help="extra args appended to the binary") + ap.add_argument("--launch-prefix", default="", + help="e.g. 
'mpirun -np 16 --hostfile hf' for multi-node; empty for single-node -g mode") + # provenance + ap.add_argument("--runner", required=True, help="runner label, e.g. b200-dgxc") + ap.add_argument("--world-size", type=int, required=True, help="total ranks/GPUs in the run") + ap.add_argument("--nodes", type=int, default=1) + ap.add_argument("--topology-class", required=True, + help="e.g. b200-nvlink-island, b200-nvlink-island+cx7-ib, gb200-nvl72-mnnvl") + ap.add_argument("--transport", default="", help="observed transport label: nvlink | ib | mnnvl") + ap.add_argument("--comparison-class", default="standardized", + choices=["standardized", "backend-optimized", "framework-integrated"]) + ap.add_argument("--env-json", help="path to env_capture.py output to embed") + ap.add_argument("--timestamp", help="ISO timestamp (default now)") + ap.add_argument("--out", required=True) + args = ap.parse_args() + + binary = OP_BINARY[args.op] + command = None + if args.parse_only: + with open(args.parse_only) as fh: + stdout = fh.read() + ran_ok = True + else: + if not args.nccl_tests_dir: + ap.error("--nccl-tests-dir is required unless --parse-only is given") + binary_path = os.path.join(args.nccl_tests_dir, binary) + if not os.path.exists(binary_path): + print(f"ERROR: binary not found: {binary_path}", file=sys.stderr) + return 2 + command = build_command(args, binary_path) + print("running:", " ".join(command), file=sys.stderr) + proc = subprocess.run(command, capture_output=True, text=True, check=False) + stdout = proc.stdout + ran_ok = proc.returncode == 0 + if not ran_ok: + print(stdout, file=sys.stderr) + print(proc.stderr, file=sys.stderr) + print(f"ERROR: {binary} exited {proc.returncode}", file=sys.stderr) + + rows, summary = parse_nccl_table(stdout) + dtype = rows[0]["dtype"] if rows else None + + meta = { + "op": args.op, + "dtype": dtype, + "world_size": args.world_size, + "nodes": args.nodes, + "topology_class": args.topology_class, + "comparison_class": 
args.comparison_class, + "measurement_contract": MEASUREMENT_CONTRACT, + } + + env = None + if args.env_json and os.path.exists(args.env_json): + with open(args.env_json) as fh: + env = json.load(fh) + + # All-zero busbw means the benchmark didn't actually communicate — e.g. an + # MPI=0 binary launched under srun --mpi=pmix runs as N standalone world=1 + # procs (busbw formula -> 0). Don't let that pass the gate as "valid". + peak_busbw = max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0) + + doc = { + "schema_version": SCHEMA_VERSION, + "family": "nccl", + "generated_by": "run_nccl.py", + "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(), + "runner": args.runner, + "binary": binary, + "command": " ".join(command) if command else f"", + "transport": args.transport, + "status": ("valid" if (rows and ran_ok and peak_busbw > 0.0 + and (summary.get("check_passed") is True + or (args.check == 0 and summary.get("check_passed") is None))) else "invalid"), + "comparison_key": comparison_key(meta), + **meta, + "summary": summary, + "num_rows": len(rows), + "rows": rows, + "environment": env, + } + + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + json.dump(doc, fh, indent=2) + fh.write("\n") + + print( + f"{args.op}: parsed {len(rows)} sizes -> {args.out} " + f"(status={doc['status']}, avg_busbw={summary.get('avg_busbw_gbps')} GB/s, " + f"key={doc['comparison_key']})" + ) + return 0 if doc["status"] == "valid" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/schemas/ep-result-v4.schema.json b/experimental/CollectiveX/schemas/ep-result-v4.schema.json new file mode 100644 index 000000000..11828a8bb --- /dev/null +++ b/experimental/CollectiveX/schemas/ep-result-v4.schema.json @@ -0,0 +1,122 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": 
"https://semianalysis/collectivex/schemas/ep-result-v4.schema.json", + "title": "CollectiveX EP dispatch/combine result (v4)", + "description": "One (backend, phase, dtype, mode, contract, routing) sweep. v4 adds multi-dimensional validity + machine-derived publication_status, measured roundtrip, dual byte contracts, per-rank diagnostics, raw-sample histograms, and workload identity. v3 docs load via compatibility (publication_status absent => treated as legacy/experimental).", + "type": "object", + "required": ["schema_version", "family", "runner", "backend", "mode", "phase", + "ep_size", "measurement_contract", "shape", "rows", + "validity", "publication_status", "workload", "reproduction", + "backend_provenance", "comparison_key"], + "properties": { + "schema_version": {"type": "integer", "minimum": 3}, + "family": {"const": "moe"}, + "runner": {"type": "string"}, + "backend": {"type": "string", "enum": ["deepep", "mori", "aiter"]}, + "mode": {"type": "string", "enum": ["normal", "ll"]}, + "phase": {"type": "string", "enum": ["decode", "prefill"]}, + "ep_size": {"type": "integer", "minimum": 1}, + "world_size": {"type": "integer", "minimum": 1}, + "nodes": {"type": "integer", "minimum": 1}, + "topology_class": {"type": "string"}, + "transport": {"type": "string"}, + "resource_mode": {"type": "string", "enum": ["normalized", "tuned", "default"]}, + "measurement_contract": {"type": "string", + "enum": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"]}, + "publication_status": {"type": "string", + "enum": ["official", "comparable-experimental", "diagnostic", "invalid", "failed"]}, + "validity": { + "type": "object", + "required": ["execution_status", "semantic_correctness", "workload_identity", + "measurement_conformance", "resource_conformance", "provenance_complete"], + "properties": { + "execution_status": {"type": "string", "enum": ["complete", "failed"]}, + "semantic_correctness": {"type": "string", "enum": ["pass", "fail"]}, + 
"workload_identity": {"type": "string"}, + "workload_source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]}, + "measurement_conformance": {"type": "string", "enum": ["conformant", "nonconformant"]}, + "resource_conformance": {"type": "string"}, + "provenance_complete": {"type": "boolean"} + } + }, + "workload": { + "type": "object", + "required": ["source", "trace_signature", "cross_rank_consistent"], + "properties": { + "source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]}, + "workload_id": {"type": ["string", "null"]}, + "manifest_checksums": {"type": ["object", "null"]}, + "trace_signature": {"type": "string"}, + "distinct_per_T_hashes": {"type": "array", "items": {"type": "string"}}, + "cross_rank_consistent": {"type": "boolean"} + } + }, + "shape": { + "type": "object", + "required": ["hidden", "topk", "experts", "experts_per_rank", "dispatch_dtype", "routing"], + "properties": { + "hidden": {"type": "integer"}, "topk": {"type": "integer"}, + "experts": {"type": "integer"}, "experts_per_rank": {"type": "integer"}, + "dispatch_dtype": {"type": "string", "enum": ["bf16", "fp8"]}, + "routing": {"type": "string"}, + "eplb": {"type": "boolean"}, "num_logical_experts": {"type": "integer"} + } + }, + "reproduction": { + "type": "object", + "required": ["command", "seed", "warmup", "iters", "trials", "measurement_contract"], + "properties": { + "command": {"type": "string"}, + "image": {"type": ["string", "null"]}, + "image_digest": {"type": ["string", "null"]}, + "image_arch": {"type": ["string", "null"]}, + "squash_sha256": {"type": ["string", "null"]}, + "git_run": {"type": ["object", "null"]}, + "fp8_quant_in_timing": {"type": ["boolean", "null"]} + } + }, + "backend_provenance": {"type": "object"}, + "rows": { + "type": "array", "minItems": 1, + "items": { + "type": "object", + "required": ["tokens_per_rank", "global_tokens", "dispatch", "combine", "roundtrip", + "isolated_sum", "samples_pooled", 
"byte_contracts", "correct"], + "properties": { + "tokens_per_rank": {"type": "integer", "minimum": 1}, + "global_tokens": {"type": "integer", "minimum": 1}, + "dispatch": {"$ref": "#/definitions/percentiles"}, + "combine": {"$ref": "#/definitions/percentiles"}, + "roundtrip": {"$ref": "#/definitions/percentiles"}, + "isolated_sum": {"type": "object"}, + "samples_pooled": {"type": "integer", "minimum": 1}, + "percentile_interpolation": {"type": "string"}, + "per_rank_dispatch_us": {"type": "object"}, + "raw_samples": {"type": "object"}, + "byte_contracts": { + "type": "object", + "required": ["token_rank_payload_copies", "token_expert_payload_copies", + "dispatch_bytes", "combine_bytes"], + "properties": { + "token_rank_payload_copies": {"type": "integer"}, + "token_expert_payload_copies": {"type": "integer"}, + "dispatch_bytes": {"type": "integer"}, "combine_bytes": {"type": "integer"} + } + }, + "roundtrip_tokens_per_second": {"type": ["number", "null"]}, + "correct": {"type": "boolean"} + } + } + } + }, + "definitions": { + "percentiles": { + "type": "object", + "required": ["p50", "p90", "p95", "p99"], + "properties": { + "p50": {"type": "number"}, "p90": {"type": "number"}, + "p95": {"type": "number"}, "p99": {"type": "number"} + } + } + } +} diff --git a/experimental/CollectiveX/schemas/workload-v1.schema.json b/experimental/CollectiveX/schemas/workload-v1.schema.json new file mode 100644 index 000000000..285f56ad2 --- /dev/null +++ b/experimental/CollectiveX/schemas/workload-v1.schema.json @@ -0,0 +1,46 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://semianalysis/collectivex/schemas/workload-v1.schema.json", + "title": "CollectiveX canonical MoE routing workload manifest", + "description": "Manifest for a serialized routing trace (tests/workload.py). 
The .npz holds topk_idx/topk_weights; this manifest carries the identity, dimensions, routing profile, and SHA-256 checksums that gate cross-hardware comparison.", + "type": "object", + "additionalProperties": false, + "required": ["schema_version", "workload_id", "generator_version", "gate_weight_format", + "dims", "routing_profile", "seed", "checksums"], + "properties": { + "schema_version": {"const": 1}, + "workload_id": {"type": "string", "pattern": "^[0-9a-f]{16}$", + "description": "Immutable id = sha256(generator|routing|hidden|topk|experts|gt|seed)[:16]."}, + "generator_version": {"type": "string", + "description": "Routing generator identity; bump when numerics change so stale files can't masquerade."}, + "gate_weight_format": {"type": "string"}, + "dims": { + "type": "object", + "additionalProperties": false, + "required": ["hidden", "topk", "experts", "global_tokens", "experts_per_rank"], + "properties": { + "hidden": {"type": "integer", "minimum": 1}, + "topk": {"type": "integer", "minimum": 1}, + "experts": {"type": "integer", "minimum": 1}, + "global_tokens": {"type": "integer", "minimum": 1}, + "experts_per_rank": {"type": "integer", "minimum": 1} + } + }, + "routing_profile": {"type": "string", + "enum": ["uniform", "balanced", "balanced-rank-local", "zipf", + "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single"]}, + "seed": {"type": "integer"}, + "checksums": { + "type": "object", + "additionalProperties": false, + "required": ["topk_idx", "topk_weights", "trace"], + "properties": { + "topk_idx": {"type": "string", "pattern": "^[0-9a-f]{64}$"}, + "topk_weights": {"type": "string", "pattern": "^[0-9a-f]{64}$"}, + "trace": {"type": "string", "pattern": "^[0-9a-f]{64}$"} + } + }, + "routing_stats": {"type": "object", + "description": "Realized fan-out / load / locality stats (advisory; not identity-defining)."} + } +} diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py new file mode 100644 index 
000000000..2d71a87e1 --- /dev/null +++ b/experimental/CollectiveX/summarize.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +"""CollectiveX — summarize a run's results. + +Two output modes over the same data: + (default) a plain-text table for the Slurm/container log; ALSO the result + gate — exits non-zero if no valid results were produced, so a + failed/skipped benchmark doesn't get reported as a green job. + --markdown GitHub-flavored markdown for a GitHub Actions job summary + (https://github.blog/.../supercharging-github-actions-with-job-summaries/); + reporting only, always exits 0. A workflow step appends this to + $GITHUB_STEP_SUMMARY so the run page shows a rendered table. + + python summarize.py --results-dir results --runner gb200-nv_1 --ts + python summarize.py --results-dir results --markdown >> "$GITHUB_STEP_SUMMARY" +""" +from __future__ import annotations + +import argparse +import glob +import json +import os + + +def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[dict]: + docs = [] + for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))): + base = os.path.basename(path) + if base.startswith("env_"): + continue + if runner and not base.startswith(f"{runner}_"): + continue + if ts and ts not in base: + continue + try: + with open(path) as fh: + d = json.load(fh) + except (json.JSONDecodeError, OSError): + continue + if d.get("family") in ("nccl", "moe"): + docs.append(d) + return docs + + +def _peak_busbw(rows): + return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0) + + +_OP_ORDER = ["all_reduce", "reduce_scatter", "all_gather", "alltoall"] + + +def _row_lat(r): + vals = [(r.get(k) or {}).get("time_us") for k in ("out_of_place", "in_place")] + vals = [v for v in vals if v is not None] + return min(vals) if vals else None + + +def _lat_floor(rows): + # Small-message latency floor: time at the smallest REAL (size>0) message. + # (Sub-granularity 0-byte rows are a no-op ~1 us and not a real latency.) 
+ real = [r for r in rows if (r.get("size_bytes") or 0) > 0] + if not real: + return float("nan") + v = _row_lat(min(real, key=lambda r: r["size_bytes"])) + return v if v is not None else float("nan") + + +def _at_size(rows, size, fn): + for r in rows: + if r.get("size_bytes") == size: + return fn(r) + return None + + +def _fmt_bytes(b): + for u, s in ((2**30, "GiB"), (2**20, "MiB"), (2**10, "KiB")): + if b >= u and b % u == 0: + return f"{b // u} {s}" + return f"{b} B" + + +def _ops_sorted(nccl): + present = {d.get("op") for d in nccl} + ordered = [o for o in _OP_ORDER if o in present] + return ordered + sorted(present - set(ordered)) + + +def _ladder(nccl): + sizes = sorted({r["size_bytes"] for d in nccl for r in d.get("rows", []) + if (r.get("size_bytes") or 0) > 0}) + if not sizes: + return [] + cand = [16384, 262144, 4194304, 67108864, 268435456, 1073741824, 4294967296] + lad = [s for s in cand if s in set(sizes) and s < sizes[-1]] + lad.append(sizes[-1]) + return lad + + +def _sweep_table(nccl, title, rowfn, fmt): + lad = _ladder(nccl) + if not lad: + return [] + ops = _ops_sorted(nccl) + rows_by_op = {d.get("op"): d.get("rows", []) for d in nccl} + out = [f"\n**{title}**\n", + "| bytes/rank | " + " | ".join(f"`{o}`" for o in ops) + " |", + "|---" + "|--:" * len(ops) + "|"] + for s in lad: + cells = [] + for o in ops: + v = _at_size(rows_by_op.get(o, []), s, rowfn) + cells.append(format(v, fmt) if isinstance(v, (int, float)) else "—") + out.append(f"| {_fmt_bytes(s)} | " + " | ".join(cells) + " |") + return out + + +def _fnum(x, fmt): + return format(x, fmt) if isinstance(x, (int, float)) else "—" + + +def _moe_sorted(moe): + return sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""), x.get("ep_size", 0))) + + +def _moe_sweep_table(d): + """Markdown sweep table for one EP doc — the rows already ARE the ladder, so + emit one row per source-tokens-per-rank point. 
Skips old single-point docs + (no rows[]).""" + rows = d.get("rows") + if not rows: + return [] + sh = d.get("shape", {}) + head = (f"\n**`{d.get('backend')}` · {d.get('phase')} · ep{d.get('ep_size')} · " + f"H{sh.get('hidden')} top{sh.get('topk')} E{sh.get('experts')} " + f"{sh.get('dispatch_dtype')} {sh.get('routing')}** — latency vs source tokens/rank\n") + out = [head, + "| tokens/rank | fan-out | dispatch µs | combine µs | serial µs (D+C) | tokens/s | recv max | correct |", + "|--:|--:|--:|--:|--:|--:|--:|:--:|"] + for r in rows: + out.append(f"| {r.get('tokens_per_rank')} | {_fnum(r.get('fanout_mean'), '.2f')} | " + f"{_fnum(r.get('dispatch_us_p50'), '.2f')} | {_fnum(r.get('combine_us_p50'), '.2f')} | " + f"{_fnum(r.get('serial_us_p50', r.get('roundtrip_us_p50')), '.2f')} | " + f"{_fnum(r.get('tokens_per_second'), '.3e')} | " + f"{r.get('recv_tokens_max', r.get('recv_tokens', '—'))} | {'✅' if r.get('correct') else '❌'} |") + return out + + +def render_plain(nccl, moe, n_valid, total) -> str: + out = [] + hdr = "CollectiveX results" + if nccl or moe: + d0 = (nccl + moe)[0] + hdr += f" — runner={d0.get('runner')} topology={d0.get('topology_class')} transport={d0.get('transport')}" + out += ["=" * len(hdr), hdr, "=" * len(hdr)] + if nccl: + out.append(f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):") + out.append(f" {'op':<16}{'status':<9}{'peak busbw':>12}{'lat floor':>10}{'avg busbw':>11}") + for d in sorted(nccl, key=lambda x: x["op"]): + rows = d.get("rows", []) + avg = (d.get("summary") or {}).get("avg_busbw_gbps") + out.append(f" {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}" + f"{_lat_floor(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") + if moe: + out.append("\nMoE EP dispatch/combine (DeepEP / MoRI) — headline (* = headline tokens/rank):") + out.append(f" {'backend':<9}{'phase':<8}{'ep':>3} {'status':<9}{'T*':>5}{'disp_p50':>10}{'comb_p50':>10}{'serial':>9} correct") + for d in 
sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""))): + m, c = d.get("metrics", {}), d.get("correctness", {}) + ser = m.get("serial_us_p50", m.get("roundtrip_us_p50")) + out.append(f" {d.get('backend',''):<9}{d.get('phase',''):<8}{str(d.get('ep_size','')):>3} {d.get('status',''):<9}" + f"{str(m.get('headline_tokens_per_rank','')):>5}" + f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}{(m.get('combine_us_p50') or float('nan')):>10.1f}" + f"{(ser or float('nan')):>9.1f} {c.get('passed')}") + return "\n".join(out) + + +def _emoji(status) -> str: + return "✅ valid" if status == "valid" else f"❌ {status}" + + +def render_markdown(nccl, moe, n_valid, total) -> str: + out = [] + if nccl or moe: + d0 = (nccl + moe)[0] + out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}") + if nccl: + out.append(f"\n### NCCL/RCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n") + out.append("| op | status | peak busbw (GB/s) | lat floor (µs) |") + out.append("|---|---|--:|--:|") + for d in sorted(nccl, key=lambda x: _OP_ORDER.index(x["op"]) if x["op"] in _OP_ORDER else 99): + rows = d.get("rows", []) + out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | {_lat_floor(rows):.2f} |") + out += _sweep_table(nccl, "Bus bandwidth vs bytes/rank (GB/s)", lambda r: r.get("busbw_gbps"), ".1f") + out += _sweep_table(nccl, "Latency vs bytes/rank (µs)", _row_lat, ".2f") + out.append("\n> bytes/rank = nccl/rccl-tests message size (= per-rank for all-reduce / " + "reduce-scatter / all-to-all; all-gather input/rank = size ÷ #GPUs). 
Small " + "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.") + if moe: + out.append("\n### MoE EP dispatch / combine (DeepEP / MoRI)\n") + out.append("Headline = the reference point (tokens/rank shown as `T*`); the per-line " + "sweep tables below carry the full source-tokens-per-rank curve.\n") + out.append("| backend | phase | mode | dtype | resource | ep | routing (fan-out) | status | T\\* | dispatch p50 (µs) | combine p50 (µs) | serial p50 (µs) | tokens/s | correct |") + out.append("|---|---|---|---|---|--:|---|---|--:|--:|--:|--:|--:|:--:|") + for d in _moe_sorted(moe): + m, c = d.get("metrics", {}), d.get("correctness", {}) + rp = d.get("routing_profile", {}) + ser = m.get("serial_us_p50", m.get("roundtrip_us_p50")) + sh = d.get("shape") or {} + fo = f"{sh.get('routing','?')} ({_fnum(rp.get('fanout_mean'), '.1f')})" + # dtype shows whether the fp8 cast was inside the timed dispatch (LL) or not. + dt = sh.get("dispatch_dtype", "?") + fit = (d.get("reproduction") or {}).get("fp8_quant_in_timing") + dt += "*" if fit else ("⁺" if fit is False else "") + out.append(f"| `{d.get('backend')}` | {d.get('phase','')} | {d.get('mode','')} | {dt} | " + f"{d.get('resource_mode','')} | {d.get('ep_size','')} | {fo} | {_emoji(d.get('status'))} | " + f"{m.get('headline_tokens_per_rank','—')} | {_fnum(m.get('dispatch_us_p50'), '.1f')} | " + f"{_fnum(m.get('combine_us_p50'), '.1f')} | {_fnum(ser, '.1f')} | " + f"{_fnum(m.get('tokens_per_second'), '.3e')} | {'✅' if c.get('passed') else '❌'} |") + for d in _moe_sorted(moe): + out += _moe_sweep_table(d) + out.append("\n> EP sweep: only source tokens/rank varies along a line. **fan-out** = mean " + "destination ranks/token (representativeness — top-k spread, not a permutation). " + "Dispatch & combine timed **separately** (staging untimed); **serial = dispatch + " + "combine** (a sum, not an independently-measured chained op). 
dtype `fp8*` = fp8 cast " + "IS inside the timed dispatch (LL kernel); `fp8⁺` = cast is untimed preprocessing " + "(normal mode). `mode` ll = DeepEP low-latency; `resource` = comm SM/CU regime.") + if not total: + out.append("\n> No result files found — the benchmark produced nothing.") + return "\n".join(out) + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX result summary") + ap.add_argument("--results-dir", default="results") + ap.add_argument("--runner", default=None) + ap.add_argument("--ts", default=None) + ap.add_argument("--markdown", action="store_true", + help="emit GitHub job-summary markdown (reporting only; always exits 0)") + args = ap.parse_args() + + docs = load_results(args.results_dir, args.runner, args.ts) + nccl = [d for d in docs if d["family"] == "nccl"] + moe = [d for d in docs if d["family"] == "moe"] + total = len(docs) + n_valid = sum(d.get("status") == "valid" for d in docs) + + if args.markdown: + print(render_markdown(nccl, moe, n_valid, total)) + return 0 # reporting step — never fail the job here + + print(render_plain(nccl, moe, n_valid, total)) + if total == 0: + print("ERROR: no result files found — benchmark produced nothing.") + return 1 + if n_valid < total: + print(f"ERROR: {total - n_valid} result(s) invalid — failing the job.") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/_gb300_ep_probe.py b/experimental/CollectiveX/tests/_gb300_ep_probe.py new file mode 100644 index 000000000..3889c98f5 --- /dev/null +++ b/experimental/CollectiveX/tests/_gb300_ep_probe.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +"""GB300 EP8 GO/NO-GO probe — does DeepEP work across 2 NVL72 trays (8 ranks / 2 nodes)? + +Read-only spike (no artifacts). One PATH per process (CX_PROBE_PATH), because NVSHMEM +inits once per process and the internode/LL buffers each bootstrap it. 
Reports, on rank 0, +which Buffer construction + a 1-shot dispatch/combine round-trip actually runs on this fabric: + + intranode Buffer(group, nvl, 0) (MNNVL-as-one-NVLink-domain hope) + internode Buffer(group, nvl, rdma>0) (DeepEP NVSHMEM path, over NVLink/IB) + ll Buffer(group, 0, rdma, low_latency_mode=True) (decode path; nvlink-LL allowed) + +Env (set per-rank by the srun wrapper): RANK WORLD_SIZE LOCAL_RANK MASTER_ADDR MASTER_PORT + CX_PROBE_PATH=intranode|internode|ll +""" +import os +import socket +import sys +import traceback + +import torch +import torch.distributed as dist + +RANK = int(os.environ["RANK"]) +WORLD = int(os.environ["WORLD_SIZE"]) +LR = int(os.environ["LOCAL_RANK"]) +PATH = os.environ.get("CX_PROBE_PATH", "intranode") +HOST = socket.gethostname() +H = 7168 +TOPK = 8 +EXPERTS = WORLD * 32 # 256 at world=8 — same as the real sweep +T = 8 # tiny: this is a does-it-run probe, not a timing run + + +def log(msg): + print(f"[r{RANK}@{HOST} {PATH}] {msg}", flush=True) + + +def main(): + torch.cuda.set_device(LR) + dev = torch.device(f"cuda:{LR}") + dist.init_process_group("nccl", rank=RANK, world_size=WORLD) + + import deep_ep + from deep_ep import Buffer + if RANK == 0: + import inspect + try: + import importlib.metadata as md + ver = md.version("deep_ep") + except Exception: + ver = getattr(deep_ep, "__version__", "?") + log(f"deep_ep={ver} torch={torch.__version__} cuda={torch.version.cuda}") + log(f"Buffer.__init__{inspect.signature(Buffer.__init__)}") + log(f"caps: internode_dispatch={hasattr(Buffer,'internode_dispatch')} " + f"get_dispatch_config={hasattr(Buffer,'get_dispatch_config')} " + f"low_latency_dispatch={hasattr(Buffer,'low_latency_dispatch')} " + f"ll_rdma_hint={hasattr(Buffer,'get_low_latency_rdma_size_hint')}") + + hosts = [None] * WORLD + dist.all_gather_object(hosts, HOST) + if RANK == 0: + uniq = sorted(set(hosts)) + log(f"world={WORLD} over {len(uniq)} node(s): {uniq}") + + group = dist.group.WORLD + x = torch.randn(T, H, 
dtype=torch.bfloat16, device=dev) + g = torch.Generator(device=dev).manual_seed(1234 + RANK) + idx = torch.stack([torch.randperm(EXPERTS, device=dev, generator=g)[:TOPK] + for _ in range(T)]).to(torch.int64) + w = torch.rand(T, TOPK, device=dev, generator=g).to(torch.float32) + + dist.barrier() + try: + if PATH == "intranode": + buf = Buffer(group, 1 * 1024**3, 0) + try: + Buffer.set_num_sms(24) + except Exception: + pass + ntr, ntrr, ntpe, itir, _ = buf.get_dispatch_layout(idx, EXPERTS) + rx, _ri, rw, _nre, h, _ev = buf.dispatch( + x, topk_idx=idx, topk_weights=w, num_tokens_per_rank=ntr, + num_tokens_per_rdma_rank=ntrr, is_token_in_rank=itir, + num_tokens_per_expert=ntpe) + cx, _, _ = buf.combine(rx, h, topk_weights=rw) + rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape + log(f"RESULT intranode OK: recv={tuple(rxs)} combine={tuple(cx.shape)} " + f"rdma_rank_layout={'present' if ntrr is not None else 'None'}") + + elif PATH == "internode": + buf = Buffer(group, 1 * 1024**3, 1 * 1024**3) + try: + Buffer.set_num_sms(24) + except Exception: + pass + ntr, ntrr, ntpe, itir, _ = buf.get_dispatch_layout(idx, EXPERTS) + rx, _ri, rw, _nre, h, _ev = buf.dispatch( + x, topk_idx=idx, topk_weights=w, num_tokens_per_rank=ntr, + num_tokens_per_rdma_rank=ntrr, is_token_in_rank=itir, + num_tokens_per_expert=ntpe) + cx, _, _ = buf.combine(rx, h, topk_weights=rw) + rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape + log(f"RESULT internode OK: recv={tuple(rxs)} combine={tuple(cx.shape)} " + f"rdma_rank_layout={'present' if ntrr is not None else 'None'}") + + elif PATH == "ll": + num_max = 128 + rdma = Buffer.get_low_latency_rdma_size_hint(num_max, H, WORLD, EXPERTS) + nq = max(1, EXPERTS // WORLD) + buf = Buffer(group, 0, rdma, low_latency_mode=True, num_qps_per_rank=nq, + allow_nvlink_for_low_latency_mode=True) + rx, rc, h, _ev, _hook = buf.low_latency_dispatch( + x, idx, num_max, EXPERTS, use_fp8=False, return_recv_hook=False) + cx, _ev2, _hook2 = 
buf.low_latency_combine(rx, idx, w, h) + rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape + log(f"RESULT ll OK: recv={tuple(rxs)} combine={tuple(cx.shape)}") + else: + log(f"unknown CX_PROBE_PATH={PATH}") + return 2 + dist.barrier() + except Exception as exc: + if RANK == 0: + log(f"RESULT {PATH} FAIL: {exc!r}") + tb = traceback.format_exc().strip().splitlines() + for ln in tb[-8:]: + log(f" | {ln}") + # let other ranks print their error too (often the real one is rank-specific) + else: + log(f"FAIL(non0): {exc!r}") + try: + dist.barrier() + except Exception: + pass + return 1 + finally: + try: + dist.destroy_process_group() + except Exception: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/experimental/CollectiveX/tests/capability.py b/experimental/CollectiveX/tests/capability.py new file mode 100644 index 000000000..fc10780c0 --- /dev/null +++ b/experimental/CollectiveX/tests/capability.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +"""CollectiveX capability resolver (stdlib-only — runs on a login node, no torch). + +A workflow that exposes backend x SKU x mode x dtype x contract can request combinations +no backend supports, and 'all' is not the same backend set across vendors. This static +table mirrors the adapters' SUPPORTED_* sets so the matrix compiler / a pre-flight step +can REJECT or OMIT invalid combinations BEFORE consuming a runner (review #3). The +adapters still reject at runtime — this just fails fast and keeps the matrix honest. + + python3 tests/capability.py --sku b300 --backend deepep --mode ll --dtype fp8 \ + --contract layout-and-dispatch-v1 # exit 0 if valid, 3 + reason if not + python3 tests/capability.py --list # dump the table +""" +from __future__ import annotations + +import argparse +import json +import sys + +# SKU -> vendor. The runner label's SKU prefix selects the launcher; vendor gates backend. 
# Every SKU the workflow's `sku` input offers must have an entry here: resolve() looks
# the (underscore-stripped) SKU up in this table and rejects a miss as "unknown SKU"
# before any backend/mode/dtype check runs. `b200-multinode` is one of the offered
# SKUs, so it is listed explicitly alongside the other suffixed B200/H100 pools.
SKU_VENDOR = {
    "h100": "nvidia", "h200": "nvidia", "b200": "nvidia", "b300": "nvidia",
    "gb200": "nvidia", "gb300": "nvidia", "h100-dgxc": "nvidia", "b200-dgxc": "nvidia",
    "b200-multinode": "nvidia",
    "mi355x": "amd", "mi350x": "amd", "mi325x": "amd", "mi300x": "amd",
}

# Backend capability table — MIRRORS the adapter SUPPORTED_* sets (the runtime source of
# truth). Keep in sync with ep_deepep.py / ep_mori.py. LL is decode-only; cached-layout is
# normal-only; MoRI is bf16/normal/layout-and-dispatch only.
# Per backend: which vendors it runs on, and the kernel modes, dispatch dtypes,
# measurement contracts and transports it accepts.
CAP = {
    "deepep": {
        "vendors": ["nvidia"],
        "modes": ["normal", "ll"],
        "dtypes": ["bf16", "fp8"],
        "contracts": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1"],
        "transports": ["nvlink", "rdma"],
    },
    "mori": {
        "vendors": ["amd"],
        "modes": ["normal"],
        "dtypes": ["bf16"],
        "contracts": ["layout-and-dispatch-v1"],
        "transports": ["xgmi", "rdma"],
    },
}
# nccl/rccl are collective primitives, not EP dispatch/combine — phase is meaningless.
# Maps each collective backend to the vendors it is valid on.
COLLECTIVE = {"nccl": ["nvidia"], "rccl": ["amd"]}

# 'all' resolves to a DEFINED per-vendor backend set (not the same across vendors).
+VENDOR_BACKENDS = {"nvidia": ["nccl", "deepep"], "amd": ["rccl", "mori"]} + + +def resolve(sku, backend, mode="normal", dtype="bf16", + contract="layout-and-dispatch-v1"): + """Return (ok: bool, reason: str).""" + sku = (sku or "").split("_")[0] + vendor = SKU_VENDOR.get(sku) + if vendor is None: + return False, f"unknown SKU '{sku}'" + if backend in COLLECTIVE: + if vendor not in COLLECTIVE[backend]: + return False, f"{backend} is not the {vendor} collective backend" + return True, "collective primitive (phase/dtype/mode/contract not applicable)" + cap = CAP.get(backend) + if cap is None: + return False, f"unknown backend '{backend}'" + if vendor not in cap["vendors"]: + return False, f"{backend} runs on {cap['vendors']}, not {vendor} SKU '{sku}'" + if mode not in cap["modes"]: + return False, f"{backend} modes={cap['modes']} (got '{mode}')" + if dtype not in cap["dtypes"]: + return False, f"{backend} dtypes={cap['dtypes']} (got '{dtype}')" + if contract not in cap["contracts"]: + return False, f"{backend} contracts={cap['contracts']} (got '{contract}')" + if mode == "ll" and contract == "cached-layout-comm-only-v1": + return False, "cached-layout-comm-only-v1 is meaningless for LL (layout is in-kernel)" + return True, "ok" + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX capability resolver") + ap.add_argument("--sku"); ap.add_argument("--backend") + ap.add_argument("--mode", default="normal"); ap.add_argument("--dtype", default="bf16") + ap.add_argument("--contract", default="layout-and-dispatch-v1") + ap.add_argument("--list", action="store_true") + a = ap.parse_args() + if a.list: + print(json.dumps({"sku_vendor": SKU_VENDOR, "cap": CAP, + "collective": COLLECTIVE, "vendor_backends": VENDOR_BACKENDS}, indent=2)) + return 0 + ok, reason = resolve(a.sku, a.backend, a.mode, a.dtype, a.contract) + print(f"{'VALID' if ok else 'INVALID'}: sku={a.sku} backend={a.backend} mode={a.mode} " + f"dtype={a.dtype} contract={a.contract} — 
{reason}") + return 0 if ok else 3 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py new file mode 100644 index 000000000..51ce43fbb --- /dev/null +++ b/experimental/CollectiveX/tests/ep_deepep.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""CollectiveX EP backend adapter — DeepEP (NVIDIA), normal mode. + +The harness owns the deterministic shared routing trace, the comm-only timing, and +the doc; this file owns only DeepEP's API calls and its correctness reference. +`make_problem` materializes the harness-provided rank slice (no RNG here), so every +SKU runs the identical routed workload. + +Correctness (per DeepEP's intranode test): a pure dispatch->combine round trip with no +expert compute reconstructs x only after dividing by the number of ranks each token was +sent to, so the harness expects combined ≈ x * is_token_in_rank.sum(dim=1). +""" +from __future__ import annotations + +import os +import sys +import types + +import torch +import torch.distributed as dist + +try: + from deep_ep import Buffer # type: ignore + import deep_ep # for version/provenance +except Exception as exc: # pragma: no cover - needs the built DeepEP + print("ERROR: deep_ep import failed — DeepEP must be present/built at job setup. " + f"{exc!r}", file=sys.stderr) + raise + + +def _deepep_version() -> str: + try: + import importlib.metadata as _md + return _md.version("deep_ep") + except Exception: + return getattr(deep_ep, "__version__", "unknown") + + +# DeepEP's normal-mode fp8 dispatch takes x as a (fp8, scales) tuple with a per-token +# block-128 scale (deep_ep 1.2.1 ships NO helper for this — utils is empty — so we +# implement the exact convention its kernels expect: scales [T, H//128] float32, e4m3, +# 448 = e4m3 max). 
Both directions of the cast run OUTSIDE the timed window (cast in +# make_problem, dequant in stage), so fp8 quantization is NOT included in dispatch time. +_FP8_MAX = 448.0 +_FP8_BLOCK = 128 + + +def _per_token_cast_to_fp8(x): + # x: [T, H] (H % 128 == 0) -> (x_fp8 [T,H] e4m3fn, scales [T, H//128] f32) + T, H = x.shape + xv = x.float().view(T, H // _FP8_BLOCK, _FP8_BLOCK) + amax = xv.abs().amax(dim=2).clamp(min=1e-4) # [T, H//128] + x_fp8 = (xv * (_FP8_MAX / amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(T, H) + return x_fp8, (amax / _FP8_MAX).contiguous() + + +def _per_block_dequant(x_fp8, scales): + # inverse of the above: [R,H] e4m3 + [R, H//128] f32 -> [R,H] bf16 + R, H = x_fp8.shape + xv = x_fp8.float().view(R, H // _FP8_BLOCK, _FP8_BLOCK) + return (xv * scales.unsqueeze(2)).view(R, H).to(torch.bfloat16) + + +def _per_block_dequant_3d(x_fp8, scales): + # LL recv layout: [E, S, H] e4m3 + [E, S, H//128] f32 -> [E, S, H] bf16 + E, S, H = x_fp8.shape + xv = x_fp8.float().view(E, S, H // _FP8_BLOCK, _FP8_BLOCK) + return (xv * scales.unsqueeze(-1)).view(E, S, H).to(torch.bfloat16) + + +class DeepEPBackend: + name = "deepep" + combine_needs_redispatch = False # DeepEP combine reuses the handle (its own bench does too) + # Blackwell (B300) drops GPU clocks during the tiny small-T points, so the harness + # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100. + wants_warm_burst = True + # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no + # fallback/mislabel). Expanded as each path is implemented + hardware-validated. + # normal mode: bf16 + fp8 (per-token block-128 cast) — validated intranode NVLink. + # ll mode: low_latency_dispatch/combine — verified RUNNING intranode over NVLink via + # allow_nvlink_for_low_latency_mode (IBGDA not required intranode) on 8xH100. 
+ SUPPORTED_PRECISIONS = {"bf16", "fp8"} + SUPPORTED_MODES = {"normal", "ll"} + # Both contracts (review #3): layout-and-dispatch-v1 times get_dispatch_layout INSIDE + # dispatch; cached-layout-comm-only-v1 hoists the layout out (untimed) so dispatch is + # pure comm — matching DeepEP's own benchmark. (cached-layout applies to normal mode; + # LL has no separable layout — its low_latency_dispatch computes it internally.) + SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1"} + + def __init__(self, args, rank, world_size, local_rank, device): + self.args = args + self.rank = rank + self.world_size = world_size + self.device = device + self.mode = args.mode + self.ll = (args.mode == "ll") + self.contract = args.measurement_contract + # hoist layout out of the timed dispatch only for the cached contract in normal mode. + self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll + self.group = dist.group.WORLD + assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \ + "run_ep.py must reject unsupported dtype/mode before constructing the backend" + # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest + # element at ~1/16 (3 mantissa bits); bf16 round-trip is ~5e-3. Tolerance is + # recorded in the artifact so the looser fp8 gate is explicit, not hidden. + self.fp8 = (args.dispatch_dtype == "fp8") + self.tolerance = 1.25e-1 if self.fp8 else 5e-2 + dev_sms = torch.cuda.get_device_properties(device).multi_processor_count + ver = _deepep_version() + if self.ll: + self._init_ll(args, dev_sms, ver) + else: + self._init_normal(args, rank, dev_sms, ver) + + def _init_normal(self, args, rank, dev_sms, ver): + # fp8 cast is done in make_problem / dequant in stage — both UNTIMED. So fp8 + # quantization is NOT inside the dispatch timing for DeepEP normal mode. 
+ self.fp8_in_timing = False if self.fp8 else None + self.combine_needs_redispatch = False # normal combine reuses the handle + # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points + # (review: a phase-dependent 2/4 GiB made the shared T=128 point differ between + # the decode and prefill sweeps). 4 GiB holds T up to 4096 (validated). + num_nvl_bytes = int(os.environ.get("CX_DEEPEP_NVL_BYTES", str(4 * 1024 * 1024 * 1024))) + self.buffer = Buffer(self.group, num_nvl_bytes, 0) + rm = args.resource_mode + tuned_src = None + if rm == "normalized": + num_sms = max(1, round(args.sm_fraction * dev_sms)) # ~same device fraction as MoRI + elif rm == "tuned": + # Best-available for the installed DeepEP: its OWN default SM count + # (Buffer.num_sms — the library's analytic choice; it deliberately uses + # fewer SMs). get_dispatch_config(num_ranks) returns the recommended Config + # but doesn't expose num_sms to Python, and the default already reflects it. + num_sms = int(getattr(Buffer, "num_sms", args.num_sms)) + tuned_src = "deepep-default-num_sms" + else: # default — the bring-up budget + num_sms = args.num_sms + try: + Buffer.set_num_sms(num_sms) + except Exception as exc: # pragma: no cover - version dependent + if rank == 0: + print(f"WARN: could not set num_sms={num_sms}: {exc!r}", file=sys.stderr) + self.backend_provenance = { + "deepep_version": ver, + "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}", + "mode": "normal", "resource_mode": rm, "num_sms": num_sms, "device_sms": dev_sms, + "sm_fraction": (num_sms / dev_sms), "tuned_source": tuned_src or "n/a", + "num_nvl_bytes": num_nvl_bytes, + } + + def _init_ll(self, args, dev_sms, ver): + # Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink + # via allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch + # so for fp8 the quantization IS inside the timed window (recorded honestly). 
The + # buffer is sized for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical), + # so LL is a decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop). + # set_num_sms does NOT apply (the LL kernel picks its own occupancy) — recorded n/a. + self.fp8_in_timing = (True if self.fp8 else None) + self.combine_needs_redispatch = True # re-dispatch (untimed) before each timed combine + self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128")) + self.experts = args.experts + rdma_bytes = Buffer.get_low_latency_rdma_size_hint( + self.num_max, args.hidden, self.world_size, args.experts) + # one QP per local expert is the DeepEP convention for LL + self.num_qps = max(1, args.experts // self.world_size) + self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True, + num_qps_per_rank=self.num_qps, + allow_nvlink_for_low_latency_mode=True) + self.backend_provenance = { + "deepep_version": ver, + "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}", + "mode": "ll", "resource_mode": args.resource_mode, + "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel", + "num_max_dispatch_tokens_per_rank": self.num_max, + "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps, + "low_latency_mode": True, "use_fp8": self.fp8, + } + + def buffer_cap(self, args): + # LL is sized for a fixed num_max; cap the sweep there (reported, not silent). + return self.num_max if self.ll else None + + def make_problem(self, T, idx, weights, x): + # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared trace slice. + p = types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64), + topk_weights=weights.to(torch.float32), layout=None) + if self.fp8 and not self.ll: + # normal mode: per-token block-128 cast, UNTIMED (preprocessing, mirrors the + # real producer that hands the dispatcher already-quantized activations). + # LL mode does NOT pre-cast — its kernel casts internally (timed). 
+ p.x_fp8, p.x_scales = _per_token_cast_to_fp8(x) + if self.cache_layout: + # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed) + # so the timed dispatch is pure comm. (layout-and-dispatch-v1 leaves it None + # and dispatch computes it inside the timed window.) + ntr, _, ntpe, itir, _ = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts) + p.layout = (ntr, ntpe, itir) + return p + + def dispatch(self, p): + if self.ll: + return self._dispatch_ll(p) + if p.layout is not None: # cached-layout-comm-only-v1 + num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout + else: # layout-and-dispatch-v1 (timed layout) + (num_tokens_per_rank, _, num_tokens_per_expert, + is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts) + x_in = (p.x_fp8, p.x_scales) if self.fp8 else p.x # tuple => DeepEP fp8 dispatch + recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch( + x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights, + num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=num_tokens_per_expert) + return types.SimpleNamespace( + recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle, + is_token_in_rank=is_token_in_rank) + + def _dispatch_ll(self, p): + # x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the + # cast IS inside this timed op — fp8_in_timing=True). recv is the expert-major + # 3D layout [num_local_experts, num_max*world, hidden] (+scales when fp8). + recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch( + p.x, p.topk_idx, self.num_max, self.experts, + use_fp8=self.fp8, return_recv_hook=False) + return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle) + + def stage(self, p, h): + # comm-only contract: "expert outputs" already exist as recv_x. 
Dequantize fp8 recv + # to bf16 HERE (untimed) — the expert-compute boundary — so combine moves bf16 in + # both precisions. Bf16 recv is staged as-is. (LL recv is 3D; normal recv is 2D.) + if self.ll: + if self.fp8: + recv_fp8, recv_scales = h.recv_x + h.combine_input = _per_block_dequant_3d(recv_fp8, recv_scales) + else: + h.combine_input = h.recv_x + elif self.fp8: + recv_fp8, recv_scales = h.recv_x + h.combine_input = _per_block_dequant(recv_fp8, recv_scales) + else: + h.combine_input = h.recv_x + return None + + def combine(self, p, h): + if self.ll: + # weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones. + combined_x, _event, _hook = self.buffer.low_latency_combine( + h.combine_input, p.topk_idx, p.topk_weights, h.handle) + return combined_x + combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle, + topk_weights=h.recv_topk_weights) + return combined_x + + def expected(self, p, h): + if self.ll: + # LL combine reduces each token's topk expert copies weighted by topk_weights; + # with no expert compute each copy is (the kernel's fp8 cast of) x, so + # combined ≈ x * sum(topk_weights). fp8 quant error is covered by self.tolerance. + wsum = p.topk_weights.sum(dim=1, keepdim=True) + return p.x.float() * wsum, p.T + # normal: round trip with no expert compute reconstructs x*(#destination ranks); + # for fp8 compare against the dequantized cast that was actually sent. 
+ ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float() + ref = p.x.float() + if self.fp8: + ref = _per_block_dequant(p.x_fp8, p.x_scales).float() + return ref * ranks_per_token, p.T + + def recv_tokens(self, h): + if self.ll: + return int(h.recv_count.sum().item()) # token-copies received across local experts + rx = h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x + return int(rx.shape[0]) + + def finalize(self, rc): + try: + dist.barrier() + dist.destroy_process_group() + except Exception: + pass + return rc diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py new file mode 100644 index 000000000..4b9c746ef --- /dev/null +++ b/experimental/CollectiveX/tests/ep_harness.py @@ -0,0 +1,731 @@ +#!/usr/bin/env python3 +"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness. + +Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`) +implement a small duck-typed protocol; this module owns the source-tokens-per-rank +sweep, the timing, the correctness gate, and the provenance-tagged JSON doc. + +Fair-comparison contract (hardened after review — see notes.md / plan.md): + * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs + + gate weights are generated once from a fixed seed over the *global* batch and are + identical on every SKU; each rank materializes its slice. So every platform runs + the *same* problem (no per-rank/per-platform RNG in the adapters). + * **Explicit measurement contract** (review #3): adapters conform to a NAMED timing + boundary, they do not each choose their own. layout-and-dispatch-v1 times the + routing-layout step inside dispatch (the only contract MoRI can honor); cached- + layout-comm-only-v1 hoists it out (DeepEP). Combine excludes staging in both. + Serial = SUM of the two isolated medians (NOT a measured chained op). 
+ * **Correct collective percentile**: each iteration's latency is reduced MAX across + ranks first (a collective finishes with its slowest rank), THEN percentiled — + `median_i(max_r)`, not `max_r(median_i)`. + * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and + `global_tokens = T * ep_size` are recorded for the weak/strong-scaling x toggle. + +stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported +lazily inside run_sweep) so this file `py_compile`s without torch. + +Backend protocol: + name, mode, combine_needs_redispatch, backend_provenance(dict) + buffer_cap(args) -> int|None + make_problem(T, idx, weights, x) -> problem # materialize this rank's trace slice + dispatch(problem) -> handle # pure dispatch comm (timed) + stage(problem, handle) # untimed expert-output placement + combine(problem, handle) -> tensor # pure combine comm (timed) + expected(problem, handle) -> (tensor, n_cmp) # correctness reference + recv_tokens(handle) -> int # realized tokens received this rank + finalize(rc) -> int|NoReturn +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os + +SCHEMA_VERSION = 3 # v3: explicit contracts, pooled trials p50/p90/p99, routing-identity proof, separated logical bytes + +# Phase-default sweeps — token-size regimes, NOT distinct kernels (both run normal +# mode; "decode"/"prefill" name the small/large-token regime). Powers of two for a +# clean log x-axis; clamped to the backend buffer ceiling (MoRI's registerable heap). 
+DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128] +PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096] + +_DTYPE_BYTES = {"bf16": 2, "fp16": 2, "fp8": 1} + + +def add_common_args(ap: argparse.ArgumentParser) -> None: + """CLI args shared by every backend (the entrypoint adds --backend).""" + ap.add_argument("--phase", default="decode", choices=["decode", "prefill"], + help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder") + ap.add_argument("--tokens-ladder", default="", + help="space/comma-separated source-tokens-per-rank sweep; blank = phase default") + ap.add_argument("--hidden", type=int, default=7168) + ap.add_argument("--topk", type=int, default=8) + ap.add_argument("--experts", type=int, default=256, help="TOTAL experts (fixed across EP degrees)") + ap.add_argument("--dispatch-dtype", default="bf16", choices=["bf16", "fp8"]) + # uniform = realistic top-k (fan-out ≈5.3 over EP8); balanced = load-equalized, + # one-expert-per-rank (fan-out = ep_size); balanced-rank-local = fan-out 1 (min + # comm) edge case; zipf = skewed. Default to the REALISTIC one. + ap.add_argument("--routing", default="uniform", + choices=["uniform", "balanced", "balanced-rank-local", "zipf", + "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single"]) + # EPLB (Expert-Parallel Load Balancer): replicate hot experts onto redundant physical + # slots + balanced-place so per-rank load equalizes. A pure routing-trace transform + # (tests/eplb.py); experts becomes num_logical+redundant. The remedy for `zipf` skew. 
+ ap.add_argument("--eplb", action="store_true", + help="apply EPLB expert replication/placement to the routing trace") + ap.add_argument("--num-redundant-experts", type=int, default=32, + help="EPLB: redundant physical expert slots (rounded up to a multiple of ep_size)") + # Canonical serialized workload (goal P1): consume pre-generated trace bytes instead of the + # seeded runtime generator, so a result is provably the SAME workload as another machine's + # (checksum match). Points at a dir of .npz/.manifest.json (make_workloads.py). + ap.add_argument("--workload-dir", default="", + help="dir of canonical workload traces; empty = seeded runtime generation (dev)") + ap.add_argument("--mode", default="normal", choices=["normal", "ll"], + help="kernel path: normal or low-latency (LL); LL is backend-dependent") + # Measurement contract — the EXPLICIT timing boundary every adapter must conform to + # (review #3: adapters must not each decide their own boundary). Backends declare + # SUPPORTED_CONTRACTS; run_ep.py rejects an unsupported one. + # layout-and-dispatch-v1 — dispatch timing INCLUDES routing-layout generation + # (the only contract MoRI can honor; its layout is + # computed inside the kernel and cannot be hoisted). + # cached-layout-comm-only-v1 — layout computed ONCE untimed; dispatch times pure + # comm (DeepEP-only; matches DeepEP's own benchmark). + # Combine excludes staging in BOTH (staging is untimed for every backend). + ap.add_argument("--measurement-contract", default="layout-and-dispatch-v1", + choices=["layout-and-dispatch-v1", "cached-layout-comm-only-v1"]) + ap.add_argument("--num-sms", type=int, default=24, + help="DeepEP comm-SM budget in 'default' resource-mode (MoRI uses block_num/warps)") + # Resource regime (review: budgets were neither normalized nor tuned): + # normalized — each backend restricted to ~sm_fraction of its device's units + # (DeepEP set_num_sms(frac·SMs); MoRI block_num≈frac·CUs). 
Fraction- + # based, recorded — an approximate apples-to-apples, not identical work. + # tuned — each backend's recommended/auto launch config (best achievable). + # default — DeepEP --num-sms / MoRI 80 blocks (the bring-up budget). + ap.add_argument("--resource-mode", default="normalized", + choices=["normalized", "tuned", "default"]) + ap.add_argument("--sm-fraction", type=float, default=0.18, + help="normalized mode: fraction of device SMs/CUs dedicated to comms (~24/132)") + ap.add_argument("--num-ep-groups", type=int, default=1, + help="concurrent EP groups; >1 is REJECTED (real subgroup PGs unimplemented)") + ap.add_argument("--seed", type=int, default=67) + # 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks + + # establish NVLink/NVSHMEM connections — at warmup=8 its dispatch read ~1787us + # (cold), at warmup>=30 it settles to ~85us (faster than H100, reproducible within + # ~2.5%). H100/MI355X reach steady state much sooner; the extra iters are harmless. + ap.add_argument("--warmup", type=int, default=32) + ap.add_argument("--iters", type=int, default=200, + help="timed iterations PER TRIAL; pooled across trials for percentiles") + # review #3: p99 from ~50 samples is just the max. Pool iters x trials, randomize the + # token-order each trial so warmup/clock drift doesn't correlate with T, report p50/ + # p90/p99 (p99 is the headline). 3 trials x 200 iters = 600 pooled samples per point. 
def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
    """Resolve the tokens-per-rank sweep as ``(ladder, dropped)``.

    A non-blank ``spec`` (integers separated by spaces and/or commas) takes priority;
    otherwise the phase default is used (``DECODE_LADDER`` for ``"decode"``,
    ``PREFILL_LADDER`` for any other phase). Non-positive entries are discarded and the
    remainder is de-duplicated and sorted ascending. When ``cap`` is given, points above
    it are returned separately in ``dropped`` so the caller can report them; with
    ``cap=None`` nothing is dropped.

    Raises:
        ValueError: if ``spec`` contains a token that is not an integer.
    """
    if spec and spec.strip():
        requested = [int(tok) for tok in spec.replace(",", " ").split() if tok]
    else:
        requested = DECODE_LADDER if phase == "decode" else PREFILL_LADDER
    points = sorted({t for t in requested if t > 0})
    if cap is None:
        return points, []
    kept: list[int] = []
    over: list[int] = []
    for t in points:
        (kept if t <= cap else over).append(t)
    return kept, over


def percentile(xs: list[float], q: float) -> float:
    """Return the sample of ``xs`` nearest to the ``q``-th percentile position.

    The position ``q/100 * (len - 1)`` is rounded with the built-in ``round`` (ties go
    to the even index) and clamped to the valid index range; no interpolation between
    samples is performed. An empty input yields NaN.
    """
    if not xs:
        return float("nan")
    ordered = sorted(xs)
    last = len(ordered) - 1
    pos = int(round(q / 100.0 * last))
    if pos < 0:
        pos = 0
    elif pos > last:
        pos = last
    return ordered[pos]


def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
    """Measure per-iteration CUDA-event latencies (in microseconds) on this rank.

    Args:
        torch: the torch module (passed in rather than imported here).
        fn: the callable to time. Called as ``fn()`` when ``pre`` is None, otherwise as
            ``fn(pre())``.
        warmup: number of untimed warm-up iterations (negative values count as 0).
        iters: number of timed iterations.
        pre: optional untimed setup run before EVERY iteration; its return value is
            handed to ``fn``. The device is synchronized after ``pre`` so its GPU work
            cannot leak into the timed region. This is how combine is isolated when it
            needs a fresh dispatch+stage before each sample.

    Returns:
        The raw list of ``iters`` latencies; the caller reduces across ranks per
        iteration before taking percentiles.
    """
    staged = pre is not None

    def timed_call():
        if staged:
            state = pre()
            torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        stop = torch.cuda.Event(enable_timing=True)
        start.record()
        if staged:
            fn(state)
        else:
            fn()
        stop.record()
        torch.cuda.synchronize()
        return start.elapsed_time(stop) * 1000.0  # elapsed_time is in ms -> us

    for _ in range(max(0, warmup)):
        if staged:
            state = pre()
            torch.cuda.synchronize()
            fn(state)
        else:
            fn()
        # Synchronize after EVERY warm-up iteration, not once after the loop: the
        # round-trip fn interleaves dispatch+combine on a backend's persistent comm
        # buffer, so un-synced back-to-back iterations let iteration N+1's dispatch race
        # iteration N's combine (CUDA abort on a rank -> NCCL-watchdog SIGABRT). Warm-up
        # is short, so this is cheap; timed samples already synchronize.
        torch.cuda.synchronize()

    samples = []
    for _ in range(iters):
        samples.append(timed_call())
    return samples


def comparison_key(meta: dict) -> str:
    """Return the 16-hex-char key that decides which result rows share a curve.

    The key is a truncated SHA-256 over the FIXED line configuration only: op, backend,
    mode, phase, ep_size, nodes, resource_mode (``"default"`` when absent),
    topology_class, comparison_class, measurement_contract and the JSON-serialized
    (key-sorted) shape. ``tokens_per_rank`` is the x-axis and is deliberately not part of
    the key, so e.g. EP4 vs EP8, normal vs LL, or decode vs prefill get distinct keys
    instead of being overlaid.
    """
    fields = (
        meta["op"],
        meta["backend"],
        meta["mode"],
        meta["phase"],
        str(meta["ep_size"]),
        str(meta["nodes"]),
        meta.get("resource_mode", "default"),
        meta["topology_class"],
        meta["comparison_class"],
        meta["measurement_contract"],
        json.dumps(meta["shape"], sort_keys=True),
    )
    digest = hashlib.sha256("|".join(fields).encode())
    return digest.hexdigest()[:16]
def _reduce_vec(torch, dist, device, vals, op):
    """All-reduce a list of numbers element-wise across ranks.

    ``vals`` is placed in a float64 tensor on ``device``, reduced in place with ``op``
    via ``dist.all_reduce``, and returned as a list of Python floats.
    """
    t = torch.tensor(vals, device=device, dtype=torch.float64)
    dist.all_reduce(t, op=op)
    return [float(x) for x in t.tolist()]


def _reduce_int(torch, dist, device, v: int, op) -> int:
    """All-reduce a single integer across ranks (as an int64 tensor) and return it."""
    t = torch.tensor([int(v)], device=device, dtype=torch.int64)
    dist.all_reduce(t, op=op)
    return int(t.item())


def _allgather_floats(torch, dist, device, v: float) -> list[float]:
    """Gather one scalar from every rank and return the values indexed by rank.

    Used for per-rank diagnostics (which rank is the straggler, the rank spread);
    an all_reduce cannot provide this because it collapses the per-rank values.
    """
    world = dist.get_world_size()
    out = [torch.zeros(1, device=device, dtype=torch.float64) for _ in range(world)]
    dist.all_gather(out, torch.tensor([float(v)], device=device, dtype=torch.float64))
    return [float(x.item()) for x in out]


def _histogram(xs: list[float], nbins: int = 40) -> dict:
    """Summarize ``xs`` as an equal-width histogram between its min and max.

    Gives a compact view of the pooled samples (for p99-spike debugging) without
    storing every sample.

    Returns:
        ``{"n": 0}`` for empty input; otherwise a dict with ``n``, ``min``, ``max``
        (both rounded to 3 decimals), ``bins`` and ``counts``, where
        ``len(counts) == bins`` always holds. When every sample has the same value the
        histogram degenerates to a single bin holding all samples.

    Raises:
        ValueError: if the samples are not all equal and ``nbins`` is less than 1.
    """
    if not xs:
        return {"n": 0}
    lo, hi = min(xs), max(xs)
    if hi <= lo:
        # Zero-width range: one bin is the only consistent description. Report bins=1
        # (not nbins) so that it matches the single-entry counts list.
        return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3),
                "bins": 1, "counts": [len(xs)]}
    if nbins < 1:
        raise ValueError(f"nbins must be >= 1, got {nbins}")
    counts = [0] * nbins
    span = hi - lo
    for x in xs:
        # x == hi maps to index nbins; clamp it into the last bin.
        b = min(nbins - 1, int((x - lo) / span * nbins))
        counts[b] += 1
    return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3), "bins": nbins, "counts": counts}


def _provenance_unknown(prov: dict) -> list[str]:
    """Return the keys of ``prov`` whose value is the string "unknown".

    The comparison ignores case and surrounding whitespace; non-string values are
    never reported.
    """
    return [k for k, v in prov.items() if isinstance(v, str) and v.strip().lower() == "unknown"]


def _resource_profile(prov: dict, args) -> dict:
    """Map backend-specific provenance onto the backend-independent resource vocabulary.

    Reports the requested vs achieved comm-unit fraction, the configured units/warps and
    a conformance class. Units are read from ``num_sms`` when the backend reports it
    (kind ``"sm"``), otherwise from ``block_num`` (kind ``"cu_block"``).

    Args:
        prov: the backend's provenance dict.
        args: parsed CLI namespace; ``resource_mode`` and ``sm_fraction`` are read.

    Returns:
        A JSON-serializable dict. ``target_achieved_within_tol`` is only a boolean when a
        non-zero fraction was requested (normalized mode); otherwise it is None.
    """
    dev = prov.get("device_sms") or prov.get("device_cus")
    cfg = prov.get("num_sms") if prov.get("num_sms") is not None else prov.get("block_num")
    requested = args.sm_fraction if args.resource_mode == "normalized" else None
    achieved = (cfg / dev) if (cfg and dev) else None
    floored = bool(prov.get("block_num_floored"))
    if floored:
        cls = "minimum-functional"  # backend needed MORE than requested to run
    elif args.resource_mode == "normalized":
        cls = "resource-conforming"
    elif args.resource_mode == "tuned":
        # A tuned run whose source is a "default" config is really a backend default.
        cls = "best-known" if "default" not in str(prov.get("tuned_source", "")) else "backend-default"
    else:
        cls = "backend-default"
    # Within tolerance? (normalized only: did we hit the requested fraction?)
    tol = 0.10
    target_achieved = (requested is not None and achieved is not None
                       and abs(achieved - requested) <= tol) if requested else None
    return {
        "comm_units_kind": "sm" if prov.get("num_sms") is not None else "cu_block",
        "requested_fraction": requested, "configured_units": cfg, "device_units": dev,
        "achieved_fraction": round(achieved, 4) if achieved else None,
        "warps_dispatch": prov.get("dispatch_warps"), "warps_combine": prov.get("combine_warps"),
        "qps_per_rank": prov.get("num_qps_per_rank"),
        "persistent_bytes": prov.get("num_nvl_bytes") or prov.get("num_rdma_bytes") or prov.get("heap_size"),
        "tuned_source": prov.get("tuned_source"),
        "conformance_class": cls, "tolerance": tol, "target_achieved_within_tol": target_achieved,
        "nonconforming": floored,
    }
def _derive_publication_status(v: dict) -> str:
    """Derive the publication state from the validity dimensions.

    No caller may hand-label a result "official"; the status is a pure function of the
    gates in ``v``:

    * ``"failed"``   - execution did not complete.
    * ``"invalid"``  - correctness failed, the measurement is nonconformant, or the
      workload identity is inconsistent.
    * ``"diagnostic"`` - resource-nonconforming (not a fair cross-platform point), or
      the workload identity is neither inconsistent nor consistent.
    * ``"official"`` - sound, provenance complete, canonical serialized workload.
    * ``"comparable-experimental"`` - sound but missing a publication requirement.
    """
    if v["execution_status"] != "complete":
        return "failed"
    if v["semantic_correctness"] != "pass" or v["measurement_conformance"] != "conformant" \
            or v["workload_identity"] == "inconsistent":
        return "invalid"
    sound = (v["semantic_correctness"] == "pass"
             and v["workload_identity"].startswith("consistent")
             and v["measurement_conformance"] == "conformant")
    # resource-nonconforming but otherwise sound -> diagnostic (not a fair cross-platform point)
    if v["resource_conformance"].endswith("nonconforming"):
        return "diagnostic"
    if sound and v["provenance_complete"] and v["workload_source"] == "canonical-serialized":
        return "official"
    if sound:
        return "comparable-experimental"  # measurement sound, missing a publication requirement
    return "diagnostic"


def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
    """Drive the source-tokens-per-rank sweep for one fully-specified line.

    Runs three passes over the token ladder: (1) build each per-T problem once and run
    the correctness gate, (2) time dispatch / combine / round trip over ``args.trials``
    trials, pooling the per-iteration cross-rank MAX, (3) reduce to percentiles and
    rows. Rank 0 then writes the JSON result document to ``args.out``.

    Args:
        args: parsed CLI namespace.
        backend: the EP backend adapter (dispatch/stage/combine and provenance).
        torch: the torch module.
        dist: the torch.distributed module (process group already initialized).
        device: device this rank's tensors live on.
        rank: this process's rank.
        world_size: number of ranks (used as the EP size).

    Returns:
        Exit code. 2 = invalid configuration (experts not divisible by ep_size, or an
        empty token ladder); 4 = unpinned backend provenance; non-zero ranks return 0
        once the sweep completes; rank 0 returns 0 when every point is correct and the
        routing trace is consistent across ranks, else 1.
    """
    ep_size = world_size  # num_ep_groups removed (was metadata-only; no real subgroups)
    # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the
    # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL
    # experts then remapped to physical (build_trace), so the whole sweep runs over the
    # balanced physical placement with no adapter change.
    eplb_on = getattr(args, "eplb", False)
    num_logical = getattr(args, "num_logical_experts", args.experts)
    if args.experts % ep_size != 0:
        if rank == 0:
            # The check is experts % ep_size, i.e. experts must be a multiple of ep_size.
            print(f"ERROR: experts ({args.experts}) must be divisible by ep_size ({ep_size})")
        return 2

    # Imported lazily (and only after the cheap argument check above) so the module
    # byte-compiles without torch.
    import routing  # torch-based
    import eplb  # stdlib planner + torch remap (the EPLB transform)

    experts_per_rank = args.experts // ep_size
    elem_bytes = _DTYPE_BYTES.get(args.dispatch_dtype, 2)

    # Provenance gate (review #1): refuse a comparison run with unpinned backend info.
    unknown = _provenance_unknown(backend.backend_provenance)
    if unknown and not args.allow_unknown_provenance:
        if rank == 0:
            print(f"ERROR: unpinned provenance {unknown} in {backend.backend_provenance}; "
                  f"set the commit/version env or pass --allow-unknown-provenance.")
        return 4

    cap = backend.buffer_cap(args)
    ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
    if rank == 0 and dropped:
        print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
              f"(hidden={args.hidden}); not silently truncated.")
    if not ladder:
        if rank == 0:
            print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
        return 2
    # MoRI wedges on a COLD dispatch that jumps straight to a large T; it sets
    # needs_gradual_ramp so the sweep approaches its max T via a geometric ramp from 1
    # (validated on MI355X). A naturally-gradual ladder (decode) is unchanged.
    if getattr(backend, "needs_gradual_ramp", False):
        top, ramp, t = ladder[-1], [], 1
        while t < top:
            ramp.append(t)
            t *= 2
        ramp.append(top)
        if rank == 0 and ramp != ladder:
            print(f"NOTE: {backend.name} sweep ramped gradually 1..{top} (cold-jump-safe): {ramp}")
        ladder = ramp

    MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM

    # EPLB plan (once): estimate logical load from the global logical trace at the largest
    # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB
    # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps
    # to physical when the plan is present; otherwise it's the identity (logical == physical).
    eplb_plan = None
    if eplb_on:
        ref_idx, _ = routing.build_global_routing(max(ladder) * ep_size, num_logical, args.topk,
                                                  args.routing, args.seed, num_logical // ep_size)
        load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist()
        eplb_plan = eplb.build_plan(load, args.experts, ep_size)
        if rank == 0:
            print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); "
                  f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> "
                  f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts "
                  f"replicated (hottest {eplb_plan['max_replicas']}x)")

    canonical = bool(getattr(args, "workload_dir", ""))
    loaded_workload_ids, loaded_checksums = [], {}
    if canonical:
        import workload as _wl

    def build_trace(gt):
        # canonical: load pre-serialized trace bytes (verified by checksum) so this run is
        # provably the SAME workload as any other consuming the same files. else: seeded gen.
        if canonical:
            wid = _wl.compute_workload_id(args.routing, args.hidden, args.topk, num_logical, gt, args.seed)
            idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True)
            idx_l = torch.from_numpy(idx_np).to(torch.int64)
            w = torch.from_numpy(w_np).to(torch.float32)
            if wid not in loaded_workload_ids:
                loaded_workload_ids.append(wid)
                loaded_checksums[wid] = man.get("checksums")
        else:
            idx_l, w = routing.build_global_routing(gt, num_logical, args.topk, args.routing,
                                                    args.seed, num_logical // ep_size)
        return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w

    # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold
    # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually
    # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone
    # and is also cold-jump-safe for MoRI.
    warm_T = min(ladder[-1], 128)
    warm_shapes = [t for t in ladder if t <= warm_T] or [ladder[0]]
    for wt in warm_shapes:
        wi, ww = build_trace(wt * ep_size)
        wsi, wsw = routing.rank_slice(wi, ww, rank, wt)
        wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16)
        wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx)
        for _ in range(8):
            wh = backend.dispatch(wp)
            backend.stage(wp, wh)
            backend.combine(wp, wh)
            # Synchronize after every round trip so the next dispatch cannot overlap this
            # combine on the backend's persistent comm buffer.
            torch.cuda.synchronize()
    try:
        dist.barrier()
    except Exception as exc:
        # Best-effort alignment only: a failed barrier must not abort the sweep, but it
        # should not vanish silently either.
        if rank == 0:
            print(f"NOTE: post-warm-up barrier failed ({exc!r}); continuing without it")
    # Per-point clock-ramp burst (set up below, applied inside the loop): a ONE-TIME burst
    # warms clocks, but on Blackwell (B300) the tiny small-T points let clocks drop again,
    # so a mid-sweep T=64 reads ~20x cold. Re-ramping at EACH shape keeps every timed point
    # steady-state. Gated by backend.wants_warm_burst — MoRI WEDGES on a sustained burst
    # (and is already steady at warmup=8), so it opts out. CX_FABRIC_WARM_BURST overrides.
    warm_burst = int(os.environ.get("CX_FABRIC_WARM_BURST", "40"))
    do_burst = warm_burst > 0 and getattr(backend, "wants_warm_burst", False)

    import random as _random
    elem_dispatch = elem_bytes  # fp8=1 / bf16=2 (dispatch payload element size)
    tol = getattr(backend, "tolerance", 5e-2)

    # ---- Pass 1: build the per-T problem ONCE (deterministic trace + cached layout per
    # contract), run the correctness gate ONCE. Timing is Pass 2 (pooled over trials). ----
    problems, gate = {}, {}
    routing_hashes = set()
    for T in ladder:
        gt = T * ep_size
        idx_g, w_g = build_trace(gt)
        rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g)
        gpn = args.gpus_per_node or ep_size
        rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, T, gpn,
                                                      args.scale_up_domain or None)
        routing_hashes.add(rstats["routing_hash"])
        idx_s, w_s = routing.rank_slice(idx_g, w_g, rank, T)
        x = routing.rank_activations(T, args.hidden, args.seed, rank, device, torch.bfloat16)
        problem = backend.make_problem(T, idx_s.to(device), w_s.to(device), x)
        h = backend.dispatch(problem)
        backend.stage(problem, h)
        combined = backend.combine(problem, h)
        torch.cuda.synchronize()
        recv_local = backend.recv_tokens(h)
        exp, n_cmp = backend.expected(problem, h)
        max_abs = (combined[:n_cmp].float() - exp[:n_cmp].float()).abs().max().item()
        max_rel = max_abs / (exp[:n_cmp].float().abs().max().item() + 1e-6)
        problems[T] = problem
        gate[T] = {"rstats": rstats, "recv_local": recv_local,
                   "max_rel": max_rel, "local_ok": 1 if max_rel < tol else 0}

    # ---- Pass 2: N timed trials. Token order is randomized PER TRIAL (seeded ⇒ identical
    # on every rank, so collectives stay lock-step) so warmup/clock drift can't correlate
    # with T. Per-iteration cross-rank MAX samples are POOLED across trials, then
    # percentiled (review #3: p99 from one 50-iter run is just the max). MoRI keeps
    # ascending order — it wedges on a cold jump to a large T. ----
    disp_pool = {T: [] for T in ladder}   # pooled per-iteration cross-rank MAX (dispatch)
    comb_pool = {T: [] for T in ladder}   # ... combine
    rt_pool = {T: [] for T in ladder}     # ... INDEPENDENTLY-MEASURED round trip (goal P1)
    disp_local = {T: [] for T in ladder}  # THIS rank's own dispatch samples (per-rank diag)
    order = list(ladder)
    rng = _random.Random(args.seed)
    shuffle_ok = not getattr(backend, "needs_gradual_ramp", False)
    for trial in range(max(1, args.trials)):
        if shuffle_ok:
            rng.shuffle(order)
        for T in order:
            problem = problems[T]
            if do_burst:  # re-ramp clocks at THIS shape before timing (Blackwell)
                for _ in range(warm_burst):
                    bh = backend.dispatch(problem)
                    backend.stage(problem, bh)
                    backend.combine(problem, bh)
                    # Same per-iteration sync as the warm-up round trips above.
                    torch.cuda.synchronize()
            disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p), args.warmup, args.iters)

            def prep(p=problem):
                hh = backend.dispatch(p)
                backend.stage(p, hh)
                return hh
            if backend.combine_needs_redispatch:
                comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
                                     args.warmup, args.iters, pre=prep)
            else:
                hh = prep()
                comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
                                     args.warmup, args.iters)

            # MEASURED round trip (goal P1: not a sum of percentiles): one timed region over
            # dispatch -> stage (no-op "expert" transform) -> combine -> output ready. Captures
            # shared sync / launch amortization / overlap that the isolated_sum cannot.
            def rt_once(p=problem):
                hh = backend.dispatch(p)
                backend.stage(p, hh)
                return backend.combine(p, hh)
            rt_iters = time_us(torch, lambda p=problem: rt_once(p), args.warmup, args.iters)
            # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled.
            disp_pool[T] += _reduce_vec(torch, dist, device, disp_iters, MAX)
            comb_pool[T] += _reduce_vec(torch, dist, device, comb_iters, MAX)
            rt_pool[T] += _reduce_vec(torch, dist, device, rt_iters, MAX)
            disp_local[T] += disp_iters

    # ---- Pass 3: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ----
    def pcts(xs):
        return {"p50": percentile(xs, 50), "p90": percentile(xs, 90),
                "p95": percentile(xs, 95), "p99": percentile(xs, 99)}
    rows = []
    for T in ladder:
        gt = T * ep_size
        g = gate[T]
        rstats = g["rstats"]
        d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T]
        dp, cp, rtp = pcts(d), pcts(c), pcts(rt)
        # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op
        # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput
        # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency.
        isum = {k: dp[k] + cp[k] for k in dp}
        recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM)
        recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX)
        recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN)
        global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN)
        max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0]
        point_ok = bool(global_ok) and recv_total > 0
        # Per-rank diagnostics: gather each rank's own dispatch median -> spread + straggler.
        per_rank_med = _allgather_floats(torch, dist, device, percentile(disp_local[T], 50))
        slowest_rank = max(range(len(per_rank_med)), key=lambda i: per_rank_med[i])
        rmean = sum(per_rank_med) / len(per_rank_med)
        # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv
        # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy
        # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert.
        token_rank_copies = rstats["routed_copies"]
        token_expert_copies = gt * args.topk
        H = args.hidden
        rows.append({
            "tokens_per_rank": T, "global_tokens": gt,
            "dispatch": dp, "combine": cp, "roundtrip": rtp, "isolated_sum": isum,
            # flat aliases kept for back-compat with v3 readers
            "dispatch_us_p50": dp["p50"], "dispatch_us_p90": dp["p90"], "dispatch_us_p99": dp["p99"],
            "combine_us_p50": cp["p50"], "combine_us_p90": cp["p90"], "combine_us_p99": cp["p99"],
            "roundtrip_us_p50": rtp["p50"], "roundtrip_us_p90": rtp["p90"],
            "roundtrip_us_p95": rtp["p95"], "roundtrip_us_p99": rtp["p99"],
            "isolated_sum_us_p50": isum["p50"], "isolated_sum_us_p99": isum["p99"],
            "samples_pooled": len(d), "trials": max(1, args.trials),
            "percentile_interpolation": "nearest-rank",
            "recv_tokens_max": recv_max, "recv_tokens_min": recv_min,
            "recv_tokens_mean": recv_total / world_size, "recv_tokens_total": recv_total,
            "per_rank_dispatch_us": {"min": min(per_rank_med), "mean": rmean,
                                     "max": max(per_rank_med),
                                     "spread": max(per_rank_med) - min(per_rank_med),
                                     "slowest_rank": slowest_rank},
            # dispatch carries its dtype's element size; combine input is bf16 (2B).
            "dispatch_logical_bytes": token_rank_copies * H * elem_dispatch,
            "combine_logical_bytes": token_rank_copies * H * 2,
            "byte_contracts": {
                "token_rank_payload_copies": token_rank_copies,
                "token_expert_payload_copies": token_expert_copies,
                "dispatch_bytes": token_rank_copies * H * elem_dispatch,
                "combine_bytes": token_rank_copies * H * 2,
                "fp8_scale_bytes": (token_rank_copies * (H // 128) * 4) if elem_dispatch == 1 else 0,
                "routing_index_bytes": token_expert_copies * 4,  # int32 topk_idx
                "gate_weight_bytes": token_expert_copies * 4,    # f32 topk_weights
            },
            "byte_contract": "logical-routed-payload-v1",
            # throughput from the MEASURED round trip ONLY (not isolated_sum).
            "roundtrip_tokens_per_second": (gt / (rtp["p50"] * 1e-6)) if rtp["p50"] > 0 else None,
            "raw_samples": {"dispatch": _histogram(d), "combine": _histogram(c), "roundtrip": _histogram(rt)},
            "fanout_mean": rstats["fanout_mean"], "fanout_max": rstats["fanout_max"],
            "routed_copies": rstats["routed_copies"], "expert_load_max": rstats["expert_load_max"],
            "routing_hash": rstats["routing_hash"], "locality": rstats.get("locality"),
            "correct": point_ok, "max_rel_error": max_rel,
        })
        if rank == 0:
            print(f" T={T:<5} disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} comb {cp['p50']:6.1f}/{cp['p99']:6.1f} "
                  f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(d)} fanout={rstats['fanout_mean']:.2f} "
                  f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} "
                  f"straggler=r{slowest_rank} correct={point_ok}")

    # Cross-rank workload-identity proof: every rank must have built the SAME global routing
    # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and
    # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing.
    trace_sig = int(hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest()[:15], 16)
    sig_min = _reduce_int(torch, dist, device, trace_sig, MIN)
    sig_max = _reduce_int(torch, dist, device, trace_sig, MAX)
    routing_consistent = (sig_min == sig_max == trace_sig)

    # Only rank 0 assembles and writes the result document.
    if rank != 0:
        return 0

    # status=valid requires correctness AND a proven-identical routing trace across ranks.
    all_ok = bool(rows) and all(r["correct"] for r in rows) and routing_consistent

    # ---- Multi-dimensional validity (goal P1) -> MACHINE-DERIVED publication_status. Adapters
    # never self-label "official"; status is a pure function of these gates. ----
    prov = backend.backend_provenance
    prov_unknown = _provenance_unknown(prov)
    git_run = getattr(args, "git_run", None)
    provenance_complete = (not prov_unknown
                           and bool(getattr(args, "image_digest", ""))
                           and bool(git_run)
                           and all((git_run or {}).get(k) for k in ("run_id", "source_sha")))
    floored = bool(prov.get("block_num_floored"))
    if floored:
        resource_conformance = "minimum-functional-nonconforming"
    elif args.resource_mode == "normalized":
        resource_conformance = "resource-conforming"
    elif args.resource_mode in ("tuned", "default"):
        resource_conformance = "backend-default"
    else:
        resource_conformance = "unspecified"
    # record the canonical workload identity consumed (one trace per T -> set of ids/checksums).
    if canonical and loaded_workload_ids:
        args.workload_id = (loaded_workload_ids[0] if len(loaded_workload_ids) == 1
                            else f"set:{len(loaded_workload_ids)}:{loaded_workload_ids[0]}")
        args.workload_checksums = loaded_checksums
    canonical_workload = bool(getattr(args, "workload_id", None))
    validity = {
        "execution_status": "complete" if rows else "failed",
        "semantic_correctness": "pass" if (rows and all(r["correct"] for r in rows)) else "fail",
        "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent",
        "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime",
        "measurement_conformance": "conformant",  # run_ep gate rejects nonconformant pre-run
        "resource_conformance": resource_conformance,
        "provenance_complete": provenance_complete,
    }
    publication_status = _derive_publication_status(validity)

    shape = {  # FIXED line identity (no T, no per-backend resource knobs)
        "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
        "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype,
        "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical,
    }
    nodes = int(os.environ.get("SLURM_NNODES", "1"))
    meta = {
        "op": "ep-dispatch-combine", "backend": backend.name, "mode": args.mode,
        "phase": args.phase, "world_size": world_size, "ep_size": ep_size,
        "resource_mode": args.resource_mode,
        "nodes": nodes,
        "topology_class": args.topology_class, "comparison_class": args.comparison_class,
        # honest contract name (was the misleading "comm-only-v1": dispatch INCLUDES layout
        # under layout-and-dispatch-v1). Adapters declare which they conform to.
        "measurement_contract": args.measurement_contract, "shape": shape,
        # structured placement metadata (goal P2 topology) — replaces the bare topology string.
        "placement": {
            "kind": args.placement, "nodes": nodes,
            "gpus_per_node": args.gpus_per_node or ep_size,
            "scale_up_domain": args.scale_up_domain or ((args.gpus_per_node or ep_size) * 1),
            "ranks": ep_size, "transport": args.transport,
        },
    }
    # Headline point: T=64 when it was swept, otherwise the middle row.
    headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
    env = None
    if args.env_json and os.path.exists(args.env_json):
        with open(args.env_json) as fh:
            env = json.load(fh)
    doc = {
        "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "tests/run_ep.py",
        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
        "runner": args.runner, "transport": args.transport,
        # Multi-dimensional validity + machine-derived publication status (goal P1). `status`
        # is a back-compat alias (legacy v3 readers) — publication_status is authoritative.
        "validity": validity,
        "publication_status": publication_status,
        "status": "valid" if all_ok else "invalid",
        "workload": {
            "source": validity["workload_source"],
            "workload_id": getattr(args, "workload_id", None),
            "manifest_checksums": getattr(args, "workload_checksums", None),
            "trace_signature": f"{trace_sig:015x}",
            "distinct_per_T_hashes": sorted(routing_hashes),
            # within-run (cross-rank) identity is PROVEN here; cross-hardware identity holds
            # only if another run records the SAME trace_signature / workload_id.
            "cross_rank_consistent": routing_consistent,
        },
        "comparison_key": comparison_key(meta),
        "x_axis": {"primary": "tokens_per_rank",
                   "global_relation": "global_tokens = tokens_per_rank * ep_size"},
        "backend_provenance": backend.backend_provenance,
        # backend-independent resource vocabulary + conformance class (goal P3).
        "resource_profile": _resource_profile(backend.backend_provenance, args),
        "reproduction": {
            "command": getattr(args, "reproduction_command", ""),
            "image": getattr(args, "image", "") or None,
            "image_digest": getattr(args, "image_digest", "") or None,
            "image_arch": getattr(args, "image_arch", None),
            "squash_sha256": getattr(args, "squash_sha256", None),
            "git_run": getattr(args, "git_run", None),  # repo/run/attempt/ref/sha/job/artifact
            # redaction (goal P1): command + provenance carry NO hostnames/IPs/UUIDs/private paths;
            # per-node env (hostnames, GPU UUIDs, NIC GUIDs) lives in the separate gitignored
            # env_json (CI uploads it as a workflow artifact), never inlined into this record.
            "redaction": "no hostnames/IPs/UUIDs/private-paths in command or provenance",
            "seed": args.seed, "warmup": args.warmup, "iters": args.iters,
            "trials": max(1, args.trials), "samples_per_point": (max(1, args.trials) * args.iters),
            "measurement_contract": args.measurement_contract,
            "dispatch_dtype": args.dispatch_dtype, "mode": args.mode,
            "fp8_quant_in_timing": getattr(backend, "fp8_in_timing", None),
        },
        **meta,
        "correctness": {"passed": all_ok,
                        "max_rel_error": max((r["max_rel_error"] for r in rows), default=None),
                        "tolerance": getattr(backend, "tolerance", 5e-2), "points": len(rows),
                        # honest scope: round-trip reconstruction + non-silent recv, NOT a full
                        # per-token routing/ordering/weight/padding proof (review #3).
                        "scope": "roundtrip-reconstruction-smoke-v1"},
        "routing_identity": {  # cryptographic workload-identity proof (review #3)
            "consistent_across_ranks": routing_consistent,
            "trace_signature": f"{trace_sig:015x}",
            "distinct_per_T_hashes": sorted(routing_hashes),
        },
        # EPLB plan + the per-rank load imbalance it removes (the headline of the zipf+EPLB
        # comparison). enabled=False when the run did not apply EPLB.
        "eplb": ({"enabled": True, "num_logical_experts": num_logical,
                  "num_physical_experts": args.experts,
                  "num_redundant": args.experts - num_logical,
                  "imbalance_before": eplb_plan["imbalance_before"],
                  "imbalance_after": eplb_plan["imbalance_after"],
                  "replicated_experts": eplb_plan["replicated_experts"],
                  "max_replicas": eplb_plan["max_replicas"]}
                 if eplb_plan else {"enabled": False}),
        "routing_profile": {
            "routing": args.routing,
            "fanout_mean": sum(r["fanout_mean"] for r in rows) / len(rows),
            "fanout_max": max(r["fanout_max"] for r in rows),
            "headline_hash": headline["routing_hash"],
        },
        "metrics": {  # p99 is the headline percentile (review #3); p50/p90/p95 also kept per row
            "headline_tokens_per_rank": headline["tokens_per_rank"],
            "headline_percentile": "p99",
            "dispatch_us_p50": headline["dispatch_us_p50"], "dispatch_us_p99": headline["dispatch_us_p99"],
            "combine_us_p50": headline["combine_us_p50"], "combine_us_p99": headline["combine_us_p99"],
            "roundtrip_us_p50": headline["roundtrip_us_p50"], "roundtrip_us_p99": headline["roundtrip_us_p99"],
            "isolated_sum_us_p50": headline["isolated_sum_us_p50"], "isolated_sum_us_p99": headline["isolated_sum_us_p99"],
            "isolated_sum_label": "sum of isolated dispatch+combine percentiles — NOT a measured chained op",
            "roundtrip_tokens_per_second": headline["roundtrip_tokens_per_second"],
        },
        "rows": rows, "environment": env,
    }
    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
    with open(args.out, "w") as fh:
        json.dump(doc, fh, indent=2)
        fh.write("\n")
    print(f"{backend.name} ep-dispatch-combine [{args.phase}/{args.mode}/{args.measurement_contract}]: "
          f"status={doc['status']} {len(rows)} pts, routing_consistent={routing_consistent}, "
          f"headline T={headline['tokens_per_rank']} disp_p99={headline['dispatch_us_p99']:.1f}us "
          f"-> {args.out}")
    return 0 if all_ok else 1
class MoRIBackend:
    """EP dispatch/combine adapter that drives MoRI's ``EpDispatchCombineOp``.

    The class-level flags and ``SUPPORTED_*`` sets are capability declarations;
    ``__init__`` asserts that the requested dtype/mode lies inside them.
    Construction registers the default process group with MoRI shmem, builds one
    ``EpDispatchCombineOp`` and records the launch parameters it used in
    ``backend_provenance``.
    """

    name = "mori"
    combine_needs_redispatch = True
    # MoRI wedges on a COLD dispatch jumping straight to a large T (validated on
    # MI355X); the harness ramps this backend's ladder geometrically from 1.
    needs_gradual_ramp = True
    # MoRI WEDGES under a sustained warm-up burst (the harness's Blackwell clock-ramp)
    # and is already steady at a short warm-up (~44us, reproducible) — so it opts out.
    wants_warm_burst = False
    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
    # MoRI exposes quant_type (fp8) in EpDispatchCombineConfig; added once validated.
    SUPPORTED_PRECISIONS = {"bf16"}  # + "fp8" once the fp8 quant_type path is wired
    SUPPORTED_MODES = {"normal"}  # MoRI has no separate low-latency entrypoint
    # MoRI computes its routing layout INSIDE the dispatch kernel (block_num/warps launch);
    # it cannot be hoisted, so MoRI honors only the layout-and-dispatch contract. Cross-
    # vendor comparisons must therefore use layout-and-dispatch-v1 (the common contract).
    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}

    def __init__(self, args, rank, world_size, local_rank, device):
        """Pick the launch geometry, initialise MoRI shmem and build the EP op.

        ``args`` must provide ``mode``, ``dispatch_dtype``, ``experts``, ``hidden``,
        ``topk``, ``resource_mode`` and (for ``normalized``) ``sm_fraction``.
        ``local_rank`` is accepted for interface parity and is not used here.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = args.mode
        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
        self.fp8_in_timing = None  # set when fp8 dispatch is used (whether the cast is timed)
        self.ep_size = world_size
        self.experts_per_rank = args.experts // self.ep_size
        dev_cus = torch.cuda.get_device_properties(device).multi_processor_count
        # Resource regime — map the comm budget onto CUs to mirror DeepEP's SM fraction.
        #   normalized: block_num ≈ sm_fraction · CUs (≈ the same device fraction);
        #   tuned: MoRI launch auto-tuning (API not present in this build — uses default,
        #          labeled tuned_source); default: the 80-block bring-up budget.
        # MoRI DEADLOCKS at T>=32 when block_num is reduced toward the normalized target
        # (validated on MI355X g15: block_num=46 wedges, 80 completes T=32/64 with the
        # realistic fan-out≈5.3 trace). So MoRI cannot be normalized down to DeepEP's
        # device fraction; floor it at a known-functional minimum and record that the
        # target fraction was NOT reached.
        rm = args.resource_mode
        floor = int(os.environ.get("CX_MORI_MIN_BLOCKS", "80"))  # functional minimum (deadlocks lower)
        env_blocks = os.environ.get("CX_MORI_BLOCK_NUM")
        self._block_floored = False
        if env_blocks:
            # An explicit override wins over every resource mode.
            self.block_num = int(env_blocks)
            self._block_target = self.block_num
        elif rm == "normalized":
            self._block_target = max(1, round(args.sm_fraction * dev_cus))
            self.block_num = max(floor, self._block_target)
            self._block_floored = self.block_num > self._block_target
        else:  # tuned (no launch auto-tune API in mori-0227-2) / default
            self.block_num = 80
            self._block_target = 80
        self._tuned_source = ("default-80" if rm == "tuned" else
                              ("normalized-floored" if self._block_floored else "n/a"))
        self.dispatch_warps = int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))
        self.combine_warps = int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))

        world_group = torch.distributed.group.WORLD
        torch._C._distributed_c10d._register_process_group("default", world_group)
        mori.shmem.shmem_torch_process_group_init("default")

        self._cap = self.buffer_cap(args)
        # Computed once so the op config and the recorded provenance cannot drift apart.
        max_tokens = max(512, self._cap)
        self.config = mori.ops.EpDispatchCombineConfig(
            data_type=torch.bfloat16, rank=rank, world_size=world_size,
            hidden_dim=args.hidden, scale_dim=0,
            scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
            max_num_inp_token_per_rank=max_tokens,
            num_experts_per_rank=self.experts_per_rank,
            num_experts_per_token=args.topk,
            use_external_inp_buf=False, quant_type="none",
        )
        self.op = mori.ops.EpDispatchCombineOp(self.config)
        # Provenance: MoRI has no pip version; pin via MORI_COMMIT, else the image tag
        # the launcher exported (COLLECTIVEX_IMAGE carries the mori build tag), so the
        # provenance gate has something real rather than "unknown".
        img = os.environ.get("COLLECTIVEX_IMAGE", "")
        mori_commit = os.environ.get("MORI_COMMIT") or (f"image:{img}" if img else "unknown")
        self.backend_provenance = {
            "mori_commit": mori_commit,
            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
            "max_num_inp_token_per_rank": max_tokens,
            "resource_mode": args.resource_mode, "block_num": self.block_num,
            "block_num_target": self._block_target, "block_num_floored": self._block_floored,
            "dispatch_warps": self.dispatch_warps, "combine_warps": self.combine_warps,
            "device_cus": dev_cus, "sm_fraction": (self.block_num / dev_cus),
            "tuned_source": self._tuned_source,
        }

    def buffer_cap(self, args):
        """Return the tokens-per-rank cap (``CX_MORI_MAX_TOKENS``, default 512).

        ``args`` is accepted for interface parity and is not consulted.
        """
        # Largest tokens/rank the 2 GiB registerable heap holds at hidden=7168 (512,
        # validated on-node). Override via CX_MORI_MAX_TOKENS.
        return int(os.environ.get("CX_MORI_MAX_TOKENS", "512"))

    def make_problem(self, T, idx, weights, x):
        """Package this rank's slice of the shared trace in the dtypes MoRI takes."""
        # Shared-trace slice: idx[T,topk] -> int32 (MoRI expects int32 expert ids);
        # weights[T,topk] f32; x[T,hidden] bf16; scales is a real (T,0) fp8 tensor
        # (not None) since scale_dim==0.
        indices = idx.to(torch.int32)
        scales = torch.empty((T, 0), dtype=torch.float8_e4m3fnuz, device=self.device)
        return types.SimpleNamespace(T=T, x=x, indices=indices,
                                     weights=weights.to(torch.float32), scales=scales)

    def dispatch(self, p):
        """Run one dispatch and return a handle carrying its outputs for combine."""
        (dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num) = self.op.dispatch(
            p.x, p.weights, p.scales, p.indices,
            block_num=self.block_num, warp_per_block=self.dispatch_warps)
        total_recv = int(recv_num[0].item())  # read BEFORE combine (combine resets recv_num)
        return types.SimpleNamespace(
            dispatch_output=dispatch_output, dispatch_weights=dispatch_weights,
            dispatch_indices=dispatch_indices, total_recv=total_recv,
            combine_input=dispatch_output.to(torch.bfloat16))

    def stage(self, p, h):
        """Copy the first ``h.total_recv`` rows into MoRI's registered combine buffer."""
        # comm-only contract: stage the "expert outputs" into MoRI's registered
        # combine-input buffer UNTIMED (in a real MoE the expert FFN writes here).
        buf = self.op.get_registered_combine_input_buffer(
            torch.bfloat16, hidden_dim=h.combine_input.size(1))
        buf[:h.total_recv, :].copy_(h.combine_input[:h.total_recv, :])

    def combine(self, p, h):
        """Run one combine on the dispatch handle and return the combined tensor."""
        combined, _w = self.op.combine(
            h.combine_input, h.dispatch_weights, h.dispatch_indices,
            block_num=self.block_num, warp_per_block=self.combine_warps)
        return combined

    def expected(self, p, h):
        """Return ``(reference, T)``: ``x`` scaled by each token's distinct-rank count."""
        # MoRI combine sums one copy per destination RANK ⇒ combined[i] ≈
        # x[i] * (#unique destination ranks among the token's topk experts).
        pes = p.indices.long() // self.experts_per_rank
        # One tolist() for the whole table: a per-row tolist() would cost a separate
        # device->host copy for every token.
        unique_pes = torch.tensor(
            [len(set(row)) for row in pes.tolist()], device=self.device, dtype=torch.float32
        ).unsqueeze(1)
        return p.x.float() * unique_pes, p.T

    def recv_tokens(self, h):
        """Return the received-token count captured at dispatch time."""
        return int(h.total_recv)

    def finalize(self, rc):
        """Synchronise best-effort, flush stdio and hard-exit with 0 or 1."""
        # MoRI's shmem teardown asserts after shmem_finalize(); results are already
        # written, so sync and hard-exit past it.
        try:
            dist.barrier()
        except Exception:
            # Deliberate best-effort: a failed barrier must not block the exit below.
            pass
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(0 if rc == 0 else 1)
def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
    """Return ``num_logical`` plus the redundant-slot count rounded UP to a multiple
    of ``ep_size``, so the physical experts divide evenly across ranks (symmetric
    dispatch). A negative ``num_redundant`` is treated as 0."""
    r = ((max(0, num_redundant) + ep_size - 1) // ep_size) * ep_size
    return num_logical + r


def _contiguous_rank_load(logical_load, ep_size):
    """Per-rank received load WITHOUT EPLB: logical experts placed contiguously
    (experts_per_rank = num_logical/ep_size), so rank r carries its block's total."""
    n = len(logical_load)
    per = n // ep_size
    return [sum(logical_load[r * per:(r + 1) * per]) for r in range(ep_size)]


def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
    """Build the EPLB replication + placement plan.

    logical_load: list[float] of length num_logical (token-copies per logical expert).
    num_physical: total physical slots; must be >= num_logical and a multiple of ep_size.
    ep_size:      number of ranks; must be positive and divide num_logical.

    Returns a dict of pure-Python lists/scalars: per-expert ``replicas``, the
    ``phys2log`` / ``log2phys`` / ``rank_of_phys`` maps, per-rank load before and after,
    and the max/mean imbalance before and after.

    Raises AssertionError when the size preconditions do not hold.
    """
    num_logical = len(logical_load)
    assert ep_size > 0, "ep_size must be positive"
    assert num_logical > 0, "logical_load must not be empty"
    assert num_physical >= num_logical, "num_physical must be >= num_logical"
    assert num_physical % ep_size == 0, "num_physical must be divisible by ep_size"
    assert num_logical % ep_size == 0, "num_logical must be divisible by ep_size"
    spp = num_physical // ep_size  # physical slots per rank (fixed)

    # 1) Replica allocation — start one slot per logical expert, then hand each redundant
    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
    #    The strict ">" keeps the lowest expert id on ties.
    replicas = [1] * num_logical
    for _ in range(num_physical - num_logical):
        best, best_lps = 0, -1.0
        for e in range(num_logical):
            lps = logical_load[e] / replicas[e]
            if lps > best_lps:
                best, best_lps = e, lps
        replicas[best] += 1

    # 2) Slots = (per-replica load, logical expert), one per replica.
    slots = []
    for e in range(num_logical):
        lps = logical_load[e] / replicas[e]
        slots.extend((lps, e) for _ in range(replicas[e]))

    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
    #    There are exactly ep_size * spp slots, so some rank always has room.
    slots.sort(reverse=True)
    rank_slots = [[] for _ in range(ep_size)]
    rank_load = [0.0] * ep_size
    for lps, e in slots:
        r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
                key=lambda r: rank_load[r])
        rank_slots[r].append(e)
        rank_load[r] += lps

    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
    phys2log, rank_of_phys = [], []
    for r in range(ep_size):
        for e in rank_slots[r]:
            phys2log.append(e)
            rank_of_phys.append(r)
    log2phys = [[] for _ in range(num_logical)]
    for pid, e in enumerate(phys2log):
        log2phys[e].append(pid)

    before = _contiguous_rank_load(logical_load, ep_size)
    total = sum(logical_load) or 1.0  # all-zero load: avoid dividing by zero below
    mean = total / ep_size
    return {
        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
        "rank_load_after": rank_load, "rank_load_before": before,
        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
        "replicated_experts": sum(1 for r in replicas if r > 1),
    }


def remap_idx(idx_logical, plan):
    """Map a logical routing trace onto physical replicas.

    idx_logical: torch [gt, topk] integer logical-expert ids (global trace).
    plan:        a dict returned by build_plan().

    Returns idx_physical [gt, topk] (int64, on idx_logical's device): each token's
    logical target -> one of that expert's physical replicas, SPREAD by global token id
    (row) so a hot expert's tokens fan out across its replicas (= across ranks).
    Replicas of distinct logical experts are disjoint, so a token's top-k physical ids
    stay distinct (dispatch invariant preserved)."""
    import torch
    replicas = plan["replicas"]
    num_logical = len(replicas)
    max_rc = plan["max_replicas"]
    # Build the lookup tables on the trace's own device so a CUDA trace can index them.
    device = idx_logical.device
    rc = torch.tensor(replicas, dtype=torch.int64, device=device)
    # padded [num_logical, max_rc] table of physical ids (pad with replica 0; never indexed
    # past rc[e] because the replica index is taken mod rc[e]).
    table = [list(phys) + [phys[0]] * (max_rc - len(phys)) for phys in plan["log2phys"]]
    padded = torch.tensor(table, dtype=torch.int64, device=device).reshape(num_logical, max_rc)
    gt = idx_logical.shape[0]
    rows = torch.arange(gt, dtype=torch.int64, device=device).unsqueeze(1)  # [gt,1] global token id
    e = idx_logical.to(torch.int64)                                         # [gt,topk]
    ridx = rows % rc[e]                                                     # [gt,topk] replica index
    return padded[e, ridx]                                                  # [gt,topk] physical ids


# --------------------------------------------------------------------------- self-test
if __name__ == "__main__":
    # Synthetic zipf load (popularity ∝ 1/(e+1)) — the case EPLB targets. No torch needed.
    import sys
    NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
    load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
    nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
    plan = build_plan(load, nphys, EP)
    print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
    print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
          f"(hottest expert 0 replicas={plan['replicas'][0]})")
    print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
    print(f"per-rank load AFTER (EPLB): {[round(x,3) for x in plan['rank_load_after']]}")
    print(f"imbalance (max/mean) BEFORE={plan['imbalance_before']:.2f}x AFTER={plan['imbalance_after']:.2f}x")
    # Gates: equal slot cardinality, every logical expert placed, big imbalance cut.
    assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
    assert sum(plan["replicas"]) == nphys
    assert len(plan["phys2log"]) == nphys
    assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
    # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
    assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
    assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
    assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
    # remap (if torch present): distinctness + balanced receive on a sampled zipf trace.
    try:
        import torch
        g = torch.Generator().manual_seed(0)
        probs = torch.tensor(load)
        probs = (probs / probs.sum()).expand(4096, NUM_LOGICAL)
        idx_l = torch.multinomial(probs, 8, replacement=False, generator=g).to(torch.int64)
        idx_p = remap_idx(idx_l, plan)
        assert idx_p.shape == idx_l.shape
        # top-k physical ids distinct per token
        assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
        spp = plan["slots_per_rank"]
        recv_before = [0] * EP
        recv_after = [0] * EP
        per_log = NUM_LOGICAL // EP
        for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
            for e in row_l:
                recv_before[e // per_log] += 1
            for pid in row_p:
                recv_after[pid // spp] += 1
        ib = max(recv_before) / (sum(recv_before) / EP)
        ia = max(recv_after) / (sum(recv_after) / EP)
        print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x AFTER={ia:.2f}x")
        assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
        print("remap self-test: OK")
    except ImportError:
        print("(torch absent — skipped remap self-test; planner gates passed)")
    print("EPLB self-test: PASS")
    sys.exit(0)
import re
from typing import Optional

# Stable failure modes (goal Part 3). This list is the vocabulary only; the order in which
# text evidence is matched is the order of _SIGNATURES below.
MODES = [
    "unsupported",             # capability rejected the combo (run_ep exit 5)
    "initialization-failure",  # process group / buffer / NVSHMEM bring-up failed
    "out-of-memory",
    "registration-failure",    # MR / symmetric-heap registration (e.g. MoRI errno 22)
    "correctness-failure",     # ran but reconstruction gate failed
    "timeout",                 # killed by the timeout wrapper (rc 124) — bounded hang
    "deadlock",                # collective watchdog abort (NCCL SIGABRT / rc -6 after a stall)
    "teardown-failure",        # post-finalize / shmem_finalize assertion
    "infrastructure",          # slurm / container / FS / node failure
    "unknown",
]

# (mode, signatures) pairs, tried in order; classify() returns the first mode that matches.
# A signature is a lowercase substring, or a compiled regex for patterns that a plain
# substring test cannot express.
_SIGNATURES = [
    ("unsupported", ("unsupported", "rejects", "not supported", "no fallback")),
    ("out-of-memory", ("out of memory", "outofmemory", "cuda oom", "cudaerrormemoryallocation")),
    ("registration-failure", ("errno 22", "registration", "register", "ibv_reg", "mr ")),
    ("initialization-failure", ("nvshmem", "init_process_group", "ncclcomminit", "bootstrap", "buffer(")),
    ("deadlock", ("watchdog", "sigabrt", "signal 6", "collective", "timed out waiting", "nccl timeout")),
    ("teardown-failure", ("shmem_finalize", "destroy_process_group", "teardown", "finalize")),
    ("correctness-failure", ("correct=false", "reconstruction", "max_rel",
                             re.compile(r"assertion.*tol"))),
    ("infrastructure", ("srun: error", "slurm", "node fail", "container", "no such file")),
]


def _matches(sig, text: str) -> bool:
    """True when one signature (substring or compiled regex) occurs in ``text``."""
    if isinstance(sig, str):
        return sig in text
    return sig.search(text) is not None


def classify(text: str = "", rc: Optional[int] = None) -> str:
    """Best-effort failure mode from captured stderr/stdout text and/or a process return code.

    A recognised return code (5, 124, 137/-9, 134/-6) decides the mode on its own.
    Otherwise the lower-cased text is matched against _SIGNATURES in order and the first
    hit wins. Anything else — including rc 0 with no recognised text — is "unknown".
    """
    if rc is not None:
        if rc == 5:
            return "unsupported"
        if rc == 124:
            return "timeout"   # GNU timeout SIGTERM
        if rc in (137, -9):
            return "timeout"   # SIGKILL (timeout -k)
        if rc in (134, -6):
            return "deadlock"  # SIGABRT (NCCL watchdog / assertion)
    t = (text or "").lower()
    for mode, sigs in _SIGNATURES:
        if any(_matches(s, t) for s in sigs):
            return mode
    return "unknown"


def record(text="", rc=None, case=None) -> dict:
    """A classified failure record preserving the exact case + signal for reliability views.

    ``evidence`` keeps only the last 400 characters of ``text``.
    """
    return {"failure_mode": classify(text, rc), "return_code": rc,
            "case": case or {}, "evidence": (text or "")[-400:]}


if __name__ == "__main__":
    import sys
    cases = [
        ("RuntimeError: Unsupported number of EP ranks", None, "unsupported"),
        ("", 124, "timeout"),
        ("Signal 6 (SIGABRT) received ... NCCL watchdog", None, "deadlock"),
        ("", -6, "deadlock"),
        ("cuda out of memory", None, "out-of-memory"),
        ("ibv_reg_mr failed errno 22", None, "registration-failure"),
        ("shmem_finalize teardown assertion", None, "teardown-failure"),
        ("srun: error: node failed", None, "infrastructure"),
        ("AssertionError: assertion |err| > tol", None, "correctness-failure"),
    ]
    ok = True
    for text, rc, want in cases:
        got = classify(text, rc)
        flag = "OK" if got == want else "FAIL"
        if got != want:
            ok = False
        print(f" [{flag}] rc={rc} text={text[:40]!r} -> {got} (want {want})")
    print("failure_taxonomy self-test:", "PASS" if ok else "FAIL")
    sys.exit(0 if ok else 1)
128 32 float sum -1 9.64 0.01 0.02 0 9.63 0.01 0.02 0 + 256 64 float sum -1 9.66 0.03 0.05 0 9.64 0.03 0.05 0 + 512 128 float sum -1 9.69 0.05 0.09 0 9.67 0.05 0.09 0 + 1024 256 float sum -1 9.74 0.11 0.18 0 9.72 0.11 0.18 0 + 2048 512 float sum -1 9.82 0.21 0.37 0 9.80 0.21 0.37 0 + 4096 1024 float sum -1 9.97 0.41 0.72 0 9.95 0.41 0.72 0 + 8192 2048 float sum -1 10.22 0.80 1.40 0 10.20 0.80 1.40 0 + 16384 4096 float sum -1 10.81 1.52 2.65 0 10.79 1.52 2.65 0 + 32768 8192 float sum -1 11.93 2.75 4.81 0 11.90 2.75 4.81 0 + 65536 16384 float sum -1 13.62 4.81 8.42 0 13.59 4.82 8.43 0 + 131072 32768 float sum -1 16.94 7.74 13.54 0 16.90 7.76 13.57 0 + 262144 65536 float sum -1 23.14 11.33 19.83 0 23.10 11.35 19.86 0 + 524288 131072 float sum -1 35.62 14.72 25.76 0 35.55 14.75 25.81 0 + 1048576 262144 float sum -1 60.40 17.36 30.38 0 60.30 17.39 30.43 0 + 2097152 524288 float sum -1 76.50 27.41 47.97 0 76.40 27.45 48.04 0 + 4194304 1048576 float sum -1 110.20 38.06 66.61 0 110.05 38.11 66.70 0 + 8388608 2097152 float sum -1 165.80 50.60 88.55 0 165.60 50.66 88.65 0 + 16777216 4194304 float sum -1 250.10 67.08 117.40 0 249.80 67.16 117.54 0 + 33554432 8388608 float sum -1 360.50 93.08 162.90 0 360.10 93.18 163.07 0 + 67108864 16777216 float sum -1 520.80 128.85 225.50 0 520.20 129.00 225.75 0 + 134217728 33554432 float sum -1 720.30 186.34 326.10 0 719.50 186.55 326.46 0 + 268435456 67108864 float sum -1 1080.50 248.43 434.80 0 1079.20 248.73 435.27 0 + 536870912 134217728 float sum -1 1990.20 269.76 472.10 0 1988.50 269.99 472.49 0 + 1073741824 268435456 float sum -1 3940.60 272.48 476.84 0 3938.10 272.65 477.14 0 + 2147483648 536870912 float sum -1 7850.10 273.56 478.73 0 7846.20 273.69 478.96 0 + 4294967296 1073741824 float sum -1 15680.50 273.91 479.34 0 15673.80 274.03 479.55 0 + 8589934592 2147483648 float sum -1 31250.80 274.87 481.02 0 31238.10 274.98 481.22 0 +# +# Out of bounds values : 0 OK +# Avg bus bandwidth : 168.42 +# diff --git 
a/experimental/CollectiveX/tests/make_workloads.py b/experimental/CollectiveX/tests/make_workloads.py new file mode 100644 index 000000000..cc77b1303 --- /dev/null +++ b/experimental/CollectiveX/tests/make_workloads.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +"""Generate canonical serialized workloads (goal Part 1). Runs build_workload (needs torch) for +each (routing, global_tokens) in a ladder and writes .npz + .manifest.json into a +dir that runs then consume via `run_ep.py --workload-dir`. One trace per global-token count +because the generator is not prefix-consistent across sizes. + + python3 tests/make_workloads.py --out-dir /data/sa-shared/cx_workloads \\ + --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\ + --tokens-ladder "1 2 4 8 16 32 64 128 256 512" + +Generate every routing the suites need by running once per --routing. Idempotent (same id => same +file). The dir is the cross-hardware artifact: copy it to each cluster so all consume identical bytes. 
def main() -> int:
    """Generate one canonical workload per ladder entry for a single routing.

    Parses the command line, then for every positive tokens-per-rank value T in
    --tokens-ladder builds a workload of T * ep global tokens and saves it into
    --out-dir. Returns 0. Exits with an argparse usage error when --ep is not a
    positive integer.
    """
    ap = argparse.ArgumentParser(description="Generate canonical CollectiveX workloads")
    ap.add_argument("--out-dir", required=True)
    ap.add_argument("--routing", required=True)
    ap.add_argument("--ep", type=int, required=True, help="ep_size (global_tokens = T * ep)")
    ap.add_argument("--hidden", type=int, default=7168)
    ap.add_argument("--topk", type=int, default=8)
    ap.add_argument("--experts", type=int, default=256)
    ap.add_argument("--seed", type=int, default=67)
    ap.add_argument("--tokens-ladder", default="1 2 4 8 16 32 64 128 256 512")
    a = ap.parse_args()
    # Reject a non-positive ep before touching the filesystem: 0 would raise a bare
    # ZeroDivisionError below and a negative value would yield negative token counts.
    if a.ep < 1:
        ap.error("--ep must be a positive integer")
    epr = a.experts // a.ep
    # Accept space- or comma-separated entries; parse each once, drop non-positive
    # values and duplicates.
    entries = (int(tok) for tok in a.tokens_ladder.replace(",", " ").split())
    ladder = sorted({t for t in entries if t > 0})
    os.makedirs(a.out_dir, exist_ok=True)
    made = []
    for T in ladder:
        gt = T * a.ep
        idx, w, man = wl.build_workload(a.hidden, a.topk, a.experts, a.routing, gt, a.seed, epr)
        wid = wl.save_workload(a.out_dir, idx, w, man)
        made.append((T, gt, wid))
        print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid} "
              f"(trace sha {man['checksums']['trace'][:12]})")
    print(f"wrote {len(made)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
import inspect
import sys


def sig(obj, name):
    """Return one printable line describing attribute ``name`` of ``obj``.

    The line is ``" name(signature)"`` when the attribute exists and can be
    introspected, ``" name: <missing>"`` when the attribute is absent (or None), and
    ``" name: <no signature>"`` when inspect.signature() cannot describe it.
    """
    fn = getattr(obj, name, None)
    if fn is None:
        return f" {name}: <missing>"
    try:
        return f" {name}{inspect.signature(fn)}"
    except (ValueError, TypeError):
        # inspect.signature raises these for non-callables and some C-level callables.
        return f" {name}: <no signature>"


def main():
    """Print the torch/device facts and the DeepEP API surface to stdout; return 0.

    Imports torch and deep_ep lazily so the module itself imports without them.
    """
    import importlib

    import torch
    print("=== torch / device ===")
    print("torch", torch.__version__, "cuda", torch.version.cuda)
    if torch.cuda.is_available():
        p = torch.cuda.get_device_properties(0)
        print(f"device={p.name} sms={p.multi_processor_count} "
              f"mem={p.total_memory/1e9:.0f}GB cc={p.major}.{p.minor}")
    # The dtype list depends only on the torch build, not on a visible device.
    print("fp8 dtypes:", [d for d in ("float8_e4m3fn", "float8_e4m3fnuz", "float8_e5m2")
                          if hasattr(torch, d)])

    print("\n=== deep_ep ===")
    import deep_ep
    from deep_ep import Buffer
    print("deep_ep file:", getattr(deep_ep, "__file__", "?"))
    try:
        import importlib.metadata as md
        print("deep_ep version:", md.version("deep_ep"))
    except Exception as e:
        # Best-effort: an unresolvable version is reported, not fatal.
        print("deep_ep version: <unknown>", repr(e))
    print("deep_ep dir:", [n for n in dir(deep_ep) if not n.startswith("_")])
    print("Buffer.num_sms (default):", getattr(Buffer, "num_sms", "<unset>"))

    print("\n=== Buffer signatures ===")
    print(sig(Buffer, "__init__"))
    for m in ("dispatch", "combine", "get_dispatch_layout",
              "low_latency_dispatch", "low_latency_combine",
              "clean_low_latency_buffer", "get_low_latency_rdma_size_hint",
              "get_dispatch_config", "get_combine_config", "set_num_sms",
              "get_buffer_size_hint", "internode_dispatch", "internode_combine"):
        print(sig(Buffer, m))

    print("\n=== fp8 cast helpers ===")
    # The canonical per-token fp8 cast in DeepEP's own tests/utils.
    for modname in ("deep_ep.utils", "deep_ep"):
        try:
            mod = importlib.import_module(modname)
            cands = [n for n in dir(mod) if "fp8" in n.lower() or "cast" in n.lower()
                     or "quant" in n.lower()]
            print(f"{modname}: {cands}")
        except Exception as e:
            print(f"{modname}: {e!r}")

    print("\n=== LL dispatch source (return shape / fp8 default) ===")
    for m in ("low_latency_dispatch", "low_latency_combine", "dispatch"):
        fn = getattr(Buffer, m, None)
        if fn is None:
            continue
        try:
            src = inspect.getsource(fn)
            head = "\n".join(src.splitlines()[:45])
            print(f"--- {m} (first 45 lines) ---\n{head}\n")
        except (OSError, TypeError) as e:
            print(f"--- {m}: no source ({e!r}) ---")

    print("\nPROBE_OK")
    return 0


if __name__ == "__main__":
    sys.exit(main())
def main() -> int:
    """Run one DeepEP low-latency dispatch + combine and print a go/no-go verdict.

    Reads RANK / WORLD_SIZE / LOCAL_RANK from the environment, initialises the NCCL
    process group, and attempts an fp8 LL dispatch followed by a bf16 combine. The
    per-rank outcome is MIN-reduced across ranks; rank 0 prints LL_OK or LL_FAIL.
    Returns 0 when every rank succeeded, 7 otherwise.
    """
    rank = int(os.environ.get("RANK", "0"))
    world = int(os.environ.get("WORLD_SIZE", "1"))
    local = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local)
    device = torch.device(f"cuda:{local}")
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12377")
    dist.init_process_group("nccl")

    from deep_ep import Buffer
    hidden, topk, experts = 7168, 8, 256
    T = 8            # decode-shaped
    num_max = 128    # fixed LL cap (>= max T in a decode sweep)
    nle = experts // world  # num local experts

    ok = True
    detail = ""
    try:
        rdma = Buffer.get_low_latency_rdma_size_hint(num_max, hidden, world, experts)
        if rank == 0:
            print(f"[ll] rdma_size_hint={rdma} bytes; nle={nle} num_max={num_max}")
        # LL buffer: nvl=0, rdma=hint, low_latency_mode=True. allow_nvlink default True.
        buf = Buffer(dist.group.WORLD, 0, rdma, low_latency_mode=True,
                     num_qps_per_rank=max(1, experts // world))
        # shared trace slice (same builder the harness uses)
        gi, gw = routing.build_global_routing(T * world, experts, topk, "uniform", 67, nle)
        si, sw = routing.rank_slice(gi, gw, rank, T)
        x = routing.rank_activations(T, hidden, 67, rank, device, torch.bfloat16)
        topk_idx = si.to(device).to(torch.int64)
        topk_w = sw.to(device).to(torch.float32)

        recv_x, recv_count, handle, _event, _hook = buf.low_latency_dispatch(
            x, topk_idx, num_max, experts, use_fp8=True, return_recv_hook=False)
        rfp8, rscale = recv_x if isinstance(recv_x, tuple) else (recv_x, None)
        if rank == 0:
            print(f"[ll] dispatch OK: recv_fp8={tuple(rfp8.shape)} dtype={rfp8.dtype} "
                  f"scale={None if rscale is None else tuple(rscale.shape)} "
                  f"recv_count={tuple(recv_count.shape)}")
        # dequant fp8 recv -> bf16 in the [nle, num_max*world, hidden] layout for combine
        if rscale is None:
            deq = rfp8.float()
        else:
            # Scales are applied per 128-wide group along the hidden axis.
            E, S, H = rfp8.shape
            deq = (rfp8.float().view(E, S, H // 128, 128) * rscale.unsqueeze(-1)).view(E, S, H)
        comb_in = deq.to(torch.bfloat16)
        combined, _event2, _hook2 = buf.low_latency_combine(comb_in, topk_idx, topk_w, handle)
        torch.cuda.synchronize()
        # reconstruction: combined[i] ~= dequant(x[i]) * sum_j w[i,j] (weighted reduce)
        wsum = topk_w.sum(dim=1, keepdim=True)
        ref = x.float() * wsum
        err = (combined[:T].float() - ref[:T]).abs().max().item() / (ref[:T].abs().max().item() + 1e-6)
        buf.clean_low_latency_buffer(num_max, hidden, experts)
        detail = (f"combined={tuple(combined.shape)} max_rel_err={err:.4f} "
                  f"wsum[0]={wsum[0].item():.3f}")
        if rank == 0:
            print(f"[ll] combine OK: {detail}")
    except Exception as exc:
        ok = False
        detail = f"{type(exc).__name__}: {exc}"
        if rank == 0:
            print(f"[ll] EXCEPTION: {detail}")
        # Every failing rank reports on stderr: a failure that occurs only on a
        # non-zero rank would otherwise leave no trace of its cause.
        print(f"[ll] rank {rank} EXCEPTION: {detail}", file=sys.stderr)
        traceback.print_exc()

    # reduce verdict across ranks
    v = torch.tensor([1 if ok else 0], device=device)
    dist.all_reduce(v, op=dist.ReduceOp.MIN)
    passed = int(v.item()) == 1
    if rank == 0:
        if not passed and ok:
            # Rank 0 itself succeeded, so its detail describes a success; say so.
            detail = f"failure on a non-zero rank (see stderr); rank0: {detail}"
        print("LL_OK" if passed else "LL_FAIL", detail)
    dist.destroy_process_group()
    return 0 if passed else 7


if __name__ == "__main__":
    raise SystemExit(main())
+ except Exception as e: + print(" ", repr(e)) + # surface any quant enum the module exposes + for name in dir(mori.ops): + if "quant" in name.lower() or "Quant" in name: + obj = getattr(mori.ops, name) + print(f"[mori] ops.{name} = {obj}") + if hasattr(obj, "__members__"): + print(" members:", list(obj.__members__)) + + hidden, topk, experts = 7168, 8, 256 + T = 8 + epr = experts // world + world_group = torch.distributed.group.WORLD + torch._C._distributed_c10d._register_process_group("default", world_group) + mori.shmem.shmem_torch_process_group_init("default") + + # candidate fp8 quant_type values to try (string and enum forms) + candidates = [] + QT = getattr(mori.ops, "EpDispatchCombineQuantType", None) or getattr(mori.ops, "QuantType", None) + if QT is not None and hasattr(QT, "__members__"): + for mname in QT.__members__: + if "8" in mname or "fp8" in mname.lower() or "FP8" in mname: + candidates.append((f"enum:{mname}", QT.__members__[mname])) + for s in ("fp8", "fp8_e4m3", "e4m3"): + candidates.append((f"str:{s}", s)) + + if rank == 0: + print(f"[mori] fp8 quant_type candidates: {[c[0] for c in candidates]}") + + gi, gw = routing.build_global_routing(T * world, experts, topk, "uniform", 67, epr) + si, sw = routing.rank_slice(gi, gw, rank, T) + x = routing.rank_activations(T, hidden, 67, rank, device, torch.bfloat16) + indices = si.to(device).to(torch.int32) + weights = sw.to(device).to(torch.float32) + + working = None + detail = "" + for label, qt in candidates: + try: + cfg = mori.ops.EpDispatchCombineConfig( + data_type=torch.bfloat16, rank=rank, world_size=world, + hidden_dim=hidden, scale_dim=hidden // 128, + scale_type_size=torch.tensor([], dtype=torch.float32).element_size(), + max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(), + max_num_inp_token_per_rank=512, num_experts_per_rank=epr, + num_experts_per_token=topk, use_external_inp_buf=False, quant_type=qt) + op = mori.ops.EpDispatchCombineOp(cfg) + scales = torch.ones((T, 
hidden // 128), dtype=torch.float32, device=device) + out = op.dispatch(x, weights, scales, indices, block_num=80, warp_per_block=16) + recv = int(out[-1][0].item()) + dist.barrier() + working = label + detail = f"quant_type={label} dispatched recv={recv}" + if rank == 0: + print(f"[mori] FP8 DISPATCH OK with {label}: recv={recv}") + break + except Exception as exc: + if rank == 0: + print(f"[mori] {label} failed: {type(exc).__name__}: {str(exc)[:160]}") + detail = f"{type(exc).__name__}: {str(exc)[:160]}" + + v = torch.tensor([1 if working else 0], device=device) + dist.all_reduce(v, op=dist.ReduceOp.MIN) + if rank == 0: + print(("MORI_FP8_OK " + detail) if int(v.item()) == 1 else ("MORI_FP8_FAIL " + detail)) + sys.stdout.flush(); sys.stderr.flush() + os._exit(0 if int(v.item()) == 1 else 7) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/reference_ep.py b/experimental/CollectiveX/tests/reference_ep.py new file mode 100644 index 000000000..c19f854e0 --- /dev/null +++ b/experimental/CollectiveX/tests/reference_ep.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""CollectiveX independent EP reference semantics (goal Part 3). + +A from-scratch model of MoE dispatch + combine, written WITHOUT DeepEP or MoRI, used ONLY for +UNTIMED correctness validation. The point (goal: "avoid validating backend against itself"): +expected outputs come from the canonical routing trace + this independent logic, never from the +backend's own round trip. Pure numpy — runs anywhere, no torch. + +Model (ep_size ranks, experts_per_rank experts each; expert e lives on rank e // experts_per_rank): + dispatch: token t selected for expert e contributes a copy of x[t] to (rank e//epr, expert e). + expert: a deterministic per-expert transform f_e (default: scale x by (1 + e/E) — distinct + per expert so a mis-routed copy is detectable; identity is the degenerate case). 
#!/usr/bin/env python3
"""CollectiveX independent EP reference semantics (goal Part 3).

A from-scratch model of MoE dispatch + combine, written WITHOUT DeepEP or MoRI, used ONLY for
UNTIMED correctness validation. The point (goal: "avoid validating backend against itself"):
expected outputs come from the canonical routing trace + this independent logic, never from the
backend's own round trip. Pure numpy — runs anywhere, no torch.

Model (ep_size ranks, experts_per_rank experts each; expert e lives on rank e // experts_per_rank):
  dispatch: token t selected for expert e contributes a copy of x[t] to (rank e//epr, expert e).
  expert:   a deterministic per-expert transform f_e (default: scale x by (1 + e/E) — distinct
            per expert so a mis-routed copy is detectable; identity is the degenerate case).
  combine:  y[t] = sum over t's selected experts e of topk_weight[t,e] * f_e(x[t]).
            Reduction is over the token's experts; output is in SOURCE token order.

validate_dispatch() checks every (token, selected-expert) maps to the right rank+expert and the
right payload+gate weight, exactly once. validate_combine() checks the reduction, gate-weighting,
source ordering, and multiple-experts-on-one-rank. reference_combine() returns y for comparing a
backend's combined output against an independent oracle.
"""
import numpy as np


def expert_scale(e: int, experts: int) -> float:
    """Default deterministic per-expert transform factor.

    Returns 1 + e/experts, which is distinct per expert so a copy routed to the wrong expert
    produces a wrong value (identity would hide mis-routing).
    """
    return 1.0 + e / float(experts)


def _routed_copies(idx, experts_per_rank):
    """List every routed copy as (token, slot, expert, dest_rank) without validating anything."""
    num_tokens, topk = idx.shape
    copies = []
    for t in range(num_tokens):
        for k in range(topk):
            e = int(idx[t, k])
            copies.append((t, k, e, e // experts_per_rank))
    return copies


def dispatch_plan(idx: np.ndarray, experts: int, experts_per_rank: int):
    """Independent dispatch model.

    Args:
        idx: [T, topk] selected expert ids per token.
        experts: total expert count (kept for signature compatibility; not read here).
        experts_per_rank: experts hosted per rank; expert e lives on rank e // experts_per_rank.

    Returns:
        List of (token, slot, expert, dest_rank) — every routed copy, exactly once.

    Raises:
        AssertionError: if a token selects the same expert twice. Raised explicitly rather than
            via ``assert`` so the check still runs under ``python -O``.
    """
    plan = _routed_copies(idx, experts_per_rank)
    seen = set()
    for (t, _k, e, _r) in plan:
        if (t, e) in seen:
            raise AssertionError(f"token {t} selects expert {e} twice (must be distinct)")
        seen.add((t, e))
    return plan


def reference_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale):
    """y[t] = sum_k weights[t,k] * f_{idx[t,k]}(x[t]); source-token order. The independent oracle.

    Accumulates in float64 and returns an array shaped like ``x``. ``experts_per_rank`` is
    accepted for signature symmetry with the validators; the reduction does not depend on it.
    """
    T, topk = idx.shape
    y = np.zeros_like(x, dtype=np.float64)
    for t in range(T):
        for k in range(topk):
            e = int(idx[t, k])
            y[t] += float(weights[t, k]) * transform(e, experts) * x[t].astype(np.float64)
    return y


def validate_dispatch(idx, experts, experts_per_rank):
    """Check that every selected (token, expert) routes to a valid rank+expert, exactly once.

    Returns a list of human-readable error strings (empty when the routing is valid). Problems
    are REPORTED, never raised: the plan is enumerated without dispatch_plan()'s duplicate
    guard so a duplicate shows up in the returned list instead of aborting validation.
    """
    plan = _routed_copies(idx, experts_per_rank)
    errs = []
    # exactly-once: no duplicate (token, expert)
    pairs = [(t, e) for (t, _k, e, _r) in plan]
    if len(pairs) != len(set(pairs)):
        errs.append("duplicate (token,expert) routed copy")
    # expert id must name a real expert (the dest rank is derived from it)
    for (t, _k, e, _r) in plan:
        if not (0 <= e < experts):
            errs.append(f"token {t} expert {e} out of range [0,{experts})")
    # destination rank must exist; ceil-div so a non-divisible expert count still has a last rank
    ep = (experts + experts_per_rank - 1) // experts_per_rank
    for (_t, _k, _e, r) in plan:
        if not (0 <= r < ep):
            errs.append(f"dest rank {r} out of range [0,{ep})")
    return errs


def validate_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale, tol=1e-9):
    """Recompute y two ways (reference_combine vs explicit per-copy accumulation over the dispatch
    plan) and confirm they agree — exercises reduction across experts, gate-weighting, source
    ordering, and the multiple-experts-on-one-rank case (when topk experts share a rank).

    Returns (errs, info) where info["has_multi_expert_per_rank"] tells whether any token has two
    selected experts on the same rank. An empty batch (T == 0) validates cleanly.
    """
    errs = []
    y_ref = reference_combine(idx, weights, x, experts, experts_per_rank, transform)
    # explicit accumulation over the dispatch plan (independent path)
    T = idx.shape[0]
    y_acc = np.zeros((T, x.shape[1]), dtype=np.float64)
    for (t, k, e, _r) in dispatch_plan(idx, experts, experts_per_rank):
        y_acc[t] += float(weights[t, k]) * transform(e, experts) * x[t].astype(np.float64)
    # initial=0.0 keeps max() defined for a zero-size difference (no tokens)
    worst = np.abs(y_ref - y_acc).max(initial=0.0)
    if worst > tol:
        errs.append(f"combine reduction mismatch ({worst:.2e})")
    # multiple-experts-on-one-rank present?
    multi = any(len({int(e) // experts_per_rank for e in idx[t]}) < idx.shape[1] for t in range(T))
    return errs, {"has_multi_expert_per_rank": bool(multi)}


# --------------------------------------------------------------------------- self-test
if __name__ == "__main__":
    import sys
    rng = np.random.default_rng(0)
    E, EPR, T, topk, H = 256, 32, 64, 8, 16
    idx = np.stack([rng.permutation(E)[:topk] for _ in range(T)]).astype(np.int64)
    w = rng.random((T, topk)).astype(np.float32)
    x = rng.standard_normal((T, H)).astype(np.float32)
    de = validate_dispatch(idx, E, EPR)
    assert not de, de
    ce, info = validate_combine(idx, w, x, E, EPR)
    assert not ce, ce
    print(f"dispatch+combine semantics OK (multi_expert_per_rank={info['has_multi_expert_per_rank']})")
    # mis-routing is DETECTED: corrupt one expert id and confirm the oracle value changes
    y0 = reference_combine(idx, w, x, E, EPR)
    idx2 = idx.copy()
    idx2[0, 0] = (idx2[0, 0] + 1) % E
    y1 = reference_combine(idx2, w, x, E, EPR)
    assert np.abs(y0[0] - y1[0]).max() > 1e-6, "per-expert transform must make mis-routing detectable"
    print("mis-routing detectable via distinct per-expert transform OK")
    # edge cases (goal Part 3): empty rank, repeated dest rank, non-divisible handled by callers
    idx_hot = np.zeros((4, topk), dtype=np.int64)
    idx_hot[:] = np.arange(topk)  # all tokens -> experts 0..7 (all on rank 0) = hotspot
    assert not validate_dispatch(idx_hot, E, EPR), "single-rank hotspot must validate"
    print("edge case: single-rank hotspot (all topk on rank 0) OK")
    print("reference_ep self-test: PASS")
    sys.exit(0)

# Next file in this patch (its header and docstring opening share this source line):
#   diff --git a/experimental/CollectiveX/tests/routing.py b/experimental/CollectiveX/tests/routing.py
#   new file mode 100644, index 000000000..66db5a350, @@ -0,0 +1,158 @@
#   #!/usr/bin/env python3
#   """CollectiveX — deterministic, platform-independent MoE routing trace.
#!/usr/bin/env python3
"""CollectiveX — deterministic, platform-independent MoE routing trace.

Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated
ONCE from a fixed seed over the *global* token batch, indexed by global token id, and
is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k,
experts_per_rank). Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations
are per-rank (same rank ⇒ same x on any platform), so a given global token id has
identical activation everywhere without materializing a global activation tensor.

Trace classes (the rank fan-out — #destination ranks a token's top-k experts touch —
is the property that makes an EP workload representative; review caught the old
default having fan-out 1):

  * uniform             — top-k distinct experts drawn uniformly per token. The DEFAULT.
                          Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈
                          8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson.
  * balanced            — load-equalized AND maximally spread: token i, slot j →
                          (i + j·experts_per_rank) mod E, so the 8 experts sit one-per-rank
                          (fan-out = ep_size) and every expert is hit equally. The high-fan-out,
                          perfectly-balanced reference.
  * balanced-rank-local — the OLD degenerate "balanced": (i·top_k + j) mod E, i.e.
                          top_k consecutive experts, which (top_k ≤ experts/rank, aligned) all
                          land on ONE rank ⇒ fan-out 1, minimum communication. Kept as an
                          explicit edge case, honestly named.
  * zipf                — expert popularity ∝ 1/rank (skewed load), uniform-ish fan-out.

Always publish the realized fan-out so the workload is never misread again
(`routing_stats`).
"""
import hashlib
import math

import torch

_RANK_SUBSEED = 7919


def _cpu_gen(seed: int) -> "torch.Generator":
    """Return a CPU torch.Generator seeded with ``seed``."""
    g = torch.Generator(device="cpu")
    g.manual_seed(int(seed))
    return g


def build_global_routing(global_tokens: int, experts: int, topk: int,
                         routing: str, seed: int, experts_per_rank: int):
    """(idx[gt, topk] int64, weights[gt, topk] float32) on CPU — deterministic,
    independent of world/EP/platform, experts distinct within a token.

    Raises:
        ValueError: if topk > experts, the routing name is unknown, 'balanced' cannot place
            topk distinct experts with this experts_per_rank, or 'hotspot-single' gets topk < 1.

    The order of random draws is part of the trace identity: index draws first, gate
    weights last. Do not reorder them.
    """
    if topk > experts:
        raise ValueError(f"topk ({topk}) > experts ({experts})")
    gt = int(global_tokens)
    g = _cpu_gen(seed)
    if routing == "uniform":
        keys = torch.rand(gt, experts, generator=g)
        idx = keys.argsort(dim=1)[:, :topk].contiguous().to(torch.int64)
    elif routing == "balanced":
        # one expert per rank ⇒ fan-out = ep_size, perfectly balanced load.
        epr = int(experts_per_rank)
        # (i + j*epr) mod E repeats once j advances by E / gcd(epr, E); beyond that a token
        # would select the same expert twice, breaking the "distinct within a token" contract.
        period = experts // math.gcd(epr, experts) if experts else 0
        if topk > period:
            raise ValueError(
                f"balanced routing cannot place topk={topk} distinct experts with "
                f"experts={experts}, experts_per_rank={epr} (only {period} distinct slots)")
        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
        j = torch.arange(topk, dtype=torch.int64).unsqueeze(0)
        idx = (i + j * epr) % experts
    elif routing == "balanced-rank-local":
        # top_k consecutive (mod E) ⇒ all on ONE rank ⇒ fan-out 1 (min comm). Edge case.
        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
        j = torch.arange(topk, dtype=torch.int64).unsqueeze(0)
        idx = (i * topk + j) % experts
    elif routing == "zipf" or routing.startswith("zipf-"):
        # popularity ∝ 1/rank^s — s sets the skew. zipf == zipf-moderate (s=1).
        s = {"zipf": 1.0, "zipf-mild": 0.5, "zipf-moderate": 1.0, "zipf-heavy": 2.0}.get(routing)
        if s is None:
            raise ValueError(f"unknown zipf level '{routing}'")
        p = 1.0 / torch.arange(1, experts + 1, dtype=torch.float32).pow(s)
        p = (p / p.sum()).expand(gt, experts)
        idx = torch.multinomial(p, topk, replacement=False, generator=g).to(torch.int64)
    elif routing == "hotspot-single":
        # adversarial: expert 0 is in EVERY token's top-k (single hot expert/rank), the other
        # topk-1 drawn uniformly from the rest — maximal single-rank load.
        if topk < 1:
            raise ValueError(f"hotspot-single needs topk >= 1 (got {topk})")
        # One randperm per token, in token order — this draw sequence defines the trace.
        draws = [torch.randperm(experts - 1, generator=g)[:topk - 1] + 1 for _ in range(gt)]
        if draws:
            rest = torch.stack(draws).to(torch.int64)
        else:
            # torch.stack rejects an empty list; an empty batch is simply a [0, topk-1] block.
            rest = torch.empty(0, topk - 1, dtype=torch.int64)
        idx = torch.cat([torch.zeros(gt, 1, dtype=torch.int64), rest], dim=1)
    else:
        raise ValueError(f"unknown routing '{routing}' "
                         f"(uniform|balanced|balanced-rank-local|zipf[-mild|-moderate|-heavy]|hotspot-single)")
    weights = torch.softmax(torch.randn(gt, topk, generator=g), dim=1).to(torch.float32)
    return idx, weights


def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
    """Return this rank's contiguous rows [rank*T, (rank+1)*T) of the global idx/weights."""
    lo = rank * tokens_per_rank
    return idx[lo:lo + tokens_per_rank].contiguous(), weights[lo:lo + tokens_per_rank].contiguous()


def rank_activations(tokens: int, hidden: int, seed: int, rank: int, device, dtype=torch.bfloat16):
    """Per-rank activations [tokens, hidden]: drawn on CPU in float32 from a generator seeded by
    (seed, rank), then moved/cast to ``device``/``dtype`` — so the values do not depend on the
    target device's RNG."""
    g = _cpu_gen(int(seed) * _RANK_SUBSEED + int(rank) + 1)
    return torch.randn(tokens, hidden, generator=g, dtype=torch.float32).to(device=device, dtype=dtype)


def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
                     gpus_per_node: int, scale_up_domain: int = None) -> dict:
    """Locality of the routed (token, dest-rank) copies (goal Part 2 topology section).
    A token's SOURCE rank is global_id // tokens_per_rank; its DEST ranks are idx // epr.
    Reports the fraction of copies that stay on the local rank / same node / same scale-up
    domain vs cross-node / cross-domain — the property a placement (packed/striped) changes."""
    gt = idx.shape[0]
    dest = (idx // experts_per_rank).clamp(max=ep_size - 1)            # [gt, topk]
    src = (torch.arange(gt) // max(1, tokens_per_rank)).unsqueeze(1)   # [gt,1] source rank
    src = src.expand_as(dest)
    sud = scale_up_domain or (gpus_per_node * ep_size)                 # default: all one domain
    local = (dest == src)
    same_node = (dest // gpus_per_node) == (src // gpus_per_node)
    same_dom = (dest // sud) == (src // sud)
    n = dest.numel()
    return {
        "local_rank_fraction": float(local.float().mean()),
        "same_node_fraction": float(same_node.float().mean()),
        "same_scaleup_domain_fraction": float(same_dom.float().mean()),
        "cross_node_fraction": float((~same_node).float().mean()),
        "cross_domain_fraction": float((~same_dom).float().mean()),
        "gpus_per_node": gpus_per_node, "scale_up_domain": sud, "copies": int(n),
    }


def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
    """Realized routing properties for the GLOBAL trace — published per point so the
    fan-out / load can never be silently misread. idx is the global [gt, topk] tensor;
    weights the matching [gt, topk] gate weights (hashed too for workload identity).
    """
    ep = max(1, experts // max(1, experts_per_rank))
    ranks = (idx // experts_per_rank)   # [gt, topk] destination rank per assignment
    # unique destination ranks per token (fan-out)
    onehot = torch.zeros(idx.shape[0], ep, dtype=torch.bool)
    onehot.scatter_(1, ranks.clamp(max=ep - 1), True)
    fanout = onehot.sum(dim=1)          # [gt]
    hist = torch.bincount(fanout, minlength=ep + 1)[1:ep + 1].tolist()  # counts for fan-out 1..ep
    load = torch.bincount(idx.reshape(-1), minlength=experts).float()
    # token-copies SENT to each destination rank (the "send histogram", review #3).
    rank_load = torch.bincount(ranks.reshape(-1).clamp(max=ep - 1), minlength=ep).tolist()
    # SHA-256 workload identity over BOTH topk_idx and gate weights (review #3): a chart
    # point's routing is provably identical across SKUs only if both hashes match.
    idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes()
    idx_hash = hashlib.sha256(idx_bytes).hexdigest()[:16]
    if weights is not None:
        w_bytes = weights.to(torch.float32).cpu().numpy().tobytes()
        w_hash = hashlib.sha256(w_bytes).hexdigest()[:16]
        routing_hash = hashlib.sha256(idx_bytes + w_bytes).hexdigest()[:16]  # combined identity
    else:
        w_hash, routing_hash = None, idx_hash
    return {
        "fanout_mean": float(fanout.float().mean()),
        "fanout_min": int(fanout.min()), "fanout_max": int(fanout.max()),
        "fanout_hist": hist,               # index k-1 = #tokens with fan-out k
        "rank_load_hist": rank_load,       # token-copies sent to each dest rank
        "routed_copies": int(fanout.sum()),  # total (token, dest-rank) pairs
        "expert_load_min": int(load.min()), "expert_load_max": int(load.max()),
        "expert_load_mean": float(load.mean()),
        "routing_hash": routing_hash, "idx_hash": idx_hash, "weights_hash": w_hash,
    }

# Next file in this patch (its header and docstring opening share this source line):
#   diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
#   new file mode 100644, index 000000000..e9a74f6ab, @@ -0,0 +1,138 @@
#   #!/usr/bin/env python3
#   """CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun).
#
#   Picks a backend adapter (DeepEP or MoRI), runs the source-tokens-per-rank sweep
#   via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine
#   are timed SEPARATELY (see ep_harness); only T varies along the resulting line.
#!/usr/bin/env python3
"""CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun).

Picks a backend adapter (DeepEP or MoRI), runs the source-tokens-per-rank sweep
via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine
are timed SEPARATELY (see ep_harness); only T varies along the resulting line.

  torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \\
      --phase decode --runner mi355x-amds --topology-class mi355x-xgmi \\
      --transport xgmi --env-json results/env.json --out results/mi355x_mori_decode.json

  torchrun --nproc_per_node=8 tests/run_ep.py --backend deepep \\
      --phase prefill --runner b200-dgxc --topology-class b200-nvlink-island \\
      --transport nvlink --env-json results/env.json --out results/b200_deepep_prefill.json
"""
import argparse
import os
import shlex
import sys

# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
# torchrun (it executes the file as __main__, not as a package).
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import ep_harness  # noqa: E402 (stdlib-only; safe before torch)


def main() -> int:
    """Parse args, reject unsupported combinations, run the sweep, and return an exit code.

    Returns 3 when torch cannot be imported, 5 when the requested backend/precision/mode/
    contract combination is rejected, otherwise whatever ``backend.finalize(rc)`` returns.
    Rank, world size and local rank are read from the RANK / WORLD_SIZE / LOCAL_RANK
    environment variables.
    """
    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
    ap.add_argument("--backend", required=True, choices=["deepep", "mori"])
    ep_harness.add_common_args(ap)
    args = ap.parse_args()

    try:
        import torch
        import torch.distributed as dist
    except Exception as exc:  # pragma: no cover
        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
        return 3

    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")

    # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction
    # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL
    # routing trace and remaps it to the balanced physical placement (a pure routing transform,
    # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count.
    if getattr(args, "eplb", False):
        import eplb
        args.num_logical_experts = args.experts
        args.experts = eplb.physical_count(args.experts, args.num_redundant_experts, world_size)

    # Reproduction provenance (recorded in the artifact). shlex.join re-quotes each argument,
    # so a value containing spaces (e.g. a space-separated ladder) survives a copy/paste
    # back into a shell as ONE argument instead of being split.
    args.reproduction_command = (f"torchrun --nproc_per_node={world_size} tests/run_ep.py "
                                 + shlex.join(sys.argv[1:]))
    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
    # Container provenance (goal P1): arch (amd64/arm64) + local squash hash for Enroot/Pyxis.
    import platform as _plat
    _arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(_plat.machine(), _plat.machine())
    args.image_arch = _arch
    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
    # Complete GitHub provenance (goal P1): repo, run id, attempt, ref/branch, source SHA, job,
    # artifact. A result is only publication-'official' when these are present (validity gate).
    _run = {"run_id": os.environ.get("GITHUB_RUN_ID"),
            "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
            "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
            "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
            "repo": os.environ.get("GITHUB_REPOSITORY"),
            "job": os.environ.get("GITHUB_JOB"),
            "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME")}
    args.git_run = _run if any(_run.values()) else None

    # Import the backend CLASS (module-top imports torch + the backend lib; no process
    # group needed) and REJECT unsupported combos BEFORE init — never fall back or
    # mislabel (review/goal). All ranks reject identically.
    if args.backend == "mori":
        from ep_mori import MoRIBackend as Backend
    else:
        from ep_deepep import DeepEPBackend as Backend
    if args.num_ep_groups != 1:
        if rank == 0:
            print(f"ERROR: num_ep_groups={args.num_ep_groups} REJECTED — real subgroup process "
                  f"groups are unimplemented; not faking it.", file=sys.stderr)
        return 5
    sp = getattr(Backend, "SUPPORTED_PRECISIONS", {"bf16"})
    sm = getattr(Backend, "SUPPORTED_MODES", {"normal"})
    if args.dispatch_dtype not in sp or args.mode not in sm:
        if rank == 0:
            print(f"ERROR: {args.backend} REJECTS dispatch-dtype={args.dispatch_dtype} / "
                  f"mode={args.mode} — not supported on this build (no fallback). "
                  f"supported precisions={sorted(sp)} modes={sorted(sm)}.", file=sys.stderr)
        return 5
    # Measurement-contract capability (review #3): each adapter conforms to a declared
    # contract; reject anything else rather than letting it pick its own timing boundary.
    sc = getattr(Backend, "SUPPORTED_CONTRACTS", {"layout-and-dispatch-v1"})
    if args.measurement_contract not in sc:
        if rank == 0:
            print(f"ERROR: {args.backend} REJECTS measurement-contract="
                  f"{args.measurement_contract} — supported={sorted(sc)}.", file=sys.stderr)
        return 5
    if args.measurement_contract == "cached-layout-comm-only-v1" and args.mode == "ll":
        if rank == 0:
            print("ERROR: cached-layout-comm-only-v1 is meaningless for LL (low_latency_dispatch "
                  "computes its layout internally; nothing to hoist).", file=sys.stderr)
        return 5

    # MoRI inits its shmem on a process group it registers as "default" and wants
    # the gloo+nccl combo with an explicit device_id (per its reference test);
    # DeepEP uses a plain nccl group.
    if not dist.is_initialized():
        if args.backend == "mori":
            dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank,
                                    world_size=world_size, device_id=device)
        else:
            dist.init_process_group("nccl")

    backend = Backend(args, rank, world_size, local_rank, device)
    if rank == 0:
        print(f"[run_ep] backend={args.backend} phase={args.phase} mode={args.mode} "
              f"world={world_size} ep_size={world_size} hidden={args.hidden} "
              f"topk={args.topk} experts={args.experts} dtype={args.dispatch_dtype} "
              f"routing={args.routing} seed={args.seed}")

    rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
    return backend.finalize(rc)


if __name__ == "__main__":
    raise SystemExit(main())

# Next file in this patch (its header and docstring opening share this source line):
#   diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py
#   new file mode 100644, index 000000000..54465eb16, @@ -0,0 +1,168 @@
#   #!/usr/bin/env python3
#   """CollectiveX — canonical, serialized MoE routing workloads (goal Part 1: workload identity).
#
#   A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent
#   file, and referenced by an immutable `workload_id`. Every official benchmark point consumes the
#   SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a
#   checksum match, not by trusting that two machines re-ran the same seeded generator.
#
#   Layout on disk (one workload = two files, basename = workload_id):
#     <dir>/<workload_id>.npz            topk_idx [gt,topk] int32, topk_weights [gt,topk] float32
#     <dir>/<workload_id>.manifest.json  dims, routing profile, generator version, seed, SHA-256s
#
#   Split by dependency so it runs where each step lives:
#     * build_workload() needs torch (via routing.py) — run on a node/container.
+ * load/verify/manifest need only numpy + stdlib — run on a login node or in CI. + +Seeded runtime generation (routing.build_global_routing) stays for local dev; canonical files +are how cross-hardware comparisons are gated. +""" +from __future__ import annotations + +import hashlib +import json +import os + +WORKLOAD_SCHEMA_VERSION = 1 +# Bump when routing.build_global_routing's numerics change so a stale file can't masquerade as +# current. The workload_id folds this in: same id <=> same generator + params. +GENERATOR_VERSION = "collectivex-routing-v1" +GATE_WEIGHT_FORMAT = "softmax-of-randn-f32" # how topk_weights are produced (see routing.py) + + +def _sha256(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +def compute_workload_id(routing: str, hidden: int, topk: int, experts: int, + global_tokens: int, seed: int, generator: str = GENERATOR_VERSION) -> str: + """Deterministic id over the identity-defining params. Same params+generator => same id.""" + key = (f"{generator}|routing={routing}|hidden={hidden}|topk={topk}|experts={experts}" + f"|gt={global_tokens}|seed={seed}") + return _sha256(key.encode())[:16] + + +def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank, + idx_np, weights_np, routing_stats=None): + """Assemble the manifest dict from the (numpy) trace arrays. 
def build_workload(hidden, topk, experts, routing, global_tokens, seed, experts_per_rank):
    """Generate a canonical trace. Needs torch (routing.py).

    Routing indices/weights come from ``routing.build_global_routing`` and are converted to
    int32 / float32 numpy arrays before the manifest is built over them.

    Returns:
        (idx_np, weights_np, manifest)
    """
    import numpy as np
    import routing as _routing
    idx_t, w_t = _routing.build_global_routing(global_tokens, experts, topk, routing, seed,
                                               experts_per_rank)
    rstats = _routing.routing_stats(idx_t, experts, experts_per_rank, weights=w_t)
    idx_np = idx_t.detach().cpu().numpy().astype(np.int32)
    w_np = w_t.detach().cpu().numpy().astype(np.float32)
    manifest = build_manifest(routing, hidden, topk, experts, global_tokens, seed,
                              experts_per_rank, idx_np, w_np, rstats)
    return idx_np, w_np, manifest


def save_workload(out_dir, idx_np, weights_np, manifest) -> str:
    """Write ``<workload_id>.npz`` and ``<workload_id>.manifest.json`` into ``out_dir``.

    ``out_dir`` is created if needed. Arrays are stored as int32 (``topk_idx``) and float32
    (``topk_weights``); the manifest is written as sorted, indented JSON.

    Returns:
        The ``workload_id`` taken from ``manifest`` (also the file stem).
    """
    import numpy as np
    os.makedirs(out_dir, exist_ok=True)
    wid = manifest["workload_id"]
    np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"),
                        topk_idx=idx_np.astype(np.int32),
                        topk_weights=weights_np.astype(np.float32))
    with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2, sort_keys=True)
    return wid


def load_workload(npz_path, verify=True):
    """Load a canonical trace (numpy + stdlib only).

    Args:
        npz_path: path to ``<stem>.npz`` or to the bare ``<stem>`` (str or os.PathLike); the
            manifest is read from ``<stem>.manifest.json`` next to it.
        verify: when true, recompute the checksums/workload id via ``verify_workload``.

    Returns:
        (idx_np, weights_np, manifest)

    Raises:
        ValueError: if ``verify`` is true and the on-disk data doesn't match the manifest.
    """
    import numpy as np
    npz_path = os.fspath(npz_path)
    base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path
    with open(base + ".manifest.json", encoding="utf-8") as fh:
        manifest = json.load(fh)
    # np.load returns an NpzFile that holds the archive open; the arrays are materialised on
    # key access, so close the archive as soon as both have been read.
    with np.load(base + ".npz") as z:
        idx_np, w_np = z["topk_idx"], z["topk_weights"]
    if verify:
        ok, reason = verify_workload(manifest, idx_np, w_np)
        if not ok:
            raise ValueError(f"workload checksum mismatch for {base}: {reason}")
    return idx_np, w_np, manifest


def verify_workload(manifest, idx_np, weights_np):
    """Recompute checksums and the workload id and compare them to the manifest.

    Returns:
        (ok, reason) — ``(True, "ok")`` on success, otherwise ``(False, <why>)``. A manifest
        that lacks the fields needed to recompute the workload id is reported as a failure
        rather than raising.
    """
    ib = idx_np.astype("int32").tobytes()
    wb = weights_np.astype("float32").tobytes()
    cs = manifest.get("checksums") or {}
    if _sha256(ib) != cs.get("topk_idx"):
        return False, "topk_idx hash differs"
    if _sha256(wb) != cs.get("topk_weights"):
        return False, "topk_weights hash differs"
    if _sha256(ib + wb) != cs.get("trace"):
        return False, "trace hash differs"
    # Only the field extraction is guarded, so errors raised by compute_workload_id itself
    # are not mislabelled as a malformed manifest.
    try:
        dims = manifest["dims"]
        id_args = (manifest["routing_profile"], dims["hidden"], dims["topk"], dims["experts"],
                   dims["global_tokens"], manifest["seed"])
        recorded_wid = manifest["workload_id"]
    except (KeyError, TypeError) as exc:
        return False, f"manifest is missing an identity field ({exc!r})"
    wid = compute_workload_id(*id_args, manifest.get("generator_version", GENERATOR_VERSION))
    if wid != recorded_wid:
        return False, f"workload_id mismatch (recomputed {wid} != {recorded_wid})"
    return True, "ok"


# --------------------------------------------------------------------------- self-test
if __name__ == "__main__":
    import sys
    import tempfile
    # (1) workload_id determinism + sensitivity — pure stdlib, always runs.
    a = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
    b = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
    c = compute_workload_id("uniform", 7168, 8, 256, 4096, 67)
    assert a == b, "workload_id must be deterministic"
    assert a != c, "workload_id must depend on routing"
    print(f"workload_id determinism OK (zipf={a} uniform={c})")
    # (2) build/save/load/verify roundtrip + cross-build identity — needs torch+numpy.
    try:
        import numpy as np
        try:
            idx, w, man = build_workload(7168, 8, 256, "zipf", 512, 67, 32)
            built = True
        except Exception as exc:  # torch missing on a login node
            print(f"(torch unavailable — synthesizing arrays to test load/verify: {exc!r})")
            idx = np.random.default_rng(0).integers(0, 256, size=(512, 8)).astype(np.int32)
            w = np.random.default_rng(1).random((512, 8)).astype(np.float32)
            man = build_manifest("zipf", 7168, 8, 256, 512, 67, 32, idx, w)
            built = False
        with tempfile.TemporaryDirectory() as d:
            wid = save_workload(d, idx, w, man)
            idx2, w2, man2 = load_workload(os.path.join(d, f"{wid}.npz"), verify=True)
            assert (idx2 == idx).all() and (w2 == w).all(), "roundtrip array mismatch"
            ok, reason = verify_workload(man2, idx2, w2)
            assert ok, reason
            # tamper -> must fail (+1 mod 256 always changes an index in [0, 256))
            idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256
            still_ok, _ = verify_workload(man2, idx2, w2)
            assert not still_ok, "verify must catch tampering"
        print(f"save/load/verify roundtrip OK (workload_id={wid}, built_via_torch={built})")
    except ImportError:
        print("(numpy unavailable — skipped serialization roundtrip; id logic passed)")
    print("workload self-test: PASS")
    sys.exit(0)
"""CollectiveX result validator (goal Part 1: schema + validation tooling).

Validates EP result JSON docs against ep-result-v4 and the project's semantic gates:
schema shape, provenance completeness, workload identity (incl. cross-run trace-signature
agreement within a comparison_key), measurement-contract membership, byte-contract presence,
sample counts, and — crucially — that `publication_status` is the MACHINE-DERIVED function of
`validity` (no doc may hand-label itself official). Exits non-zero when any doc claims
`official` but fails a gate (or, with --require-official, when any non-legacy doc isn't official).

Pure stdlib; uses `jsonschema` if importable, else a built-in required-key/type/enum check.
v3 docs (no publication_status) load as legacy/experimental and are reported, not failed.

    python3 validate_results.py results/*.json
    python3 validate_results.py --require-official --schema schemas/ep-result-v4.schema.json results/
"""
from __future__ import annotations

import argparse
import glob
import json
import os
import sys

MIN_SAMPLES_OFFICIAL = 100
KNOWN_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}
PUB_STATES = {"official", "comparable-experimental", "diagnostic", "invalid", "failed"}
# Status reported for v3 docs that carry no publication_status at all.
_LEGACY_STATUS = "legacy-experimental"


def derive_publication_status(v: dict) -> str:
    """Derive the publication status from a doc's ``validity`` mapping.

    MUST mirror ep_harness._derive_publication_status — the validator's job is to confirm the
    recorded status equals this derivation, so the decision order below is intentionally
    left untouched.

    Returns:
        One of "failed", "invalid", "diagnostic", "official", "comparable-experimental".
    """
    if v.get("execution_status") != "complete":
        return "failed"
    if (v.get("semantic_correctness") != "pass" or v.get("measurement_conformance") != "conformant"
            or v.get("workload_identity") == "inconsistent"):
        return "invalid"
    sound = (v.get("semantic_correctness") == "pass"
             and str(v.get("workload_identity", "")).startswith("consistent")
             and v.get("measurement_conformance") == "conformant")
    if str(v.get("resource_conformance", "")).endswith("nonconforming"):
        return "diagnostic"
    if sound and v.get("provenance_complete") and v.get("workload_source") == "canonical-serialized":
        return "official"
    if sound:
        return "comparable-experimental"
    return "diagnostic"


def _builtin_schema_check(doc, schema):
    """Dependency-free schema check: top-level required keys, two enums, non-empty rows."""
    errs = [f"missing required field '{k}'" for k in schema.get("required", []) if k not in doc]
    # enum spot-checks the built-in path can do cheaply
    ms = doc.get("measurement_contract")
    if ms is not None and ms not in KNOWN_CONTRACTS:
        errs.append(f"unknown measurement_contract '{ms}'")
    ps = doc.get("publication_status")
    if ps is not None and ps not in PUB_STATES:
        errs.append(f"unknown publication_status '{ps}'")
    if not doc.get("rows"):
        errs.append("no rows")
    return errs


def _schema_check(doc, schema):
    """Validate ``doc`` against ``schema``; return a list of error strings (empty = valid).

    Uses jsonschema if available; else a pragmatic required-keys/enum check of the top level
    + rows.
    """
    try:
        import jsonschema
        jsonschema.validate(doc, schema)
        return []
    except ImportError:
        return _builtin_schema_check(doc, schema)
    except Exception as exc:  # jsonschema.ValidationError / SchemaError
        return [f"schema: {getattr(exc, 'message', exc)}"]


def _pooled_samples(row) -> float:
    """Return a row's pooled-sample count, treating a missing/non-numeric value as 0."""
    n = row.get("samples_pooled")
    return n if isinstance(n, (int, float)) else 0


def validate_doc(doc, schema, path):
    """Run every gate on one result doc.

    Args:
        doc: parsed result JSON (a dict).
        schema: parsed JSON schema, or a falsy value to skip the schema check.
        path: source path of the doc (accepted for callers; not used by the checks).

    Returns:
        (errs, warns, status) — lists of messages plus the recorded publication status, or
        "legacy-experimental" for docs that carry no publication_status.
    """
    errs, warns = [], []
    if "publication_status" not in doc:
        warns.append("legacy (v3, no publication_status) — loads as experimental, not comparable as official")
        return errs, warns, _LEGACY_STATUS
    if schema:
        errs += _schema_check(doc, schema)

    v = doc.get("validity")
    if not isinstance(v, dict):
        # A null/absent validity derives as "failed"; any other non-object is malformed.
        if v is not None:
            errs.append("validity must be an object")
        v = {}
    recorded = doc.get("publication_status")
    derived = derive_publication_status(v)
    if recorded != derived:
        errs.append(f"publication_status '{recorded}' != machine-derived '{derived}' (validity tampered or stale)")

    # byte + contract + sample gates
    if doc.get("measurement_contract") not in KNOWN_CONTRACTS:
        errs.append(f"unknown measurement_contract {doc.get('measurement_contract')}")
    raw_rows = doc.get("rows")
    if raw_rows is None:
        raw_rows = []
    if isinstance(raw_rows, list):
        rows = [r for r in raw_rows if isinstance(r, dict)]
        malformed = len(rows) != len(raw_rows)
    else:
        rows, malformed = [], True
    if malformed:
        errs.append("rows must be a list of objects")
    for r in rows:
        if "byte_contracts" not in r:
            errs.append(f"T={r.get('tokens_per_rank')}: missing byte_contracts")
            break
        for op in ("dispatch", "combine", "roundtrip"):
            stats = r.get(op)
            if not isinstance(stats, dict) or "p99" not in stats:
                errs.append(f"T={r.get('tokens_per_rank')}: missing {op} percentiles")
                break

    # official-grade gates
    if recorded == "official":
        if not v.get("provenance_complete"):
            errs.append("official but provenance_complete=false")
        if v.get("workload_source") != "canonical-serialized":
            errs.append("official but workload not canonical-serialized")
        if not rows:
            # min()/all() over an empty sequence would otherwise pass these gates vacuously.
            errs.append("official but no rows")
        elif min(_pooled_samples(r) for r in rows) < MIN_SAMPLES_OFFICIAL:
            errs.append(f"official but a point has <{MIN_SAMPLES_OFFICIAL} pooled samples")
        if not all(r.get("correct") for r in rows):
            errs.append("official but a point failed correctness")
    return errs, warns, recorded


def main() -> int:
    """CLI entry point: validate every given file/dir and return the process exit code.

    Returns:
        1 if any doc failed a gate or a comparison_key had conflicting trace signatures,
        else 0. Unreadable files and docs whose ``family`` is not "moe" are not validated.
    """
    ap = argparse.ArgumentParser(description="CollectiveX EP result validator")
    ap.add_argument("paths", nargs="+", help="result JSON files or dirs")
    ap.add_argument("--schema", default=os.path.join(os.path.dirname(__file__), "schemas", "ep-result-v4.schema.json"))
    ap.add_argument("--require-official", action="store_true",
                    help="fail if any non-legacy doc is not 'official'")
    a = ap.parse_args()
    schema = None
    if a.schema and os.path.exists(a.schema):
        with open(a.schema, encoding="utf-8") as fh:
            schema = json.load(fh)
    files = []
    for p in a.paths:
        if os.path.isdir(p):
            files += glob.glob(os.path.join(p, "**", "*.json"), recursive=True)
        else:
            files.append(p)
    files = sorted(f for f in files if not os.path.basename(f).startswith("env_"))

    # cross-run workload identity: trace_signature must agree within a comparison_key.
    by_ck = {}
    bad = 0
    for f in files:
        try:
            with open(f, encoding="utf-8") as fh:
                doc = json.load(fh)
        except (OSError, ValueError) as exc:  # ValueError covers JSON and decoding errors
            # Still skipped (exit code unchanged), but no longer silently.
            print(f"[SKIP] {os.path.basename(f)}: unreadable JSON ({exc})")
            continue
        if not isinstance(doc, dict) or doc.get("family") != "moe":
            continue
        errs, warns, status = validate_doc(doc, schema, f)
        ck = doc.get("comparison_key")
        workload = doc.get("workload")
        sig = workload.get("trace_signature") if isinstance(workload, dict) else None
        if ck and sig:
            by_ck.setdefault(ck, {}).setdefault(sig, []).append(os.path.basename(f))
        tag = "OK" if not errs else "FAIL"
        if errs:
            bad += 1
        # Legacy docs are reported, not failed — matching the --require-official help text.
        if a.require_official and not errs and status not in ("official", _LEGACY_STATUS):
            tag = "FAIL"
            bad += 1
            errs = [f"not official (status={status})"]
        print(f"[{tag}] {os.path.basename(f):70s} status={status}")
        for e in errs:
            print(f"        ERROR: {e}")
        for w in warns:
            print(f"        note:  {w}")
    # report cross-run identity disagreements (different hardware, same config, different trace)
    for ck, sigs in by_ck.items():
        if len(sigs) > 1:
            bad += 1
            print(f"[FAIL] comparison_key {str(ck)[:12]}: {len(sigs)} DIFFERENT trace signatures — not the same workload:")
            for sig, fs in sigs.items():
                print(f"        {sig}: {', '.join(fs)}")
    print(f"\n{'FAILED' if bad else 'PASS'}: {len(files)} files, {bad} problem(s)")
    return 1 if bad else 0


if __name__ == "__main__":
    raise SystemExit(main())