diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
new file mode 100644
index 000000000..60c5d8d06
--- /dev/null
+++ b/.github/workflows/collectivex-experimental.yml
@@ -0,0 +1,249 @@
+name: CollectiveX Experimental
+
+# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
+# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no
+# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane
+# for GB200/B200 NCCL, DeepEP, and larger sweeps). Each job lands on the SKU's
+# self-hosted runner and invokes that SKU's launch script — the same
+# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use.
+
+on:
+  push:
+    branches:
+      - collectivex
+    paths:
+      - 'experimental/CollectiveX/**'
+      - '.github/workflows/collectivex-experimental.yml'
+  workflow_dispatch:
+    inputs:
+      sku:
+        # Only SKUs with a matching launchers/launch_<prefix>.sh are offered —
+        # runner.name's prefix selects the script, so an SKU without one fails.
+        description: Self-hosted runner pool (must have a CollectiveX launcher)
+        type: choice
+        default: gb200
+        options: [gb200, b200-dgxc, b200-multinode, mi355x, h100-dgxc, h200, b300, gb300]
+      benchmark:
+        # mori runs only on mi355x, deepep only on the NVIDIA SKUs; nccl (rccl-tests on mi355x) and all run on both.
+        description: Which benchmark to run
+        type: choice
+        default: nccl
+        options: [nccl, deepep, mori, all]
+      ops:
+        description: NCCL ops (space-separated); blank = default set
+        type: string
+        default: ''
+      min_bytes:
+        description: nccl-tests min message size
+        type: string
+        default: '8'
+      max_bytes:
+        description: nccl-tests max message size
+        type: string
+        default: '8G'
+      ngpus:
+        description: GPUs per node (blank = SKU default)
+        type: string
+        default: ''
+      nodes:
+        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
+        type: string
+        default: ''
+      phase:
+        # EP only. 'both' fans out to one job per phase (decode + prefill).
+        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
+        type: choice
+        default: both
+        options: [both, decode, prefill]
+      tokens_ladder:
+        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
+        type: string
+        default: ''
+      dispatch_dtype:
+        description: EP dispatch payload precision
+        type: choice
+        default: bf16
+        options: [bf16, fp8]
+      mode:
+        # normal = high-throughput kernels (decode+prefill); ll = DeepEP low-latency
+        # (decode-shaped, fp8 cast in-kernel). LL is rejected on backends without it
+        # (MoRI) and aborts on fabrics that lack it (B300) — run only where supported.
+        description: EP kernel path — normal or low-latency (LL)
+        type: choice
+        default: normal
+        options: [normal, ll]
+      resource_mode:
+        # normalized = ~sm_fraction of device units (cross-vendor apples-to-apples); tuned = each
+        # backend's own recommended launch config. NOTE(review): 'default' is offered but undocumented — confirm.
+        description: Comm resource regime
+        type: choice
+        default: normalized
+        options: [normalized, tuned, default]
+      contract:
+        # layout-and-dispatch-v1 = dispatch timing includes routing-layout gen (the only
+        # contract MoRI honors; use for cross-vendor). cached-layout-comm-only-v1 = layout
+        # hoisted out, pure-comm dispatch (DeepEP normal only).
+        description: Measurement contract (timing boundary)
+        type: choice
+        default: layout-and-dispatch-v1
+        options: [layout-and-dispatch-v1, cached-layout-comm-only-v1]
+      routing:
+        # Routing distribution of the shared trace. uniform=realistic; balanced=load-equalized;
+        # zipf*=skewed; hotspot-single=one hot expert. The skew + EPLB sweep lives here.
+        description: EP routing distribution
+        type: choice
+        default: uniform
+        options: [uniform, balanced, zipf, zipf-mild, zipf-moderate, zipf-heavy, hotspot-single]
+      eplb:
+        # EPLB = replicate hot experts + balanced-place (the remedy for skewed routing). A pure
+        # routing-trace transform; experts -> num_logical+redundant. Meaningful with zipf*.
+        description: Apply EPLB expert replication/placement
+        type: boolean
+        default: false
+
+concurrency:
+  # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do
+  # not cancel each other; push has no sku input -> shares one 'push' group.
+  # cancel-in-progress FALSE: a same-SKU dispatch waits for the in-progress run instead
+  # of cancelling it. NOTE(review): GitHub keeps only ONE pending run per group (a newer
+  # pending run replaces the older one), so space out a 3-run reproducibility sweep.
+  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}
+  cancel-in-progress: false
+
+permissions:
+  contents: read  # GITHUB_TOKEN is limited to read-only access to repository contents
+
+jobs:
+  # Push -> MI355X MoRI dispatch/combine smoke. Lands on a free mi355x-labelled runner; the
+  # launcher is resolved from the runner-name prefix (launch_mi355x-amds.sh, CX_BENCH=mori). The
+  # AMD workspace is compute-visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs.
+  experimental:
+    name: CollectiveX Experimental (${{ matrix.phase }})
+    if: github.event_name == 'push'
+    runs-on: mi355x
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        # Push = a fast MoRI SMOKE only (decode). The full sweep is workflow_dispatch.
+        phase: [decode]
+    env:
+      CX_BENCH: mori
+      CX_PHASE: ${{ matrix.phase }}
+      # SMOKE ladder capped at T<=16: MoRI + realistic (fan-out≈5.3) routing currently
+      # WEDGES at T>=32 (under investigation; DeepEP is fine), and an unguarded run hung
+      # ~1 h before the job timeout. Keep the push smoke in the known-good range; run the
+      # full sweep via workflow_dispatch (timeout-guarded). Remove the cap once fixed.
+      CX_TOKENS_LADDER: "1 2 4 8 16"
+      CX_RUN_TIMEOUT: "600"
+      # Pin to the MI355X nodes that hold the node-local squash and have a writable
+      # /var/lib/squash; other nodes need a slow cold import that can fail on lock/
+      # cache permissions. Widen once the squash is staged cluster-wide.
+      CX_NODELIST: mia1-p01-g10,mia1-p01-g15
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - name: Launch MI355X MoRI (${{ matrix.phase }})
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Results summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
+
+  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner.
+  dispatch:
+    if: github.event_name == 'workflow_dispatch'
+    runs-on: ${{ inputs.sku }}
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        # nccl/rccl are collective primitives — phase is meaningless, so run ONE job (not
+        # the same work twice). EP backends: 'both' -> decode + prefill; else a single job.
+        phase: ${{ fromJSON((inputs.benchmark == 'nccl' || inputs.benchmark == 'rccl') && '["na"]' || (inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase))) }}
+    env:
+      CX_BENCH: ${{ inputs.benchmark }}
+      CX_OPS: ${{ inputs.ops }}
+      CX_MIN_BYTES: ${{ inputs.min_bytes }}
+      CX_MAX_BYTES: ${{ inputs.max_bytes }}
+      CX_NGPUS: ${{ inputs.ngpus }}
+      CX_NODES: ${{ inputs.nodes }}
+      CX_PHASE: ${{ matrix.phase }}
+      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
+      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
+      CX_MODE: ${{ inputs.mode }}
+      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
+      CX_MEASUREMENT_CONTRACT: ${{ inputs.contract }}
+      CX_ROUTING: ${{ inputs.routing }}
+      CX_EPLB: ${{ inputs.eplb && '1' || '' }}
+      # GHA run provenance: run_ep records git_run (repo/run/attempt/ref/sha/job) -> a GHA result
+      # is provenance_complete (publication_status >= comparable-experimental, official w/ canonical).
+      COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
+      COLLECTIVEX_ARTIFACT_NAME: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+      # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
+      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+      # MI355X: pin to the warm-squash, writable nodes (see the push job).
+      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      # Reject an unsupported backend/SKU/mode/dtype/contract BEFORE consuming the runner
+      # (review #3): fail fast on the login node, not after a salloc. 'all' fans out per vendor
+      # in-container, so skip it. Inputs reach the shell via env vars, never inline ${{ }}.
+      - name: Validate capability
+        if: inputs.benchmark != 'all'
+        env: { DISPATCH_SKU: '${{ inputs.sku }}' }
+        run: |
+          python3 experimental/CollectiveX/tests/capability.py \
+            --sku "$DISPATCH_SKU" --backend "$CX_BENCH" --mode "$CX_MODE" \
+            --dtype "$CX_DISPATCH_DTYPE" --contract "$CX_MEASUREMENT_CONTRACT"
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Results summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
+
+  update-frontend-snapshot:
+    name: Update InferenceX-app snapshot
+    needs: [experimental, dispatch]  # one of these is skipped on every event, hence always() below
+    if: >-
+      always() &&
+      (
+        (github.event_name == 'push' && needs.experimental.result == 'success') ||
+        (github.event_name == 'workflow_dispatch' && needs.dispatch.result == 'success')
+      )
+    runs-on: ubuntu-latest  # GitHub-hosted: this job only POSTs a repository_dispatch event
+    steps:
+      - name: Trigger CollectiveX snapshot update
+        env:
+          FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }}
+        run: |
+          set -euo pipefail
+          curl -sSf -X POST \
+            -H "Authorization: Bearer $FRONTEND_PAT" \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
+            -d '{
+              "event_type": "update-collectivex-data",
+              "client_payload": {
+                "source_run_id": "${{ github.run_id }}"
+              }
+            }'
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
new file mode 100644
index 000000000..a4717f5ff
--- /dev/null
+++ b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,14 @@
+# in-container nccl-tests build cache
+.nccl-tests/
+# python
+__pycache__/
+*.pyc
+# generated run artifacts: captured env embeds hostnames / GPU UUIDs / NIC GUIDs,
+# so keep results out of git (CI uploads them as workflow artifacts instead).
+# Sanitized headline numbers live in CONTAINERS.md.
+results/*.json
+results/plots/
+results/raw_*.txt
+results/raw_*.txt.stderr
+# running local-only reflection log (not a committed artifact)
+notes.md
diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
new file mode 100644
index 000000000..6b409bac0
--- /dev/null
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -0,0 +1,75 @@
+# CollectiveX — container & library versions
+
+One **multi-arch** container (imported by tag, index digest recorded) is used for all
+NVIDIA SKUs, so B200 (x86_64) and GB200 (aarch64) share a single reference and the
+B200-vs-GB200 comparison is truly same-image. Set in `launchers/common.sh` (`cx_default_image`).
+
+## Default container (all NVIDIA SKUs)
+
+- **Image:** import by tag **`lmsysorg/sglang:v0.5.11-cu130`** (multi-arch OCI index). Expected index digest, recorded for provenance/verification: `sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975`.
+- **Multi-arch manifest list:** linux/amd64 + linux/arm64; `enroot import` on each host pulls the matching arch.
+- **Import by TAG, not digest.** enroot builds its anonymous Docker Hub token scope from the *tag* and succeeds (no creds needed — same as the serving launchers). A bare `repo@sha256:` ref makes enroot prompt for a password and **hang** in non-interactive CI; a combined `tag@sha256:` ref 400s. `cx_ensure_squash` therefore imports by tag with `</dev/null` (a missing token fails fast instead of hanging). First import is multi-GB (~minutes); subsequent runs reuse the staged squash.
+- **Why v0.5.11-cu130 (chosen):** it's the newest cu130 release **pre-staged on BOTH clusters** — B200 `/home/sa-shared/containers/` (amd64 squash) and GB200 `/mnt/lustre01/users-public/sa-shared/` (arm64 squash), same filename — so neither side imports at all. (Shared cu130 multi-arch squashes across both clusters: v0.5.8.post1, v0.5.9, v0.5.11 — v0.5.11 is newest.) `v0.5.12-cu130` is staged on B200 but **not** GB200: its 62 layers overflow enroot's overlay-based squash creation on the GB200 kernel (`enroot-mksquashovlfs: failed to mount overlay … Invalid argument`), so it can't be the shared default.
+- **DeepEP: NOT bundled** here → `run_in_container.sh` builds it via `rebuild-deepep` at job setup (CX_BENCH=deepep). The NCCL path needs no DeepEP.
+- **nccl-tests build:** in-container (login nodes have no `nvcc`), `CX_NCCL_HOME=/usr` (system `nccl.h` in `/usr/include`), `CX_CUDA_HOME=/usr/local/cuda`. cu130 lineage ⇒ CUDA 13; confirm exact NCCL/torch on first run and append below.
+
+## Audited reference (cu130 lineage)
+
+Live audit of the sibling DeepSeek-V4 image `lmsysorg/sglang:deepseek-v4-grace-blackwell` (aarch64) on GB200, 2026-06-23 — the multi-arch `v0.5.11-cu130` should match closely (same cu130 base); reconfirm on first run:
+
+| Component | Version |
+|---|---|
+| OS / arch | Ubuntu 24.04.3, aarch64 |
+| CUDA (`nvcc`) | 13.0 (V13.0.88) |
+| NCCL (system `/usr/include/nccl.h`) | 2.28.3; torch-bundled 2.27.7 |
+| PyTorch | 2.9.1+cu130 |
+| DeepEP | bundled in *that* image; **not** in the multi-arch default |
+| NVSHMEM | `libnvshmem_host.so.3` present |
+| OpenMPI / gcc / make | 4.1.6 / 13.3.0 / 4.3 |
+| GPU / driver | GB200, 580.126.20 |
+
+**Version caveat:** the nccl-tests binary links **system NCCL** (2.28.x), while torch/DeepEP use the **bundled** NCCL (2.27.x). Record both in provenance (env_capture does); don't compare an nccl-tests curve against a DeepEP run as if NCCL were identical.
+
+## Bundled-DeepEP reference images (not the default)
+
+If a bundled DeepEP is needed before `rebuild-deepep` is wired on the multi-arch image, these arch-specific images bundle it (pin by digest):
+
+- B200 (amd64): `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b` (pre-staged on B200)
+- GB200 (arm64): `lmsysorg/sglang:deepseek-v4-grace-blackwell@sha256:4f583347d7ff08aef7e16dbb4985b2a7c147ff49a0c261d5e27b8f5f41719368` (staged on GB200 Lustre)
+
+Select via `CX_IMAGE=…@sha256:…` on the launch script.
+
+## AMD container (MI355X) — MoRI EP
+
+AMD CDNA4 cannot run the CUDA multi-arch image; MI355X uses a ROCm image that
+bundles **MoRI** (AMD's EP dispatch/combine library). Set in `cx_default_image`
+for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
+
+- **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image.
+- **MoRI:** bundled in-image (build tag `mori-0227`). `tests/ep_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
+- **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
+- **Transport:** intra-node **XGMI** (8× MI355X). Two backends wired: `CX_BENCH=mori` (MoRI EP dispatch/combine) and `CX_BENCH=nccl` (collective primitives via **rccl-tests**, the ROCm nccl-tests fork — built in-container with `make` against `/opt/rocm`/`amdclang++`/`librccl`; same `<op>_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged).
+- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `tests/ep_mori.py`:
+  - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here.
+  - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now.
+  - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `tests/ep_mori.py`'s `finalize()` hard-exits after writing results to avoid it.
+
+  Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image.
+
+## Cluster access / QOS
+
+- **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account.
+- **GB200** (`watchtower`): account `benchmark`, qos `normal`, partition `batch` (`AllowQos=ALL`); idle capacity available. Runner workspace is **not** compute-visible → set `CX_STAGE_DIR` to a Lustre path (the launcher rsyncs there).
+
+## First real results (Milestone-0 spike, on the DeepSeek-V4 images)
+
+nccl-tests (system NCCL 2.28.3), all correctness-passed, peak bus-bw:
+
+| op | B200 8× (NVLink island, x86_64) | GB200 4× (NVL72 MNNVL, aarch64) |
+|---|---|---|
+| all_reduce | 835 GB/s | 689 GB/s |
+| all_gather | 653 | 658 |
+| reduce_scatter | 667 | 661 |
+| alltoall | 638 | 666 |
+
+(B200 vs GB200 carry distinct `comparison_key`s by topology-class, so they are labelled-distinct, not silently merged. Re-run on the multi-arch default to refresh under one image.)
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
new file mode 100644
index 000000000..a7c479b86
--- /dev/null
+++ b/experimental/CollectiveX/README.md
@@ -0,0 +1,128 @@
+# CollectiveX
+
+Cross-vendor collective / EP-library benchmark (see `plan.md`). Per-SKU **launch
+adapters** (InferenceX-style `launch_<sku>.sh`) run **any benchmark** — selected
+by `CX_BENCH` — through a shared in-container runner, and a GitHub Actions
+workflow triggers runs on `push` (no merge to main needed). Milestone-0 headline
+already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL).
+
+> Experimental: WIP, not an official InferenceMAX result. All logic stays under
+> `experimental/CollectiveX/`; the only file outside is the orchestration-only
+> workflow.
+
+## Files
+
+| File | Role |
+|---|---|
+| `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) |
+| `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) |
+| `tests/run_ep.py` | EP dispatch/combine entrypoint (torchrun): source-tokens-per-rank sweep, dispatch & combine timed **separately** |
+| `tests/ep_harness.py` | shared EP harness: token ladder, separated timing, correctness gate, doc emission (stdlib top) |
+| `tests/ep_deepep.py`, `tests/ep_mori.py` | per-backend adapters (DeepEP / MoRI) implementing the harness protocol |
+| `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) |
+| `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build |
+| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) over `CX_PHASE` |
+| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) |
+| `CONTAINERS.md` | the pinned multi-arch container + audited library versions |
+| `results/` | flat JSON artifacts (+ `plots/`, raw captures) |
+| `tests/fixtures/` | captured nccl-tests output for offline parser checks |
+
+## Run
+
+### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`)
+
+- **push** to the `collectivex` branch touching `experimental/CollectiveX/**` → an
+  **MI355X MoRI** EP dispatch/combine **smoke**: decode phase only, token ladder capped
+  at T ≤ 16 (lands on a free `mi355x-amds` runner). The full sweep is `workflow_dispatch`.
+- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode / mi355x /
+  h100-dgxc / h200 / b300 / gb300), `benchmark` (nccl / deepep / mori / all — `mori` is
+  AMD-only; `nccl` on MI355X runs rccl-tests), `phase` (decode / prefill / **both** → a job
+  each; ignored for `nccl`), `tokens_ladder`, `dispatch_dtype`, `mode`, `resource_mode`,
+  `contract`, `routing`, `eplb`, ops, sizes, ngpus, nodes. Lands on that SKU's self-hosted runner
+  and runs `launch_${RUNNER_NAME%%_*}.sh`. For EP results across all SKUs, dispatch once per `sku` with `phase=both`.
+
+Each job renders a results table to the **GitHub Actions job summary** (via
+`summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs
+as an artifact. (The workflow only fires once the branch is pushed to GitHub.)
+
+### Directly on a cluster login node
+
+```bash
+# benchmark is selected by CX_BENCH (default nccl; launch_mi355x-amds.sh defaults to mori)
+bash experimental/CollectiveX/launchers/launch_gb200-nv.sh                 # GB200, NCCL primitives
+CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild)
+bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh               # B200 8× NVLink
+bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh         # B200 2-node, cross-IB
+bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh                # MI355X 8× XGMI, MoRI EP (CX_BENCH=mori, default)
+CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh   # MI355X primitives via rccl-tests
+```
+
+Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
+`CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible
+staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate
+nothing). EP (deepep/mori) adds `CX_PHASE` (decode|prefill|both), `CX_TOKENS_LADDER`
+(e.g. `"1 2 4 8 16 32 64 128"`), `CX_HIDDEN`/`CX_TOPK`/`CX_EXPERTS`,
+`CX_DISPATCH_DTYPE`, `CX_NUM_EP_GROUPS`. Results land in `experimental/CollectiveX/results/`.
+
+### Offline (no GPU) — verify the parser/JSON pipeline
+
+```bash
+python3 run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \
+  --world-size 8 --nodes 1 --runner b200-dgxc --topology-class b200-nvlink-island --out /tmp/parsed.json
+python3 env_capture.py            # prints a (degraded, off-GPU) env record
+python3 plot.py --results-dir results --out-dir results/plots   # needs matplotlib
+```
+
+## Container
+
+One **multi-arch** image for all NVIDIA SKUs, imported by tag
+`lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…`
+recorded for provenance). Imported by tag, not digest — enroot's anonymous
+Docker Hub auth needs a tag, and a bare digest ref hangs in CI. See
+`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the bundled-DeepEP
+DeepSeek-V4 fallback images.
+
+## How it runs (confirmed against the live clusters)
+
+- Adapters mirror `runners/launch_*.sh`: `salloc` → enroot squash (import only if
+  missing) → `srun --container-image=… --container-mounts=<repo>:/ix` → in-container
+  `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account
+  `benchmark`.
+- **AMD MI355X** (`launch_mi355x-amds.sh`, MoRI / `CX_BENCH=mori`) diverges: partition
+  `compute`, no account, pyxis `--container-writable --container-remap-root`, and a
+  **node-local** squash (`/var/lib/squash`) imported via `srun` on the allocated node
+  (not the login node). Workspace is bind-mounted directly (no `CX_STAGE_DIR`).
+- Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in
+  `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node
+  adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`).
+- The sglang image installs editable under `/workspace`, so the repo is mounted at
+  **`/ix`**. GB200 compute nodes don't see the runner workspace → `CX_STAGE_DIR`
+  rsyncs the tree to Lustre first.
+- Every result embeds an `env_capture` record and a `comparison_key`; topology
+  class is part of the key, so B200(IB/NVLink) and GB200(MNNVL) stay labelled
+  distinct, never silently overlaid.
+
+## Status & known risks
+
+- **Spike done on real hardware** (both SKUs, 4 NCCL primitives, correctness-passed)
+  — on the DeepSeek-V4 images. Now standardizing on the **multi-arch** default;
+  validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9).
+- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds
+  it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive;
+  `tests/ep_deepep.py` follows the documented normal-mode API — validate against
+  the built commit. B200 (x86_64) first; GB200 (aarch64) follows.
+- **MoRI / MI355X** (`tests/ep_mori.py` + `launch_mi355x-amds.sh`) is **validated on
+  hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip).
+  It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer`
+  zero-copy path, `expected = input × #unique-destination-ranks`). Three
+  ionic_rdma-fabric constraints are baked in (see `CONTAINERS.md`): a 2 GiB heap
+  (the NICs cap RDMA MRs at ~4 GiB), a bounded `max_num_inp_token_per_rank`, and a
+  hard-exit past MoRI's buggy shmem teardown. The ROCm image isn't digest-pinned yet.
+- **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a
+  compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container
+  or srt-slurm. CX_BENCH=nccl only for now.
+- **B200 QOS:** account `benchmark` has only `gpu-2_qos` (the serving-sweep
+  partition); idle `gpu-1` needs a QOS grant. GB200 `batch` is open.
+
+Once the multi-arch image is validated end-to-end, freeze the schema from the
+artifacts (plan: "Freeze the contract").
diff --git a/experimental/CollectiveX/analyze_ep.py b/experimental/CollectiveX/analyze_ep.py
new file mode 100644
index 000000000..018d74a93
--- /dev/null
+++ b/experimental/CollectiveX/analyze_ep.py
@@ -0,0 +1,257 @@
+#!/usr/bin/env python3
+"""CollectiveX operating-envelope analysis (goal Part 2 'operating-envelope outputs' + Part 3
+'regression/decision outputs'). Post-processes result JSONs (v3 flat or v4 nested) into the
+decision-facing summaries, comparing ONLY matching (workload, topology, contract, backend,
+resource) cells:
+
+  routing-skew penalty     zipf* vs matched uniform — p50/p99 dispatch amplification
+  LL-to-normal crossover   token count where normal becomes faster than LL (p50 and p99)
+  topology penalty         EP4 vs EP8 (and placement, when present) latency penalty
+  strong/weak scaling      fixed-global-tokens and fixed-tokens/rank efficiency across EP
+  resource marginal eff.   Δlatency per Δcomm-fraction (needs a resource ladder; reports n/a otherwise)
+  pareto + recommendations lowest-latency / lowest-resource configs per (sku, phase)
+
+Pure stdlib; reads the same JSONs the plotter does. Honest about missing cells (prints n/a with
+the reason) rather than inventing comparisons.
+
+  python3 analyze_ep.py --results-dir results --out analysis.json
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+from collections import defaultdict
+
+
+def _p(r, op, pct):
+    """Percentile `pct` of `op` from a row: v4 nests it as {op: {pct: ..}}, v3 flattens it to {"<op>_us_<pct>": ..}."""
+    nested = r.get(op)
+    flat = f"{op}_us_{pct}"  # v3 key, only consulted when `op` is not a nested dict
+    return nested.get(pct) if isinstance(nested, dict) else r.get(flat)
+
+
+def load(results_dir):
+    """Load MoE result JSONs under `results_dir` (recursive, env_* skipped) as series dicts; unusable files are skipped."""
+    series = []
+    paths = glob.glob(os.path.join(results_dir, "**", "*.json"), recursive=True)
+    for f in sorted(p for p in paths if not os.path.basename(p).startswith("env_")):
+        try:
+            with open(f) as fh:  # close each handle instead of leaking one per result file
+                d = json.load(fh)
+        except (ValueError, OSError):  # ValueError covers JSONDecodeError and UnicodeDecodeError
+            continue
+        if not isinstance(d, dict) or d.get("family") != "moe" or not d.get("rows"):
+            continue  # a JSON list/scalar, or a non-MoE / row-less document, is not a series
+        sh = d.get("shape") or {}  # tolerate an explicit null shape
+        series.append({
+            "sku": (d.get("runner") or "?").split("_")[0].split("-")[0], "ep": d.get("ep_size"), "phase": d.get("phase"),
+            "mode": d.get("mode", "normal"), "dtype": sh.get("dispatch_dtype"), "contract": d.get("measurement_contract"),
+            "routing": ((sh.get("routing") or "?") + ("+eplb" if (d.get("eplb") or {}).get("enabled") else "")),
+            "topo": d.get("topology_class"), "resource": d.get("resource_mode", "tuned"),
+            "rows": {r["tokens_per_rank"]: r for r in d["rows"]},
+        })
+    return series
+
+
+def _key(s, *fields):  # grouping key: the values of `fields` in `s`, in the order given, as a tuple
+    return tuple(map(s.__getitem__, fields))
+
+
+def skew_penalty(series):
+    """zipf* vs the uniform series with the same (sku, ep, phase, mode, dtype, contract): dispatch p50/p99
+    amplification at every shared tokens/rank T. Cells lacking any of the four percentiles are skipped."""
+    out = []
+    fields = ("sku", "ep", "phase", "mode", "dtype", "contract")
+    base = {_key(s, *fields): s for s in series if s["routing"] == "uniform"}
+    for s in series:
+        b = base.get(_key(s, *fields)) if s["routing"].startswith("zipf") else None
+        if not b:
+            continue
+        for T in sorted(set(s["rows"]) & set(b["rows"])):
+            zp, up = _p(s["rows"][T], "dispatch", "p50"), _p(b["rows"][T], "dispatch", "p50")
+            zq, uq = _p(s["rows"][T], "dispatch", "p99"), _p(b["rows"][T], "dispatch", "p99")
+            # The zipf side can be missing too (partial row): dividing None by a float raised TypeError.
+            if zp is not None and zq is not None and up and uq:
+                out.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"], "routing": s["routing"],
+                            "T": T, "p50_amplification": round(zp / up, 3), "p99_amplification": round(zq / uq, 3)})
+    return out
+
+
+def ll_crossover(series):
+    """First shared token count at which normal dispatch is faster than LL, per (sku, ep, dtype) and stat.
+    NOTE(review): phase is not in the match key, so a later normal series overwrites an earlier one."""
+    fields = ("sku", "ep", "dtype")
+    normal = {}
+    for cand in series:
+        if cand["mode"] == "normal" and cand["routing"] == "uniform" and cand["contract"] == "layout-and-dispatch-v1":
+            normal[_key(cand, *fields)] = cand
+    found = []
+    for cur in series:
+        # Only uniform-routed LL series are compared, and only when a matching normal series exists.
+        ref = normal.get(_key(cur, *fields)) if cur["mode"] == "ll" and cur["routing"] == "uniform" else None
+        if not ref:
+            continue
+        for stat in ("p50", "p99"):
+            shared = sorted(set(cur["rows"]) & set(ref["rows"]))
+            pairs = ((T, _p(cur["rows"][T], "dispatch", stat), _p(ref["rows"][T], "dispatch", stat)) for T in shared)
+            cross = next((T for T, ll, nm in pairs if ll and nm and nm < ll), None)
+            found.append({"sku": cur["sku"], "ep": cur["ep"], "dtype": cur["dtype"], "stat": stat,
+                          "normal_faster_at_T": cross if cross is not None else "never-in-range"})
+    return found
+
+
+def topology_penalty(series):
+    """Dispatch p50 at the lowest vs highest EP degree of each (sku, phase, dtype) group, per shared
+    tokens/rank; uniform / normal / layout-and-dispatch-v1 series only. penalty_pct is relative to the lowest EP."""
+    groups, penalties = defaultdict(dict), []
+    for s in series:
+        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == "layout-and-dispatch-v1":
+            groups[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s
+    for (sku, phase, dtype), by_ep in groups.items():
+        if len(by_ep) < 2:
+            continue  # a lone EP degree has nothing to be compared against
+        lo, hi = min(by_ep), max(by_ep)
+        lo_rows, hi_rows = by_ep[lo]["rows"], by_ep[hi]["rows"]
+        for T in sorted(set(lo_rows) & set(hi_rows)):
+            a, b = _p(lo_rows[T], "dispatch", "p50"), _p(hi_rows[T], "dispatch", "p50")
+            if a and b:
+                penalties.append({"sku": sku, "phase": phase, "dtype": dtype, "T": T,
+                                  f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
+                                  "penalty_pct": round(100 * (b - a) / a, 1)})
+    return penalties
+
+
+def scaling(series):
+    """Raw dispatch-p50 points for every (sku, phase, dtype) group measured at two or more EP degrees.
+    "strong" and "weak" get the same points (global_tokens = tokens_per_rank * ep); only key order differs."""
+    points, groups = {"strong": [], "weak": []}, defaultdict(dict)
+    for s in series:
+        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == "layout-and-dispatch-v1":
+            groups[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s
+    for (sku, phase, _dtype), per_ep in groups.items():
+        if len(per_ep) < 2:
+            continue
+        for ep, s in per_ep.items():
+            for T, row in s["rows"].items():
+                lat = _p(row, "dispatch", "p50")
+                if not lat:
+                    continue
+                common = {"sku": sku, "phase": phase, "ep": ep}
+                points["weak"].append({**common, "tokens_per_rank": T, "global_tokens": T * ep, "dispatch_p50": round(lat, 1)})
+                points["strong"].append({**common, "global_tokens": T * ep, "tokens_per_rank": T, "dispatch_p50": round(lat, 1)})
+    return points
+
+
+def scaling_efficiency(series):
+    """Efficiency between the lowest and highest EP degree of each (sku, phase, dtype) group: weak = fixed
+    tokens/rank (ideal: flat latency); strong = fixed GLOBAL tokens (ideal: latency ~ 1/EP). 1.0 = ideal."""
+    out = {"weak": [], "strong": []}
+    by = defaultdict(dict)
+    for s in series:
+        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == "layout-and-dispatch-v1":
+            by[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s
+    for k, eps in by.items():
+        if len(eps) < 2:
+            continue
+        lo, hi = min(eps), max(eps)
+        # weak: same tokens/rank T on both EP degrees -> latency should stay flat
+        for T in sorted(set(eps[lo]["rows"]) & set(eps[hi]["rows"])):
+            a, b = _p(eps[lo]["rows"][T], "dispatch", "p50"), _p(eps[hi]["rows"][T], "dispatch", "p50")
+            if a and b:
+                out["weak"].append({"sku": k[0], "phase": k[1], "tokens_per_rank": T,
+                                    f"ep{lo}": round(a, 1), f"ep{hi}": round(b, 1),
+                                    "weak_efficiency": round(a / b, 3)})  # >1 = higher EP is faster (super-ideal)
+        # strong: same GLOBAL tokens -> EP_hi has fewer tokens/rank; ideal latency ~ a*(lo/hi)
+        for Tlo in eps[lo]["rows"]:
+            gt = Tlo * lo
+            Thi, rem = divmod(gt, hi)  # rem != 0: no EP_hi tokens/rank gives exactly gt; don't floor to a different total
+            if rem == 0 and Thi in eps[hi]["rows"]:
+                a, b = _p(eps[lo]["rows"][Tlo], "dispatch", "p50"), _p(eps[hi]["rows"][Thi], "dispatch", "p50")
+                if a and b:
+                    ideal = a * (lo / hi)
+                    out["strong"].append({"sku": k[0], "phase": k[1], "global_tokens": gt,
+                                          f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
+                                          "strong_efficiency": round(ideal / b, 3)})
+    return out
+
+
+def regressions(series, baseline_series, thresh=0.10):
+    """Latency regressions against a baseline. A current series is compared only with the baseline series
+    sharing its (sku, ep, phase, mode, dtype, contract, routing), at shared T; a cell is flagged when the
+    current p50/p99 of dispatch, combine or roundtrip exceeds baseline * (1 + thresh)."""
+    fields = ("sku", "ep", "phase", "mode", "dtype", "contract", "routing")
+    baseline = {_key(b, *fields): b for b in baseline_series}
+    cells = [(op, stat) for op in ("dispatch", "combine", "roundtrip") for stat in ("p50", "p99")]
+    flagged = []
+    for s in series:
+        ref = baseline.get(_key(s, *fields))
+        for T in (sorted(set(s["rows"]) & set(ref["rows"])) if ref else ()):
+            for op, stat in cells:
+                cur, base = _p(s["rows"][T], op, stat), _p(ref["rows"][T], op, stat)
+                if cur and base and cur > base * (1 + thresh):
+                    flagged.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"],
+                                    "routing": s["routing"], "T": T, "op": op, "stat": stat,
+                                    "baseline": round(base, 1), "current": round(cur, 1),
+                                    "regression_pct": round(100 * (cur - base) / base, 1)})
+    return flagged
+
+
+def recommendations(series):
+    """For each (sku, phase), the config with the lowest dispatch p99 at the headline token count:
+    T=64 for phase "decode", T=256 for every other phase."""
+    grouped = defaultdict(list)
+    for s in series:
+        grouped[(s["sku"], s["phase"])].append(s)
+    picks = []
+    for (sku, phase), members in grouped.items():
+        T = 64 if phase == "decode" else 256
+        scored = []
+        for s in members:
+            row = s["rows"].get(T)
+            q = _p(row, "dispatch", "p99") if row else None
+            if q:
+                scored.append((q, f"{s['dtype']}/{s['mode']}/{s['contract']}/{s['routing']}/{s['resource']}", s["ep"]))
+        if scored:
+            best_q, best_cfg, best_ep = sorted(scored)[0]  # ties fall through to the config string, then ep
+            picks.append({"sku": sku, "phase": phase, "at_T": T, "lowest_p99_dispatch_us": round(best_q, 1),
+                          "config": best_cfg, "ep": best_ep})
+    return picks
+
+
+def main() -> int:
+    """CLI entry point: analyse --results-dir (optionally against --baseline), print a summary, write --out if given."""
+    ap = argparse.ArgumentParser(description="CollectiveX operating-envelope analysis")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--baseline", help="dir of baseline results for regression detection")
+    ap.add_argument("--out")
+    a = ap.parse_args()
+    s = load(a.results_dir)
+    rep = {"n_series": len(s), "skew_penalty": skew_penalty(s), "ll_crossover": ll_crossover(s),
+           "topology_penalty": topology_penalty(s), "scaling": scaling(s),
+           "scaling_efficiency": scaling_efficiency(s), "recommendations": recommendations(s)}
+    if a.baseline:
+        regs = rep["regressions"] = regressions(s, load(a.baseline))
+        print(f"regressions vs baseline: {len(regs)} cell(s) > +10%")
+    print(f"loaded {len(s)} series")
+    if rep["skew_penalty"]:
+        worst = max(rep["skew_penalty"], key=lambda x: x["p99_amplification"])
+        print(f"skew penalty: {len(rep['skew_penalty'])} cells; worst p99 amplification {worst['p99_amplification']}x "
+              f"({worst['sku']} {worst['routing']} T{worst['T']})")
+    tp = rep["topology_penalty"]
+    if tp:
+        print(f"topology penalty (EP4->EP8): {len(tp)} cells; e.g. "
+              + ", ".join(f"{x['sku']} T{x['T']} {x['penalty_pct']:+}%" for x in tp[:3]))
+    print(f"LL crossover cells: {len(rep['ll_crossover'])}; recommendations: {len(rep['recommendations'])}")
+    for r in rep["recommendations"]:
+        print(f"  rec {r['sku']}/{r['phase']} @T{r['at_T']}: {r['lowest_p99_dispatch_us']}us via {r['config']}")
+    if a.out:
+        with open(a.out, "w") as fh:  # flush and close the report before announcing it was written
+            json.dump(rep, fh, indent=2)
+        print(f"wrote {a.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/configs/backends.yaml b/experimental/CollectiveX/configs/backends.yaml
new file mode 100644
index 000000000..2237e7631
--- /dev/null
+++ b/experimental/CollectiveX/configs/backends.yaml
@@ -0,0 +1,49 @@
+# CollectiveX backend registry (goal Part 2) — the single source of truth for backend
+# capability, replacing the data previously split between the adapters and tests/capability.py.
+# Keep this file in sync with the SUPPORTED_* sets in ep_deepep.py / ep_mori.py (capability.py mirrors this file at runtime).
+schema_version: 1
+backends:
+  deepep:
+    vendor: nvidia
+    modes: [normal, ll]                 # ll is DECODE-ONLY (fixed num_max dispatch)
+    dtypes: [bf16, fp8]
+    contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+    transports: [nvlink, mnnvl, rdma]
+    ep_max_intranode: 8                  # <=8 ranks = intranode NVL kernel (incl. MNNVL trays)
+    ep_min: 2
+    phase_constraints:
+      ll: {phases: [decode], max_tokens_per_rank: 128}   # LL is a fixed-num_max decode path
+    required_image: "lmsysorg/sglang:v0.5.11-cu130"
+    cap_token_per_rank: 4096             # 4 GiB NVL buffer holds ~4096 tok/rank at hidden=7168
+  mori:
+    vendor: amd
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    transports: [xgmi, rdma]
+    ep_max_intranode: 8
+    ep_min: 2
+    phase_constraints:
+      normal: {max_tokens_per_rank: 512}   # 2 GiB registerable heap cap at hidden=7168
+    required_image: "rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+    cap_token_per_rank: 512
+    fragility: "wedges (D-state) on sustained iters>=200 at T>=32; needs gradual ramp, low iters"
+  aiter:
+    vendor: amd
+    modes: [normal]
+    dtypes: [bf16, fp8]
+    contracts: [layout-and-dispatch-v1]
+    transports: [xgmi, rdma]
+    ep_max_intranode: 8
+    ep_min: 2
+    status: "scaffolded — adapter ep_aiter.py not yet implemented (capability declared, not validated)"
+    required_image: "rocm/sgl-dev (AITER CK MoE EP)"
+
+# 'all' resolves to a DEFINED per-vendor backend set (NOT the same across vendors).
+vendor_backends:
+  nvidia: [nccl, deepep]
+  amd: [rccl, mori]
+# Collective primitives (not EP dispatch/combine — phase/dtype/mode/contract N/A).
+collective_backends:
+  nccl: [nvidia]
+  rccl: [amd]
diff --git a/experimental/CollectiveX/configs/platforms.yaml b/experimental/CollectiveX/configs/platforms.yaml
new file mode 100644
index 000000000..ebb58a430
--- /dev/null
+++ b/experimental/CollectiveX/configs/platforms.yaml
@@ -0,0 +1,84 @@
+# CollectiveX platform registry (goal Part 2). One entry per SKU: hardware capability is
+# separated from VALIDATED software capability (what we've actually run green on real HW).
+# scale_up_domain = #GPUs reachable over the intra-domain fabric before crossing a tier
+# (NVLink island / NVL72 MNNVL tray-group / XGMI). gpus_per_node bounds single-node EP.
+schema_version: 1
+platforms:
+  h100:
+    vendor: nvidia
+    arch: sm90
+    gpu: "H100 80GB HBM3"
+    gpus_per_node: 8
+    scale_up_domain: 8            # single 8-GPU NVLink island
+    transport_tiers: [nvlink, ib]
+    runner: h100-8x
+    launcher: launch_h100-dgxc-slurm.sh
+    ssh: "sa-shared@100.118.57.65"   # partition hpc-gpu-1, /mnt/nfs, exclude hpc-gpu-1-7
+    validated:
+      ep_degrees: [8]
+      backends: [deepep]
+      max_intranode_gpus: 8
+      internode: false            # not yet exercised for EP
+  h200:
+    vendor: nvidia
+    arch: sm90
+    gpu: "H200 143GB HBM3e"
+    gpus_per_node: 8
+    scale_up_domain: 8
+    transport_tiers: [nvlink, ib]
+    runner: h200-8x
+    launcher: launch_h200.sh
+    ssh: "sa-shared@100.78.55.80"    # partition main, /home NFS
+    validated:
+      ep_degrees: [8]
+      backends: [deepep]
+      max_intranode_gpus: 8
+      internode: false
+  b300:
+    vendor: nvidia
+    arch: sm100
+    gpu: "B300 SXM6 268GB"
+    gpus_per_node: 8
+    scale_up_domain: 8
+    transport_tiers: [nvlink, ib]
+    runner: b300-nv
+    launcher: launch_b300.sh
+    ssh: "sa-shared@100.101.13.83"   # partition batch_1, acct benchmark, /data, exclude b300-018
+    notes: "Blackwell drops clocks on tiny T -> per-point warm burst (warmup>=30). LL aborts."
+    validated:
+      ep_degrees: [8]
+      backends: [deepep]
+      max_intranode_gpus: 8
+      internode: false
+  gb300:
+    vendor: nvidia
+    arch: sm100
+    gpu: "GB300 Grace-Blackwell (aarch64)"
+    gpus_per_node: 4              # NVL72 compute tray = 4 GPU/node
+    scale_up_domain: 72          # NVL72 MNNVL: one NVLink P2P domain spans the rack
+    transport_tiers: [mnnvl, ib]
+    runner: gb300-8x
+    launcher: _gb300_ep8.sh
+    ssh: "2-hop: sa-shared@100.92.114.46 -> im-gb300-login-02"  # batch_1, acct benchmark, /data
+    notes: "EP8 = 2 trays but INTRANODE NVLink path (MNNVL is one domain for <=8 ranks). deep_ep 1.1.0."
+    validated:
+      ep_degrees: [4, 8]
+      backends: [deepep]
+      max_intranode_gpus: 8        # <=8 ranks use the intranode NVL kernel even across 2 trays
+      internode: false             # internode-normal asserts out until >8 ranks (EP16+)
+  mi355x:
+    vendor: amd
+    arch: gfx950
+    gpu: "MI355X CDNA4 256 CU"
+    gpus_per_node: 8
+    scale_up_domain: 8           # single 8-GPU XGMI island
+    transport_tiers: [xgmi, rdma]
+    runner: mi355x-8x
+    launcher: launch_mi355x-amds.sh
+    ssh: "2-hop bastion -> mia1-vm-amd-prj3-slurm-001"  # partition compute, cpus-per-task=128
+    notes: "MoRI wedges (D-state) on sustained iters>=200 at T>=32; cap iters. 512-tok buffer cap. No LL/fp8."
+    validated:
+      ep_degrees: [8]
+      backends: [mori]
+      max_intranode_gpus: 8
+      internode: false
diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml
new file mode 100644
index 000000000..39924095a
--- /dev/null
+++ b/experimental/CollectiveX/configs/suites.yaml
@@ -0,0 +1,92 @@
+# CollectiveX named benchmark suites (goal Part 2). A suite binds workloads x platforms x
+# backends x modes x contracts x resource regimes x repetitions x required publication level.
+# generate_matrix.py resolves a suite against platforms.yaml/backends.yaml capabilities BEFORE
+# any GPU is allocated, omitting unsupported combinations with recorded reasons.
+schema_version: 1
+suites:
+  ep-smoke-v1:
+    description: "fast canary: one small point per platform/backend/mode/contract"
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, gb300, mi355x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    token_points: [8, 64]
+    trials: 1
+    required_publication: comparable-experimental
+
+  ep-nightly-v1:
+    description: "headline matrix: all three contracts, bf16+fp8, normal+LL, decode+prefill"
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, gb300, mi355x]
+    backends: [deepep, mori]
+    modes: [normal, ll]
+    dtypes: [bf16, fp8]
+    contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    trials: 3
+    required_publication: official
+
+  ep-models-v1:
+    description: "model-shape envelope: real MoE dimensions, controlled routing"
+    workloads: [deepseek-v4, kimi-k2.x, qwen3.5, glm-5, minimax-m3]
+    platforms: [h100, h200, gb300, mi355x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [fp8, bf16]
+    contracts: [runtime-visible-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-scaling-v1:
+    description: "strong (fixed global tokens) + weak (fixed tokens/rank) scaling across EP degrees"
+    workloads: [ds-like-ref]
+    platforms: [gb300]            # the only SKU with >1 validated EP degree (EP4 + EP8)
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    scaling: [strong, weak]
+    ep_degrees: [4, 8]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-topology-v1:
+    description: "placement sensitivity: packed vs striped vs adversarial on multi-domain SKUs"
+    workloads: [ds-like-ref]
+    platforms: [gb300]            # NVL72 tray boundary is the scale-up domain edge
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform, zipf]
+    placements: [packed, striped, adversarial]
+    resource_modes: [tuned]
+    ep_degrees: [8]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-routing-v1:
+    description: "routing-skew sensitivity + EPLB remedy"
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, gb300]
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform, balanced, zipf, zipf-mild, zipf-moderate, zipf-heavy, hotspot-single]
+    eplb: [false, true]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    trials: 3
+    required_publication: comparable-experimental
diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml
new file mode 100644
index 000000000..b7fe7cf09
--- /dev/null
+++ b/experimental/CollectiveX/configs/workloads.yaml
@@ -0,0 +1,76 @@
+# CollectiveX workload registry (goal Part 2). Each workload references an IMMUTABLE canonical
+# manifest (tests/workload.py -> <workload_id>.npz + .manifest.json). Three kinds:
+#   synthetic       — controlled DeepSeek-like baseline (dims real, routing controlled)
+#   model-derived   — REAL model MoE dimensions with controlled routing (shape != routing behavior)
+#   trace-replay    — captured routing behavior (future; needs a captured trace)
+# Model dims marked verify=true must be confirmed against a checked-in model config before any
+# result built on them is promoted past 'comparable-experimental'.
+schema_version: 1
+
+synthetic:
+  ds-like-ref:
+    kind: synthetic
+    hidden: 7168
+    topk: 8
+    experts: 256
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    routings: [uniform, balanced, zipf]
+    note: "Controlled baseline used through v3/v4 (DeepSeek-V3-shaped)."
+
+model_derived:
+  deepseek-v4:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    expert_alignment: 128
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: false           # matches the validated DSV3/V4 serving shape used on these clusters
+  minimax-m3:
+    kind: model-derived
+    hidden: 6144
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: true
+  kimi-k2.x:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 384
+    shared_experts: 1
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: true
+  glm-5:
+    kind: model-derived
+    hidden: 5120
+    topk: 8
+    routed_experts: 160
+    shared_experts: 1
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    verify: true
+  qwen3.5:
+    kind: model-derived
+    hidden: 4096
+    topk: 8
+    routed_experts: 128
+    shared_experts: 0
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    verify: true
+
+# decode vs prefill are workload METADATA, not just token-ladder aliases (goal Part 2):
+phase_profiles:
+  decode:
+    token_ladder: [1, 2, 4, 8, 16, 32, 64, 128]
+    description: "one (or few) tokens per active sequence per step; routing varies step-to-step"
+  prefill:
+    token_ladder: [128, 256, 512, 1024, 2048, 4096]
+    description: "chunked-prefill: many tokens per sequence enter each MoE layer at once"
diff --git a/experimental/CollectiveX/env_capture.py b/experimental/CollectiveX/env_capture.py
new file mode 100644
index 000000000..b906a0497
--- /dev/null
+++ b/experimental/CollectiveX/env_capture.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — Layer-0 environment + topology capture.
+
+Emits a JSON document describing the node a collective benchmark ran on, so
+every result is provenance-tagged and a B200-vs-GB200 comparison is defensible.
+Standard library only (so it runs in any minimal container, and off-GPU it
+degrades gracefully instead of crashing). torch is used only if importable.
+
+Usage:
+    python env_capture.py --out results/env_b200-dgxc.json
+    python env_capture.py --redact --out env.json   # hash hostnames/IPs/UUIDs
+
+Importable:
+    from env_capture import capture_environment
+    env = capture_environment(redact=False)
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import platform
+import re
+import shutil
+import socket
+import subprocess
+import sys
+
+SCHEMA_VERSION = 1
+
+# Env vars worth recording — transport/tuning knobs that change what a
+# collective actually does (esp. the GB200 MNNVL flags vs B200).
+ENV_PREFIXES = ("NCCL_", "NVSHMEM_", "MC_", "UCX_", "SGLANG_DEEPEP", "DEEPEP_")
+ENV_EXACT = (
+    "CUDA_VISIBLE_DEVICES",
+    "CUDA_DEVICE_ORDER",
+    "SLURM_JOB_ID",
+    "SLURM_NNODES",
+    "SLURM_NTASKS",
+    "SLURM_JOB_PARTITION",
+    # Image identity — set by the launcher so the bundle records what ran.
+    "COLLECTIVEX_IMAGE",
+    "COLLECTIVEX_IMAGE_DIGEST",
+)
+
+
+def _run(cmd: list[str], timeout: int = 20) -> str | None:
+    """Best-effort command runner: stripped stdout on exit status 0, None otherwise.
+
+    None covers every failure: executable not found, timeout, OSError on spawn, non-zero exit."""
+    executable = cmd[0]
+    if shutil.which(executable) is None:
+        return None
+    # check=False: a failing tool is reported as None below rather than raised.
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False)
+    except (subprocess.TimeoutExpired, OSError):
+        return None
+    return proc.stdout.strip() if proc.returncode == 0 else None
+
+
+def _redact(value: str | None) -> str | None:
+    """Replace `value` by "redacted-" + the first 12 hex digits of its SHA-256, so shared artifacts stay
+    joinable without exposing the raw string. Falsy input (None or "") is returned unchanged."""
+    if value:
+        return "redacted-" + hashlib.sha256(value.encode()).hexdigest()[:12]
+    return value
+
+
+def _gpus(redact: bool) -> dict:
+    """GPU inventory parsed from the `nvidia-smi --query-gpu` CSV output.
+    Returns {"source", "count", "devices"}; when _run returns None for that query, source and count stay
+    None and devices stays empty. With redact=True the uuid and pci_bus_id go through _redact."""
+    csv_out = _run(
+        ["nvidia-smi", "--query-gpu=name,uuid,memory.total,compute_cap,pci.bus_id", "--format=csv,noheader,nounits"]
+    )
+    if csv_out is None:
+        return {"source": None, "count": None, "devices": []}
+
+    def _mask(text: str) -> str | None:
+        return _redact(text) if redact else text
+
+    devices = []
+    for line in csv_out.splitlines():
+        cols = [col.strip() for col in line.split(",")]
+        if len(cols) < 5:
+            continue  # fewer than the five queried columns: not a full device record
+        name, uuid, mem_mib, cc, bus = cols[:5]
+        devices.append({
+            "name": name,
+            "uuid": _mask(uuid),
+            "memory_total_mib": int(mem_mib) if mem_mib.isdigit() else mem_mib,
+            "compute_capability": cc,
+            "pci_bus_id": _mask(bus),
+        })
+    return {
+        "source": "nvidia-smi",
+        "count": len(devices),
+        "devices": devices,
+    }
+
+
+def _driver_cuda() -> dict:
+    """Driver version (first line of the nvidia-smi driver_version query) and the "CUDA Version" shown in
+    the plain `nvidia-smi` header; either field is None when it cannot be read."""
+    versions = _run(["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"])
+    driver = versions.splitlines()[0].strip() if versions else None
+    # `nvidia-smi` (no args) prints the CUDA driver-API version in its header.
+    banner = _run(["nvidia-smi"])
+    found = re.search(r"CUDA Version:\s*([0-9.]+)", banner) if banner else None
+    cuda = found.group(1) if found else None
+    return {
+        "driver_version": driver,
+        "cuda_version": cuda,
+    }
+
+
+def _torch_info() -> dict:
+    """torch / CUDA-runtime / NCCL build info; just {"available": False} when torch cannot be imported."""
+    info: dict = {"available": False}
+    try:
+        import torch  # type: ignore
+    except Exception:  # any import-time failure (not only ImportError) counts as "torch unavailable"
+        return info
+    info["available"] = True
+    info["torch_version"] = torch.__version__
+    try:
+        info["cuda_runtime"] = torch.version.cuda
+    except Exception:  # recorded as None rather than aborting the capture
+        info["cuda_runtime"] = None
+    try:
+        if torch.cuda.is_available():  # the NCCL/device fields below are only filled in this case
+            nccl = torch.cuda.nccl.version()
+            # A tuple is joined with dots; any other value (e.g. an int such as 22304) is stored as returned.
+            info["nccl_version"] = (
+                ".".join(map(str, nccl)) if isinstance(nccl, tuple) else nccl
+            )
+            info["device_count"] = torch.cuda.device_count()
+            info["device_name"] = torch.cuda.get_device_name(0)
+            cc = torch.cuda.get_device_capability(0)
+            info["compute_capability"] = f"{cc[0]}.{cc[1]}"
+    except Exception as exc:  # pragma: no cover - hardware dependent
+        info["error"] = repr(exc)  # fields set before the failure are kept alongside the error text
+    return info
+
+
+def _topology(redact: bool) -> dict:
+    """`nvidia-smi topo -m` output plus a 16-hex-digit fingerprint used to gate comparability.
+
+    The fingerprint is the SHA-256 of the link-type tokens (NV<n>, NODE, SYS, PIX, PXB, PHB, X) in their
+    order of appearance anywhere in that output; text that matches none of them (e.g. GPU/NIC labels,
+    whitespace) does not contribute. All three fields are None when _run returns None."""
+    raw = _run(["nvidia-smi", "topo", "-m"])
+    if raw is None:
+        return dict.fromkeys(("source", "matrix", "fingerprint"))
+    # Fingerprint the link-type tokens (NV#, NODE, SYS, PIX, PXB, ...) only —
+    # ignore GPU/NIC labels and whitespace so it's placement-stable.
+    link_types = re.findall(r"\b(NV\d+|NODE|SYS|PIX|PXB|PHB|X)\b", raw)
+    digest = hashlib.sha256(" ".join(link_types).encode()).hexdigest()
+    return {
+        "source": "nvidia-smi topo -m",
+        # The matrix can contain hostnames in some setups; redact wholesale.
+        "matrix": "<redacted>" if redact else raw,
+        "fingerprint": digest[:16],
+    }
+
+
+def _rdma(redact: bool) -> dict:
+    """RDMA/IB device names: from `ibv_devinfo -l`, else from `ibstat -l` when that printed nothing.
+
+    Returns {"available": bool, "devices": [names]}; names go through _redact when redact is True."""
+    listing = _run(["ibv_devinfo", "-l"])
+    if listing:
+        lines = listing.splitlines()[1:]  # first line is a count
+    else:
+        # Query ibstat exactly once: the old probe-then-rerun could get None from the second run and crash.
+        lines = (_run(["ibstat", "-l"]) or "").splitlines()
+    devices = [ln.strip() for ln in lines if ln.strip()]
+    return {
+        "available": bool(devices),
+        "devices": [_redact(d) if redact else d for d in devices],
+    }
+
+
+def _env_vars() -> dict:
+    """Environment variables named in ENV_EXACT or starting with an ENV_PREFIXES entry, sorted by name."""
+    def _wanted(name: str) -> bool:
+        return name in ENV_EXACT or name.startswith(ENV_PREFIXES)
+
+    return dict(sorted((k, v) for k, v in os.environ.items() if _wanted(k)))
+
+
+def capture_environment(redact: bool = False, timestamp: str | None = None) -> dict:
+    """JSON-serializable provenance record for this node. `timestamp` replaces the local-time ISO stamp
+    taken now; `redact` hashes the hostname and is forwarded to the GPU, topology and RDMA probes."""
+    hostname = socket.gethostname()
+    record: dict = {"schema_version": SCHEMA_VERSION}
+    record["captured_at"] = timestamp or _dt.datetime.now().astimezone().isoformat()
+    record["redacted"] = redact
+    record["host"] = _redact(hostname) if redact else hostname
+    record["platform"] = {
+        "system": platform.system(),
+        "release": platform.release(),
+        "machine": platform.machine(),  # x86_64 vs aarch64 (B200 vs GB200)
+        "python": sys.version.split()[0],
+    }
+    record["gpus"] = _gpus(redact)
+    record["driver"] = _driver_cuda()
+    record["torch"] = _torch_info()
+    record["topology"] = _topology(redact)
+    record["rdma"] = _rdma(redact)
+    record["env"] = _env_vars()
+    return record
+
+
+def main() -> int:
+    """CLI entry point: capture the environment and write it to --out (plus a one-line summary on stdout),
+    or print the JSON itself when --out is not given. Always returns 0."""
+    parser = argparse.ArgumentParser(description="CollectiveX Layer-0 environment capture")
+    parser.add_argument("--out", help="write JSON here (default: stdout)")
+    parser.add_argument("--redact", action="store_true",
+                        help="hash hostnames / IPs / GPU UUIDs / IB GUIDs for shareable artifacts")
+    parser.add_argument("--timestamp",
+                        help="ISO timestamp to stamp (default: now); pass one for reproducible bundles")
+    opts = parser.parse_args()
+
+    env = capture_environment(redact=opts.redact, timestamp=opts.timestamp)
+    blob = json.dumps(env, indent=2)
+    if not opts.out:
+        print(blob)
+        return 0
+    os.makedirs(os.path.dirname(os.path.abspath(opts.out)), exist_ok=True)
+    with open(opts.out, "w") as fh:
+        fh.write(blob + "\n")
+
+    # A one-line human summary to stdout (the JSON is the artifact).
+    machine = env["platform"]["machine"]
+    count = env["gpus"]["count"]
+    fp = env["topology"]["fingerprint"]
+    print(
+        f"env -> {opts.out} | machine={machine} "
+        f"gpus={count} topo_fp={fp}"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/generate_matrix.py b/experimental/CollectiveX/generate_matrix.py
new file mode 100644
index 000000000..cec960b93
--- /dev/null
+++ b/experimental/CollectiveX/generate_matrix.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""CollectiveX matrix generator (goal Part 2: capability planning, sharding, canaries).
+
+Reads configs/{suites,workloads,platforms,backends}.yaml, resolves a named suite into the FULLY
+VALIDATED set of (workload, platform, backend, mode, dtype, contract, routing, ep, phase) cases
+BEFORE any GPU is allocated — omitting unsupported combinations with a recorded reason. Then:
+  * groups compatible cases into SHARDS (same platform/backend/mode/resource_mode/image
+    -> one allocation runs every case that shares that key), and
+  * selects a CANARY per (platform, backend, mode, contract) to run before the full shard.
+
+  python3 generate_matrix.py --suite ep-nightly-v1 --out matrix.json
+  python3 generate_matrix.py --suite ep-smoke-v1            # prints summary + omissions
+
+Pure stdlib + PyYAML. 'all' as a backend resolves to the platform vendor's EP backend set.
+"""
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import os
+
+import yaml
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+
+
+def _load(name):  # -> parsed YAML document for configs/<name> (resolved next to this script)
+    with open(os.path.join(HERE, "configs", name), encoding="utf-8") as fh:  # not locale-dependent
+        return yaml.safe_load(fh)
+
+
+def resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase, platforms, backends):
+    """Return (ok, reason) for one candidate case; checks run in order and the first failure wins.
+    `routing` is accepted for signature parity but is not consulted by any check."""
+    pinfo, binfo = platforms["platforms"].get(plat), backends["backends"].get(beng)
+    if pinfo is None:
+        return False, f"unknown platform {plat}"
+    if binfo is None:
+        return False, f"unknown backend {beng}"
+    if binfo["vendor"] != pinfo["vendor"]:
+        return False, f"{beng} is {binfo['vendor']}, {plat} is {pinfo['vendor']}"
+    # Backend capability lists: (registry key, requested value, noun used in the reason text).
+    for key, want, noun in (("modes", mode, "mode"), ("dtypes", dtype, "dtype"),
+                            ("contracts", contract, "contract")):
+        if want not in binfo[key]:
+            return False, f"{beng} has no {noun} {want}"
+    limits = pinfo["validated"]
+    if ep not in limits["ep_degrees"]:
+        return False, f"{plat} EP{ep} not validated (have {limits['ep_degrees']})"
+    if ep > limits["max_intranode_gpus"] and not limits.get("internode"):
+        return False, f"{plat} EP{ep} needs internode (not validated)"
+    allowed = ((binfo.get("phase_constraints") or {}).get(mode) or {}).get("phases")
+    if allowed and phase not in allowed:
+        return False, f"{beng} mode={mode} is {allowed}-only (got {phase})"
+    if mode == "ll" and contract == "cached-layout-comm-only-v1":
+        return False, "cached-layout meaningless for LL"
+    return True, "ok"
+
+
+def expand_backends(spec, plat, platforms, backends):
+    """Normalise one suite backend entry to a list; 'all' -> the platform vendor's registered backends."""
+    if spec == "all":
+        vendor = platforms["platforms"][plat]["vendor"]
+        registry = backends["backends"]
+        return [name for name in backends["vendor_backends"][vendor] if name in registry]
+    return spec if isinstance(spec, list) else [spec]
+
+
+def generate(suite_name):
+    """Resolve one named suite into validated cases, recorded omissions, shards and canaries.
+    Returns a JSON-serialisable dict; raises SystemExit if `suite_name` is not in suites.yaml."""
+    suites = _load("suites.yaml")["suites"]
+    platforms = _load("platforms.yaml")
+    backends = _load("backends.yaml")
+    workloads = _load("workloads.yaml")  # parsed (so a broken file fails here) but not consulted below
+    if suite_name not in suites:
+        raise SystemExit(f"unknown suite {suite_name}; have {sorted(suites)}")
+    s = suites[suite_name]
+    phases, routings = s.get("phases", ["decode"]), s.get("routings", ["uniform"])
+    resource_modes = s.get("resource_modes", ["tuned"])
+    cases, omitted = [], []
+    for plat in s["platforms"]:  # backend set and EP degrees depend only on the platform
+        bset = {b for bspec in s["backends"]
+                for b in expand_backends(bspec, plat, platforms, backends)}
+        eps = s.get("ep_degrees") or platforms["platforms"][plat]["validated"]["ep_degrees"]
+        for beng in sorted(bset):
+            for wl, mode, dtype, contract, routing, ep, phase, rmode in itertools.product(
+                    s["workloads"], s["modes"], s.get("dtypes", ["bf16"]), s["contracts"],
+                    routings, eps, phases, resource_modes):
+                ok, reason = resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase,
+                                          platforms, backends)
+                rec = {"workload": wl, "platform": plat, "backend": beng, "mode": mode,
+                       "dtype": dtype, "contract": contract, "routing": routing, "ep": ep,
+                       "phase": phase, "resource_mode": rmode}
+                (cases if ok else omitted).append({**rec, **({} if ok else {"reason": reason})})
+    # SHARDS: one allocation per (platform, backend, mode, resource, image) runs many points.
+    shards = {}
+    for c in cases:
+        img = backends["backends"][c["backend"]].get("required_image", "?")
+        key = (c["platform"], c["backend"], c["mode"], c["resource_mode"], img)
+        shards.setdefault(key, []).append(c)
+    shard_list = [{"platform": k[0], "backend": k[1], "mode": k[2], "resource_mode": k[3],
+                   "image": k[4], "cases": v} for k, v in shards.items()]
+    # CANARY: smallest case (lowest EP, first generated on ties) per (platform, backend, mode, contract).
+    canary = {}
+    for c in cases:
+        ck = (c["platform"], c["backend"], c["mode"], c["contract"])
+        if ck not in canary or c["ep"] < canary[ck]["ep"]:
+            canary[ck] = c
+    return {"suite": suite_name, "required_publication": s.get("required_publication"),
+            "n_cases": len(cases), "n_omitted": len(omitted), "cases": cases, "omitted": omitted,
+            "shards": shard_list, "canaries": list(canary.values())}
+
+
+def main() -> int:
+    """CLI entry: print the suite summary and de-duplicated omissions; with --out also write the JSON."""
+    ap = argparse.ArgumentParser(description="CollectiveX matrix generator")
+    ap.add_argument("--suite", required=True)
+    ap.add_argument("--out")
+    a = ap.parse_args()
+    m = generate(a.suite)
+    print(f"suite={m['suite']} required={m['required_publication']}: "
+          f"{m['n_cases']} valid cases, {m['n_omitted']} omitted, "
+          f"{len(m['shards'])} shards, {len(m['canaries'])} canaries")
+    uniq = dict.fromkeys((o["platform"], o["backend"], o["mode"], o["dtype"], o["contract"], o["reason"])
+                         for o in m["omitted"])  # first-seen order, one line per distinct omission
+    for plat, beng, mode, dtype, contract, reason in uniq:
+        print(f"  OMIT {plat}/{beng}/{mode}/{dtype}/{contract}: {reason}")
+    if a.out:
+        os.makedirs(os.path.dirname(os.path.abspath(a.out)), exist_ok=True)  # --out may name a new dir
+        with open(a.out, "w") as fh:
+            fh.write(json.dumps(m, indent=2) + "\n")
+        print(f"wrote {a.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/launchers/_b300_investigate.sh b/experimental/CollectiveX/launchers/_b300_investigate.sh
new file mode 100644
index 000000000..68cac0b95
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_b300_investigate.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# B300 DeepEP perf investigation (run via srun on an 8-GPU B300 node).
+# (1) Diagnose the installed deep_ep build: file, version, and the CUDA archs its
+#     .so actually contains (sm_100 present? or only sm_90 -> JIT-from-PTX = slow).
+# (2) Reproducibility: run the SAME decode config 3x back-to-back in one container
+#     (high warmup) and report T=64 dispatch p50 each time -> is variance < 10%, or
+#     is the noise a first-config cold-start artifact?
+# Env overrides: NG (torchrun --nproc_per_node), RUNNER, TOPO.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-b300-8x}"; TOPO="${TOPO:-b300-nvlink-island}"
+
+echo "=== GPU ==="; nvidia-smi --query-gpu=name --format=csv,noheader | head -1
+echo "=== deep_ep build diagnosis ==="
+python3 - <<'PY'
+import importlib.metadata as md, deep_ep, glob, os, re, subprocess
+
+def archs(flag, so):
+    """List every sm_<N>[suffix] target cuobjdump reports for `so`, in numeric order."""
+    out = subprocess.run(["cuobjdump", flag, so], capture_output=True, text=True, timeout=60).stdout
+    # Capture ALL digits (+ optional letter suffix): a fixed 2-char slice turns sm_100 and
+    # sm_103 into the same "10", hiding exactly the Blackwell targets this check looks for.
+    found = set(re.findall(r"sm_(\d+)([a-z]?)", out))
+    return [num + suf for num, suf in sorted(found, key=lambda t: (int(t[0]), t[1]))]
+
+print("deep_ep:", md.version("deep_ep"), deep_ep.__file__)
+d = os.path.dirname(deep_ep.__file__)
+sos = glob.glob(os.path.join(d, "**", "*.so"), recursive=True) + glob.glob(os.path.join(d, "..", "deep_ep_cpp*.so"))
+for so in sorted(set(sos)):
+    print("so:", so)
+    try:
+        print("   ELF archs (cubin):", archs("--list-elf", so) or "<none>")
+        print("   PTX archs:", archs("--list-ptx", so) or "<none>")
+    except Exception as e:
+        print("   cuobjdump failed:", repr(e))
+PY
+
+echo "=== reproducibility: decode bf16 x3 (warmup 30, iters 80) ==="
+for i in 1 2 3; do
+  out="results/_repro_b300_decode_bf16_run${i}.json"
+  # Drop any JSON left by an earlier invocation so a failed run cannot be reported
+  # below with stale numbers.
+  rm -f "$out"
+  timeout -k 30 600 torchrun --nproc_per_node="$NG" tests/run_ep.py \
+    --backend deepep --mode normal --dispatch-dtype bf16 --phase decode \
+    --routing uniform --resource-mode tuned \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \
+    --tokens-ladder "64" --warmup 30 --iters 80 --out "$out" >/dev/null 2>&1
+  rc=$?
+  python3 - "$out" "$i" "$rc" <<'PY'
+import json,sys
+try:
+    d=json.load(open(sys.argv[1])); r=d["rows"][0]
+    print(f"run{sys.argv[2]}: T=64 dispatch_p50={r['dispatch_us_p50']:.1f} combine_p50={r['combine_us_p50']:.1f} "
+          f"dispatch_p99={r['dispatch_us_p99']:.1f} status={d['status']}")
+except Exception as e:
+    print(f"run{sys.argv[2]}: FAILED rc={sys.argv[3]} {e!r}")
+PY
+done
+echo "=== DONE ==="
diff --git a/experimental/CollectiveX/launchers/_gb300_ep8.sh b/experimental/CollectiveX/launchers/_gb300_ep8.sh
new file mode 100644
index 000000000..a0b50c543
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_gb300_ep8.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# GB300 EP8 sweep — 2 nodes x 4 GPU over the NVL72 MNNVL NVLink domain. Runs the SAME
+# v3 DeepEP matrix as the EP4 run (normal: bf16/fp8 x {layout-and-dispatch, cached},
+# decode 1..128 + prefill 128..512) but at EP8, so the curves overlay the other EP8 SKUs
+# (H100/H200/MI355X) at matched tokens/rank = same global batch.
+#
+# PROBE FINDING (2026-06-25): DeepEP 1.1.0+814e508 intranode Buffer(group, nvl, 0) works
+# UNCHANGED across 2 NVL72 trays — the MNNVL fabric is one NVLink P2P domain (rdma_rank
+# layout=None). So no internode/NVSHMEM/adapter change: just torchrun-free 8-rank srun.
+# NCCL_MNNVL_ENABLE/CUMEM are required for the nccl process group + barriers across trays.
+#
+# Multi-node has no torchrun: each of the 8 srun tasks IS one rank and runs run_ep.py
+# directly, taking RANK/WORLD_SIZE/LOCAL_RANK/MASTER_ADDR/MASTER_PORT from SLURM_* env.
+set -uo pipefail
+IMAGE="${CX_IMAGE:-/data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh}"
+STAGE="${CX_STAGE:-/data/sa-shared/cx_stage}"
+PART="${CX_PARTITION:-batch_1}"; ACCT="${CX_ACCOUNT:-benchmark}"
+JOBNAME="${JOBNAME:-cx_gb300_ep8}"; MP="${MASTER_PORT:-29513}"
+RUNNER="${RUNNER:-gb300-8x}"; TOPO="${TOPO:-gb300-nvl72-mnnvl}"; TRANSPORT="${TRANSPORT:-mnnvl}"
+WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}"
+DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}"
+DO_LL="${DO_LL:-0}"          # Blackwell aborts LL (B300/GB300); normal-only by default
+EP_ENV="${CX_EP_ENV:-}"      # extra --export csv (intranode needs none; reserved for internode)
+export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}"
+
+echo "[orch] salloc 2x4 GPU partition=$PART acct=$ACCT runner=$RUNNER"
+# Take the job id from salloc's own "... job allocation <id>" line: a squeue lookup by name
+# can return an older or concurrent job that shares $JOBNAME, which the EXIT trap would then
+# scancel. The by-name lookup is kept only as a fallback.
+ALLOC_LOG="$(salloc --partition="$PART" --account="$ACCT" --nodes=2 --gres=gpu:4 \
+       --ntasks-per-node=4 --exclusive --time="${CX_TIME:-90}" --no-shell --job-name="$JOBNAME" 2>&1)"
+printf '%s\n' "$ALLOC_LOG" | tail -3
+JID="$(printf '%s\n' "$ALLOC_LOG" | sed -n 's/.*job allocation \([0-9][0-9]*\).*/\1/p' | head -n1)"
+[ -n "$JID" ] || JID="$(squeue --name="$JOBNAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; }
+trap 'scancel "$JID" 2>/dev/null || true' EXIT
+st=""
+for i in $(seq 1 60); do
+  st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"
+  echo "[orch] tick=$i state=$st nodes=$(squeue -j "$JID" -h -o %N 2>/dev/null)"
+  [ "$st" = "RUNNING" ] && break
+  [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; }
+  sleep 8
+done
+[ "$st" = "RUNNING" ] || { echo "[orch] FATAL never started"; exit 1; }
+NODELIST="$(squeue -j "$JID" -h -o %N)"; MA="$(scontrol show hostnames "$NODELIST" | head -1)"
+echo "[orch] JOB_ID=$JID nodes=[$NODELIST] MASTER_ADDR=$MA MASTER_PORT=$MP"
+
+CMOUNT=(--container-image="$IMAGE" --container-mounts="$STAGE:/cx"
+        --no-container-mount-home --container-workdir=/cx --no-container-entrypoint)
+WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+run(){  # phase dtype mode contract ladder
+  local phase="$1" dt="$2" mode="$3" contract="$4" ladder="$5"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${mode}_${contract}.json"
+  echo "### $phase dtype=$dt mode=$mode contract=$contract -> $out"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JID" --nodes=2 --ntasks=8 --ntasks-per-node=4 \
+    "${CMOUNT[@]}" \
+    --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",COLLECTIVEX_IMAGE="$IMAGE",NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1${EP_ENV:+,$EP_ENV} \
+    bash -c "$WRAP" _ \
+      --backend deepep --phase "$phase" --dispatch-dtype "$dt" --mode "$mode" \
+      --measurement-contract "$contract" --routing uniform --resource-mode tuned \
+      --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+      --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" --out "$out" </dev/null 2>&1 | tail -7
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+if [ "${CX_LL_ONLY:-0}" != "1" ]; then
+  # decode normal: both dtypes x both contracts (layout cost made explicit) — matches EP4
+  run decode  bf16 normal layout-and-dispatch-v1      "$DEC"
+  run decode  fp8  normal layout-and-dispatch-v1      "$DEC"
+  run decode  bf16 normal cached-layout-comm-only-v1  "$DEC"
+  run decode  fp8  normal cached-layout-comm-only-v1  "$DEC"
+  # prefill normal (cross-vendor contract)
+  run prefill bf16 normal layout-and-dispatch-v1 "$PRE"
+  run prefill fp8  normal layout-and-dispatch-v1 "$PRE"
+fi
+if [ "$DO_LL" = "1" ]; then
+  run decode bf16 ll layout-and-dispatch-v1 "$DEC"
+  run decode fp8  ll layout-and-dispatch-v1 "$DEC"
+fi
+
+echo "=== SUMMARY ==="
+# The ranks write to results/ under the container workdir /cx, which is $STAGE on this host,
+# so glob $STAGE/results instead of a path relative to wherever this script was started.
+for f in "$STAGE"/results/"${RUNNER}"_deepep_*.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+name=sys.argv[1].split('/')[-1]
+try:
+    d=json.load(open(sys.argv[1])); m=d.get("metrics") or {}; ri=d.get("routing_identity") or {}
+    print(f"{name:64s} {d['status']:7s} routing_ok={ri.get('consistent_across_ranks')} "
+          f"contract={d['measurement_contract']:26s} T{m.get('headline_tokens_per_rank')} "
+          f"disp_p50/p99={m.get('dispatch_us_p50') or 0:.1f}/{m.get('dispatch_us_p99') or 0:.1f}")
+except Exception as e:
+    print(f"{name:64s} UNREADABLE {e!r}")
+PY
+done
+scancel "$JID" 2>/dev/null || true
+echo "=== GB300 EP8 DONE ==="
diff --git a/experimental/CollectiveX/launchers/_gb300_probe.sh b/experimental/CollectiveX/launchers/_gb300_probe.sh
new file mode 100644
index 000000000..0bbe564de
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_gb300_probe.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# GB300 EP8 probe orchestrator — runs on im-gb300-login-02. Allocates 2 nodes (8 GPU,
+# 4/node), then runs tests/_gb300_ep_probe.py across 8 ranks for each DeepEP path
+# (intranode / internode / ll) to find which works across 2 NVL72 trays. Read-only.
+set -uo pipefail
+IMAGE="${CX_IMAGE:-/data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh}"
+STAGE="${CX_STAGE:-/data/sa-shared/cx_stage}"
+PART="${CX_PARTITION:-batch_1}"
+ACCT="${CX_ACCOUNT:-benchmark}"
+JOBNAME="${JOBNAME:-cx_gb300_probe}"
+MP="${MASTER_PORT:-29512}"
+export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}"
+
+echo "[orch] salloc 2x4 GPU partition=$PART acct=$ACCT image=$IMAGE"
+# Take the job id from salloc's own "... job allocation <id>" line: a squeue lookup by name
+# can return an older or concurrent job that shares $JOBNAME, which the EXIT trap would then
+# scancel. The by-name lookup is kept only as a fallback.
+ALLOC_LOG="$(salloc --partition="$PART" --account="$ACCT" --nodes=2 --gres=gpu:4 \
+       --ntasks-per-node=4 --exclusive --time=30 --no-shell --job-name="$JOBNAME" 2>&1)"
+printf '%s\n' "$ALLOC_LOG" | tail -3
+JID="$(printf '%s\n' "$ALLOC_LOG" | sed -n 's/.*job allocation \([0-9][0-9]*\).*/\1/p' | head -n1)"
+[ -n "$JID" ] || JID="$(squeue --name="$JOBNAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; }
+trap 'scancel "$JID" 2>/dev/null || true' EXIT
+
+st=""
+for i in $(seq 1 60); do
+  st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"
+  echo "[orch] tick=$i state=$st nodes=$(squeue -j "$JID" -h -o %N 2>/dev/null)"
+  [ "$st" = "RUNNING" ] && break
+  [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; }
+  sleep 8
+done
+[ "$st" = "RUNNING" ] || { echo "[orch] FATAL never started"; exit 1; }
+
+NODELIST="$(squeue -j "$JID" -h -o %N)"
+MA="$(scontrol show hostnames "$NODELIST" | head -1)"
+echo "[orch] JOB_ID=$JID nodes=[$NODELIST] MASTER_ADDR=$MA MASTER_PORT=$MP"
+
+CMOUNT=(--container-image="$IMAGE" --container-mounts="$STAGE:/cx"
+        --no-container-mount-home --container-workdir=/cx
+        --no-container-entrypoint)
+WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/_gb300_ep_probe.py'
+
+for path in intranode internode ll; do
+  echo "=== PROBE path=$path (8 ranks / 2 nodes) ==="
+  srun --jobid="$JID" --nodes=2 --ntasks=8 --ntasks-per-node=4 "${CMOUNT[@]}" \
+    --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",CX_PROBE_PATH="$path",COLLECTIVEX_IMAGE="$IMAGE",NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1 \
+    bash -c "$WRAP" </dev/null 2>&1 | grep -E 'RESULT|deep_ep=|Buffer.__init__|caps:|world=|FAIL|\| ' || echo "[orch] path=$path produced no RESULT line (rc=${PIPESTATUS[0]})"
+  echo "=== end $path ==="
+done
+
+scancel "$JID" 2>/dev/null || true
+echo "=== GB300 PROBE DONE ==="
diff --git a/experimental/CollectiveX/launchers/_gb300_routing.sh b/experimental/CollectiveX/launchers/_gb300_routing.sh
new file mode 100644
index 000000000..6ba9c412c
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_gb300_routing.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# GB300 EP8 routing-axis sweep — 2 nodes x 4 GPU over NVL72 MNNVL. Headline config
+# (bf16/normal/layout-and-dispatch-v1) under balanced / zipf / zipf+EPLB, routing-tagged
+# filenames. Same srun-8-ranks-no-torchrun harness as _gb300_ep8.sh.
+set -uo pipefail
+IMAGE="${CX_IMAGE:-/data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh}"
+STAGE="${CX_STAGE:-/data/sa-shared/cx_stage}"
+PART="${CX_PARTITION:-batch_1}"; ACCT="${CX_ACCOUNT:-benchmark}"
+JOBNAME="${JOBNAME:-cx_gb300_rt}"; MP="${MASTER_PORT:-29517}"
+RUNNER="${RUNNER:-gb300-8x}"; TOPO="${TOPO:-gb300-nvl72-mnnvl}"; TRANSPORT="${TRANSPORT:-mnnvl}"
+WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}"
+DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}"; DO_EPLB="${DO_EPLB:-1}"
+export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}"
+
+echo "[orch] salloc 2x4 GPU partition=$PART runner=$RUNNER (routing sweep)"
+# Take the job id from salloc's own "... job allocation <id>" line: a squeue lookup by name
+# can return an older or concurrent job that shares $JOBNAME, which the EXIT trap would then
+# scancel. The by-name lookup is kept only as a fallback.
+ALLOC_LOG="$(salloc --partition="$PART" --account="$ACCT" --nodes=2 --gres=gpu:4 \
+       --ntasks-per-node=4 --exclusive --time="${CX_TIME:-90}" --no-shell --job-name="$JOBNAME" 2>&1)"
+printf '%s\n' "$ALLOC_LOG" | tail -3
+JID="$(printf '%s\n' "$ALLOC_LOG" | sed -n 's/.*job allocation \([0-9][0-9]*\).*/\1/p' | head -n1)"
+[ -n "$JID" ] || JID="$(squeue --name="$JOBNAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; }
+trap 'scancel "$JID" 2>/dev/null || true' EXIT
+st=""
+for i in $(seq 1 60); do
+  st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"; echo "[orch] tick=$i state=$st"
+  [ "$st" = "RUNNING" ] && break
+  [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; }
+  sleep 8
+done
+[ "$st" = "RUNNING" ] || { echo "[orch] FATAL never started"; exit 1; }
+MA="$(scontrol show hostnames "$(squeue -j "$JID" -h -o %N)" | head -1)"
+echo "[orch] JOB_ID=$JID MASTER_ADDR=$MA"
+CMOUNT=(--container-image="$IMAGE" --container-mounts="$STAGE:/cx"
+        --no-container-mount-home --container-workdir=/cx --no-container-entrypoint)
+WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+run(){  # phase routing eplbflag tag ladder
+  local phase="$1" routing="$2" eplb="$3" tag="$4" ladder="$5"
+  local out="results/${RUNNER}_deepep_${phase}_bf16_normal_layout-and-dispatch-v1_${tag}.json"
+  echo "### $phase routing=$routing eplb='${eplb}' -> $out"
+  # $eplb is deliberately unquoted below: when empty it must expand to no argument at all.
+  # shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JID" --nodes=2 --ntasks=8 --ntasks-per-node=4 \
+    "${CMOUNT[@]}" \
+    --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",COLLECTIVEX_IMAGE="$IMAGE",NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1 \
+    bash -c "$WRAP" _ \
+      --backend deepep --phase "$phase" --dispatch-dtype bf16 --mode normal \
+      --measurement-contract layout-and-dispatch-v1 --routing "$routing" $eplb --resource-mode tuned \
+      --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+      --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" --out "$out" </dev/null 2>&1 | tail -7
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+for ph in decode prefill; do
+  L="$DEC"; [ "$ph" = prefill ] && L="$PRE"
+  run "$ph" balanced ""       balanced "$L"
+  run "$ph" zipf     ""       zipf     "$L"
+  [ "$DO_EPLB" = 1 ] && run "$ph" zipf "--eplb" zipf+eplb "$L"
+done
+scancel "$JID" 2>/dev/null || true
+echo "=== GB300 ROUTING DONE ==="
diff --git a/experimental/CollectiveX/launchers/_mi355x_canon.sh b/experimental/CollectiveX/launchers/_mi355x_canon.sh
new file mode 100644
index 000000000..3ffa101d2
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_mi355x_canon.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# MI355X cross-vendor canonical-workload consume (goal DoD 183): MoRI consumes the SAME serialized
+# trace bytes that H100 (NVIDIA) consumed (copied into /cx/cx_workloads), so the workload_id +
+# checksums in this AMD doc MATCH the NVIDIA doc -> "same trace on NVIDIA and AMD" is proven by
+# byte-identity, not by trusting two RNGs. MoRI-safe: bf16/normal, gradual ramp, low iters, bounded.
+set -uo pipefail
+cd /cx || exit 2   # no stage mount -> stop here instead of running from the wrong directory
+mkdir -p results
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+python3 -c "import mori;print('mori OK')" 2>&1 | tail -1
+echo "### canonical traces available:"; ls /cx/cx_workloads/*.manifest.json 2>/dev/null | wc -l
+out=results/mi355x-8x_mori_decode_bf16_normal_layout-and-dispatch-v1_canon.json
+# Remove a JSON left by an earlier invocation: the summary below must describe THIS run only.
+rm -f "$out"
+timeout -k 30 "${CX_RUN_TIMEOUT:-400}" torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \
+  --phase decode --tokens-ladder "${LADDER:-1 2 4 8 16 32 64}" --dispatch-dtype bf16 --mode normal \
+  --measurement-contract layout-and-dispatch-v1 --routing uniform --resource-mode tuned \
+  --workload-dir /cx/cx_workloads --warmup 8 --iters "${ITERS:-20}" --trials "${TRIALS:-1}" \
+  --runner mi355x-8x --topology-class mi355x-xgmi --transport xgmi --out "$out" 2>&1 | tail -14
+echo "### rc=${PIPESTATUS[0]} -> $out"
+[ -f "$out" ] && python3 - "$out" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); w=d.get("workload",{}); v=d.get("validity",{}); rows=d["rows"]
+print(f"workload_source={v.get('workload_source')} pub={d.get('publication_status')} "
+      f"workload_id={w.get('workload_id')} correct_all={bool(rows) and all(r['correct'] for r in rows)}")
+print("checksums:", json.dumps(w.get("manifest_checksums") or {})[:300])
+PY
+echo "=== MI355X CANON DONE ==="
diff --git a/experimental/CollectiveX/launchers/_mi355x_orchestrate.sh b/experimental/CollectiveX/launchers/_mi355x_orchestrate.sh
new file mode 100644
index 000000000..3bb91e155
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_mi355x_orchestrate.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Submit-host orchestrator for an MI355X MoRI validation run (contended cluster).
+# salloc (queues behind serving sweeps) -> wait RUNNING -> enroot import to NFS (node-local
+# cache) -> srun the in-container MoRI driver -> scancel. Logs to ~/cx_stage/mori_orch.out.
+# Always </dev/null on srun (the cluster eats heredoc stdin otherwise).
+set -uo pipefail
+IMAGE="${CX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+SQKEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')"
+# Import to NFS home (persistent, compute-visible on ALL nodes) — node-local
+# /var/lib/squash is not writable on every node (cold import fails on g16, etc.).
+SQDIR="${CX_SQUASH_DIR:-$HOME/cx_squash}"
+SQ="$SQDIR/${SQKEY}.sqsh"
+LOCK="$SQDIR/${SQKEY}.lock"
+STAGE="$HOME/cx_stage"
+mkdir -p "$SQDIR"
+JOBNAME="${JOBNAME:-cx_mori}"
+WAIT_TICKS="${WAIT_TICKS:-150}"   # 150*12s = 30 min max queue wait
+EXCLUDE="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"   # same override as the repro orchestrator
+
+echo "[orch] salloc partition=compute exclude=$EXCLUDE gpu:8 exclusive"
+# Take the job id from salloc's own "... job allocation <id>" line: a squeue lookup by name
+# can return an older or concurrent job that shares $JOBNAME, which the EXIT trap would then
+# scancel. The by-name lookup (now limited to this user) is kept only as a fallback.
+ALLOC_LOG="$(salloc --partition=compute --exclude="$EXCLUDE" --gres=gpu:8 \
+       --exclusive --cpus-per-task=128 --time=60 --no-shell --job-name="$JOBNAME" 2>&1)"
+printf '%s\n' "$ALLOC_LOG" | tail -2
+JID="$(printf '%s\n' "$ALLOC_LOG" | sed -n 's/.*job allocation \([0-9][0-9]*\).*/\1/p' | head -n1)"
+[ -n "$JID" ] || JID="$(squeue --name="$JOBNAME" -u "${USER:-$(id -un)}" -h -o %A | head -n1)"
+[ -n "$JID" ] || { echo "[orch] FATAL: no JOB_ID"; exit 1; }
+echo "[orch] JOB_ID=$JID"
+trap 'scancel "$JID" 2>/dev/null || true' EXIT
+
+st=""
+for i in $(seq 1 "$WAIT_TICKS"); do
+  st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"
+  node="$(squeue -j "$JID" -h -o %N 2>/dev/null)"
+  echo "[orch] tick=$i state=$st node=$node"
+  [ "$st" = "RUNNING" ] && break
+  [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; }
+  sleep 12
+done
+[ "$st" = "RUNNING" ] || { echo "[orch] FATAL: never started (state=$st)"; exit 1; }
+echo "[orch] RUNNING on $(squeue -j "$JID" -h -o %N)"
+
+echo "[orch] enroot import to NFS (cache redirected to writable node-local /tmp)"
+# Default ENROOT_CACHE_PATH=/var/lib/enroot/cache is root-only here ("Permission denied",
+# exit 9). Redirect cache/data/temp to node-local /tmp (writable, fast); the OUTPUT squash
+# (-o $SQ) still lands on NFS so it persists + is visible on every node next time.
+srun --jobid="$JID" bash -c "
+  export ENROOT_CACHE_PATH=/tmp/enroot_cache_\$USER ENROOT_DATA_PATH=/tmp/enroot_data_\$USER ENROOT_TEMP_PATH=/tmp/enroot_tmp_\$USER
+  mkdir -p \"\$ENROOT_CACHE_PATH\" \"\$ENROOT_DATA_PATH\" \"\$ENROOT_TEMP_PATH\"
+  exec 9>\"$LOCK\" || exit 1
+  flock -w 1200 9 || { echo 'lock timeout'; exit 1; }
+  if unsquashfs -l \"$SQ\" >/dev/null 2>&1; then echo 'squash present: $SQ';
+  else echo 'importing $IMAGE'; rm -f \"$SQ\"; enroot import -o \"$SQ\" \"docker://$IMAGE\" </dev/null && echo 'import OK'; fi
+" </dev/null 2>&1 | tail -20
+import_rc="${PIPESTATUS[0]}"
+# Lock timeout or a failed import leaves no usable squash; stop now (the EXIT trap scancels).
+[ "$import_rc" -eq 0 ] || { echo "[orch] FATAL: squash import failed (rc=$import_rc)"; exit 1; }
+
+echo "[orch] === srun MoRI driver ==="
+srun --jobid="$JID" \
+  --container-image="$SQ" --container-mounts="$STAGE:/cx" \
+  --container-writable --container-remap-root --no-container-mount-home \
+  --container-workdir=/cx --no-container-entrypoint --export=ALL \
+  bash /cx/launchers/_validate_mori.sh </dev/null 2>&1
+
+echo "[orch] scancel $JID"
+scancel "$JID" 2>/dev/null || true
+echo "=== ORCH DONE ==="
diff --git a/experimental/CollectiveX/launchers/_mi355x_repro_orchestrate.sh b/experimental/CollectiveX/launchers/_mi355x_repro_orchestrate.sh
new file mode 100644
index 000000000..ecf3bc0c2
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_mi355x_repro_orchestrate.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Submit-host orchestrator for MI355X MoRI 3-run reproducibility. (squash must already be
+# on NFS) -> salloc -> srun the in-container driver ($CX_DRIVER, default _v3_mori.sh).
+# Logs to ~/cx_stage/mori_repro.out.
+set -uo pipefail
+IMAGE="${CX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+SQKEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')"
+SQDIR="${CX_SQUASH_DIR:-$HOME/cx_squash}"
+SQ="$SQDIR/${SQKEY}.sqsh"
+STAGE="$HOME/cx_stage"
+JOBNAME="${JOBNAME:-cx_mrepro}"
+DRIVER="${CX_DRIVER:-_v3_mori.sh}"
+
+# Check the squash BEFORE queueing: this script never imports it, so without it the
+# allocation could only be waited for and then thrown away.
+unsquashfs -l "$SQ" >/dev/null 2>&1 || { echo "[orch] FATAL: squash missing $SQ"; exit 1; }
+
+EXCLUDE="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"
+echo "[orch] salloc partition=compute exclude=$EXCLUDE gpu:8"
+# Take the job id from salloc's own "... job allocation <id>" line: a squeue lookup by name
+# can return an older or concurrent job that shares $JOBNAME, which the EXIT trap would then
+# scancel. The by-name lookup (now limited to this user) is kept only as a fallback.
+ALLOC_LOG="$(salloc --partition=compute --exclude="$EXCLUDE" --gres=gpu:8 \
+       --exclusive --cpus-per-task=128 --time=30 --no-shell --job-name="$JOBNAME" 2>&1)"
+printf '%s\n' "$ALLOC_LOG" | tail -2
+JID="$(printf '%s\n' "$ALLOC_LOG" | sed -n 's/.*job allocation \([0-9][0-9]*\).*/\1/p' | head -n1)"
+[ -n "$JID" ] || JID="$(squeue --name="$JOBNAME" -u "${USER:-$(id -un)}" -h -o %A | head -n1)"
+[ -n "$JID" ] || { echo "[orch] FATAL: no JOB_ID"; exit 1; }
+echo "[orch] JOB_ID=$JID"
+trap 'scancel "$JID" 2>/dev/null || true' EXIT
+
+st=""
+for i in $(seq 1 150); do
+  st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"
+  echo "[orch] tick=$i state=$st node=$(squeue -j "$JID" -h -o %N 2>/dev/null)"
+  [ "$st" = "RUNNING" ] && break
+  [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; }
+  sleep 12
+done
+[ "$st" = "RUNNING" ] || { echo "[orch] FATAL: never started"; exit 1; }
+
+echo "[orch] === srun $DRIVER (mori) ==="
+srun --jobid="$JID" \
+  --container-image="$SQ" --container-mounts="$STAGE:/cx" \
+  --container-writable --container-remap-root --no-container-mount-home \
+  --container-workdir=/cx --no-container-entrypoint --export=ALL \
+  env COLLECTIVEX_IMAGE="$IMAGE" RUNNER=mi355x-8x TOPO=mi355x-xgmi \
+  bash "/cx/launchers/$DRIVER" </dev/null 2>&1
+scancel "$JID" 2>/dev/null || true
+echo "=== ORCH DONE ==="
diff --git a/experimental/CollectiveX/launchers/_mori_repro.sh b/experimental/CollectiveX/launchers/_mori_repro.sh
new file mode 100644
index 000000000..8f98f8ce9
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_mori_repro.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# MoRI 3-run reproducibility using the invocation _validate_mori.sh proved works
+# (full ladders, warmup 8; iters raised 40 -> 100, see one()) — the single-point _repro.sh
+# path wedges MoRI mid-ramp on this contended cluster. Each run writes run-tagged decode+prefill
+# JSONs; we extract T=64 (decode) and T=512 (prefill) and report the spread. Short
+# per-run timeout so a wedge fails fast instead of burning the allocation.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-mi355x-8x}"; TOPO="${TOPO:-mi355x-xgmi}"
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+TMO="${CX_RUN_TIMEOUT:-220}"
+
+one() {  # $1=phase $2=ladder $3=run
+  local phase="$1" ladder="$2" i="$3"
+  local out="results/_morirepro_${phase}_run${i}.json"
+  # The SPREAD step globs every _morirepro_*_run*.json, so delete the previous invocation's
+  # file first: a run that fails now must not contribute an old sample.
+  rm -f "$out"
+  # iters 100 (was 40): MoRI decode is ~44us, so a 40-sample p50 jitters ~10% run-to-run;
+  # a 100-sample median is tighter. Still below the sustained-iter count that wedges MoRI.
+  timeout -k 20 "$TMO" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend mori \
+    --mode normal --dispatch-dtype bf16 --phase "$phase" --routing uniform \
+    --resource-mode tuned --tokens-ladder "$ladder" --warmup 8 --iters "${MORI_ITERS:-100}" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi \
+    --out "$out" >"$out.log" 2>&1
+  local rc=$?
+  if [ "$rc" -ne 0 ]; then echo "  run$i $phase rc=$rc (see $out.log)"; return; fi
+}
+
+for i in 1 2 3; do
+  echo "## run $i"
+  one decode  "1 2 4 8 16 32 64 128" "$i"
+  one prefill "128 256 512" "$i"
+done
+
+echo "=== SPREAD (dispatch p50) ==="
+python3 - <<'PY'
+import json, glob
+def at(phase, T):
+    vals = []
+    for f in sorted(glob.glob(f"results/_morirepro_{phase}_run*.json")):
+        try:
+            d = json.load(open(f))
+            r = next(r for r in d["rows"] if r["tokens_per_rank"] == T)
+            vals.append(round(r["dispatch_us_p50"], 1))
+        except Exception:
+            pass
+    if len(vals) >= 2:
+        sp = (max(vals) - min(vals)) / min(vals) * 100
+        print(f"  {phase} T={T}: dispatch_p50 {vals} spread={sp:.1f}% [{'OK <=10%' if sp<=10 else 'OVER'}]")
+    else:
+        print(f"  {phase} T={T}: insufficient ({len(vals)})")
+at("decode", 64)
+at("prefill", 512)
+PY
+echo "=== REPRO DONE ==="
diff --git a/experimental/CollectiveX/launchers/_repro.sh b/experimental/CollectiveX/launchers/_repro.sh
new file mode 100644
index 000000000..641852d18
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_repro.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# 3-run p50 reproducibility driver (run via srun on an 8-GPU node, in one allocation
+# so all three runs share the exact environment). Runs the acceptance points —
+# decode T=64 and prefill T=512 — three times each and prints dispatch/serial p50 per
+# run so the <=10% spread is checkable. Backend/precision/mode via env.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"
+BACKEND="${BACKEND:-deepep}"
+RUNNER="${RUNNER:-x-8x}"
+TOPO="${TOPO:-x}"
+TRANSPORT="${TRANSPORT:-nvlink}"
+DT="${DT:-bf16}"; MODE="${MODE:-normal}"; RM="${RM:-tuned}"
+
+echo "=== repro: backend=$BACKEND dtype=$DT mode=$MODE resource=$RM runner=$RUNNER ==="
+repro() {  # $1=phase $2=T
+  # Run the single-point ladder T for phase $1 three times. Each run writes its own JSON
+  # (console output in <json>.log; the last 6 log lines are shown when torchrun fails)
+  # and is followed by a one-line p50 report parsed from that JSON.
+  local phase="$1" T="$2" i out
+  echo "## $phase T=$T x3"
+  for i in 1 2 3; do
+    out="results/_repro_${RUNNER}_${BACKEND}_${phase}_T${T}_${DT}_${MODE}_run${i}.json"
+    # Remove any JSON from a previous invocation: otherwise a failed run would be reported
+    # (and counted in the final spread) with stale numbers instead of as FAILED.
+    rm -f "$out"
+    timeout -k 30 700 torchrun --nproc_per_node="$NG" tests/run_ep.py --backend "$BACKEND" \
+      --phase "$phase" --tokens-ladder "$T" --dispatch-dtype "$DT" --mode "$MODE" \
+      --resource-mode "$RM" --routing uniform --runner "$RUNNER" --topology-class "$TOPO" \
+      --transport "$TRANSPORT" --warmup "${WARMUP:-32}" --iters "${ITERS:-200}" \
+      --out "$out" >"$out.log" 2>&1 || tail -6 "$out.log"
+    python3 - "$out" "$i" "$T" <<'PY'
+import json,sys
+try:
+    d=json.load(open(sys.argv[1])); T=int(sys.argv[3])
+    # MoRI's gradual ramp expands the ladder ([1..T]); pick the row that IS T, not rows[0].
+    r=next(r for r in d["rows"] if r["tokens_per_rank"]==T)
+    print(f"  run{sys.argv[2]} T={sys.argv[3]} dispatch_p50={r['dispatch_us_p50']:.1f} "
+          f"combine_p50={r['combine_us_p50']:.1f} serial_p50={r['serial_us_p50']:.1f} status={d['status']}")
+except Exception as e:
+    print(f"  run{sys.argv[2]} T={sys.argv[3]} FAILED {e!r}")
+PY
+  done
+}
+
+# The two acceptance points.
+repro decode 64
+repro prefill 512
+
+printf '%s\n' "=== SPREAD (max-min)/min at each point ==="
+python3 - "$RUNNER" "$BACKEND" "$DT" "$MODE" <<'PY'
+import glob
+import json
+import sys
+
+runner, backend, dt, mode = sys.argv[1:5]
+
+
+def p50_values(phase, T):
+    # dispatch p50 of the row with tokens_per_rank == T (ramp-safe) from each run file;
+    # files that cannot be read or lack that row are skipped.
+    pattern = f"results/_repro_{runner}_{backend}_{phase}_T{T}_{dt}_{mode}_run*.json"
+    found = []
+    for path in sorted(glob.glob(pattern)):
+        try:
+            doc = json.load(open(path))
+            row = next(x for x in doc["rows"] if x["tokens_per_rank"] == T)
+            found.append(row["dispatch_us_p50"])
+        except Exception:
+            continue
+    return found
+
+
+for phase, T in (("decode", 64), ("prefill", 512)):
+    vals = p50_values(phase, T)
+    if len(vals) < 2:
+        print(f"  {phase} T={T}: insufficient runs ({len(vals)})")
+        continue
+    spread = (max(vals) - min(vals)) / min(vals) * 100
+    ok = "OK <=10%" if spread <= 10 else "OVER 10%"
+    print(f"  {phase} T={T}: dispatch_p50 runs={[round(v,1) for v in vals]} spread={spread:.1f}% [{ok}]")
+PY
+printf '%s\n' "=== REPRO DONE ==="
diff --git a/experimental/CollectiveX/launchers/_routing_mori.sh b/experimental/CollectiveX/launchers/_routing_mori.sh
new file mode 100644
index 000000000..739a5299b
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_routing_mori.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# MoRI (MI355X) routing-axis sweep — balanced + zipf for the headline config (bf16/normal/
+# layout-and-dispatch-v1), the AMD unbalanced-vs-balanced datapoint. MoRI-safe params baked in
+# (gradual ramp via the harness, low iters, no warm-burst). No EPLB (kept to DeepEP — MoRI is
+# fragile and the 288-physical-expert set is extra risk). Routing-tagged filenames.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-mi355x-8x}"; TOPO="${TOPO:-mi355x-xgmi}"
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+ITERS="${ITERS:-40}"; TRIALS="${TRIALS:-2}"
+
+run(){  # phase routing tag ladder
+  local phase="$1" routing="$2" tag="$3" ladder="$4"
+  local out="results/${RUNNER}_mori_${phase}_bf16_normal_layout-and-dispatch-v1_${tag}.json"
+  echo "### mori $phase routing=$routing -> $out"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-1100}" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend mori \
+    --phase "$phase" --dispatch-dtype bf16 --mode normal --measurement-contract layout-and-dispatch-v1 \
+    --routing "$routing" --resource-mode tuned --tokens-ladder "$ladder" \
+    --warmup 8 --iters "$ITERS" --trials "$TRIALS" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi --out "$out" 2>&1 | tail -8
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+python3 -c "import mori;print('mori OK')" 2>&1 | tail -1
+run decode  balanced balanced "1 2 4 8 16 32 64 128"
+run decode  zipf     zipf     "1 2 4 8 16 32 64 128"
+run prefill balanced balanced "128 256 512"
+run prefill zipf     zipf     "128 256 512"
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_mori_*_{balanced,zipf}.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{}); sh=d.get("shape",{})
+print(f"{sys.argv[1].split('/')[-1]:60s} {d['status']:7s} rt={sh.get('routing'):9s} ok={ri.get('consistent_across_ranks')} "
+      f"T{m.get('headline_tokens_per_rank')} disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}")
+PY
+done
+echo "=== MORI ROUTING DONE ==="
diff --git a/experimental/CollectiveX/launchers/_routing_rerun.sh b/experimental/CollectiveX/launchers/_routing_rerun.sh
new file mode 100644
index 000000000..3776774cd
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_routing_rerun.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Routing-axis sweep (single-node torchrun): the headline config (bf16 / normal /
+# layout-and-dispatch-v1) under balanced / zipf / zipf+EPLB, so the plot's Routing selector
+# compares balanced vs unbalanced vs EPLB. Filenames carry the routing tag so they never
+# overwrite the uniform v3 results. Reusable across NVIDIA (deepep) + AMD (mori) via env.
+#   BACKEND=deepep|mori  NG  RUNNER  TOPO  TRANSPORT  DEC/PRE ladders  DO_EPLB(1)  ITERS/TRIALS
+set -uo pipefail
+cd /cx 2>/dev/null || cd /ix/experimental/CollectiveX 2>/dev/null || { echo "no cx dir"; exit 2; }
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-x-8x}"; TOPO="${TOPO:-x}"; TRANSPORT="${TRANSPORT:-nvlink}"
+BACKEND="${BACKEND:-deepep}"; WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}"
+DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}"
+DO_EPLB="${DO_EPLB:-1}"          # mori: set 0 (skip EPLB, just balanced+zipf)
+PHASES="${PHASES:-decode prefill}"
+
+run(){  # phase routing eplbflag tag ladder
+  local phase="$1" routing="$2" eplb="$3" tag="$4" ladder="$5"
+  local out="results/${RUNNER}_${BACKEND}_${phase}_bf16_normal_layout-and-dispatch-v1_${tag}.json"
+  echo "### $phase routing=$routing eplb='${eplb}' -> $out"
+  # shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend "$BACKEND" \
+    --phase "$phase" --dispatch-dtype bf16 --mode normal --measurement-contract layout-and-dispatch-v1 \
+    --routing "$routing" $eplb --resource-mode tuned --tokens-ladder "$ladder" \
+    --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" --out "$out" 2>&1 | tail -7
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+for ph in $PHASES; do
+  L="$DEC"; [ "$ph" = prefill ] && L="$PRE"
+  run "$ph" balanced ""       balanced "$L"
+  run "$ph" zipf     ""       zipf     "$L"
+  [ "$DO_EPLB" = 1 ] && run "$ph" zipf "--eplb" zipf+eplb "$L"
+done
+
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_${BACKEND}_*_{balanced,zipf,zipf+eplb}.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{}); e=d.get("eplb",{})
+sh=d.get("shape",{}); tag=sh.get("routing")+("+eplb" if e.get("enabled") else "")
+imb=f" imb {e.get('imbalance_before'):.1f}->{e.get('imbalance_after'):.1f}x" if e.get("enabled") else ""
+print(f"{sys.argv[1].split('/')[-1]:62s} {d['status']:7s} rt={tag:11s} ok={ri.get('consistent_across_ranks')} "
+      f"T64 disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}{imb}")
+PY
+done
+echo "=== ROUTING SWEEP DONE ==="
diff --git a/experimental/CollectiveX/launchers/_singlenode_orchestrate.sh b/experimental/CollectiveX/launchers/_singlenode_orchestrate.sh
new file mode 100644
index 000000000..093c3b5f5
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_singlenode_orchestrate.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Generic single-node orchestrator (H100/H200/MI355X): salloc 1 node (NG GPU) -> srun the
+# in-container driver (default _routing_rerun.sh). Mirrors the GB300 orchestrator but single
+# node (driver uses torchrun internally). Env: CX_IMAGE CX_STAGE CX_PARTITION CX_ACCOUNT
+# RUNNER TOPO TRANSPORT BACKEND NG CX_DRIVER + sweep knobs (DEC PRE ITERS TRIALS DO_EPLB PHASES).
+set -uo pipefail
+# Required inputs: abort naming the variable if any is unset or empty.
+IMAGE="${CX_IMAGE:?CX_IMAGE}"; STAGE="${CX_STAGE:?CX_STAGE}"; PART="${CX_PARTITION:?CX_PARTITION}"
+JOBNAME="${JOBNAME:-cx_rt}"; NG="${NG:-8}"; DRIVER="${CX_DRIVER:-_routing_rerun.sh}"
+# Optional salloc flags, added only when the matching variable is non-empty.
+ACCT=(); [ -n "${CX_ACCOUNT:-}" ] && ACCT=(--account="$CX_ACCOUNT")
+EXTRA=(); [ -n "${CX_EXCLUDE:-}" ] && EXTRA=(--exclude="$CX_EXCLUDE")
+[ -n "${CX_CPUS:-}" ] && EXTRA+=(--cpus-per-task="$CX_CPUS")
+
+echo "[orch] salloc $NG GPU partition=$PART driver=$DRIVER runner=${RUNNER:-?}"
+# ${arr[@]+"${arr[@]}"} rather than "${arr[@]}": bash older than 4.4 treats an EMPTY array
+# as unbound under `set -u`, which would abort here whenever no optional flag is given.
+SALLOC_OUT="$(salloc --partition="$PART" ${ACCT[@]+"${ACCT[@]}"} ${EXTRA[@]+"${EXTRA[@]}"} \
+       --gres=gpu:"$NG" --exclusive \
+       --time="${CX_TIME:-60}" --no-shell --job-name="$JOBNAME" 2>&1)"
+printf '%s\n' "$SALLOC_OUT" | tail -2
+# Prefer the id salloc itself reports ("... job allocation <id>"): a lookup by job name can
+# return another job of this user sharing JOBNAME, which the EXIT trap would then cancel.
+# The name lookup is kept as a fallback.
+JID="$(printf '%s\n' "$SALLOC_OUT" | sed -n 's/.*job allocation \([0-9][0-9]*\).*/\1/p' | tail -n1)"
+[ -n "$JID" ] || JID="$(squeue --name="$JOBNAME" -u "${USER:-$(id -un)}" -h -o %A | head -n1)"
+[ -n "$JID" ] || { echo "[orch] FATAL no JOB_ID"; exit 1; }
+# Release the allocation on every exit path.
+trap 'scancel "$JID" 2>/dev/null || true' EXIT
+# Poll up to 60 times, 8 s apart, until the job reports RUNNING.
+st=""
+for i in $(seq 1 60); do
+  st="$(squeue -j "$JID" -h -o %T 2>/dev/null)"; echo "[orch] tick=$i state=$st node=$(squeue -j "$JID" -h -o %N 2>/dev/null)"
+  [ "$st" = RUNNING ] && break
+  [ -z "$st" ] && { echo "[orch] job vanished"; exit 1; }
+  sleep 8
+done
+[ "$st" = RUNNING ] || { echo "[orch] FATAL never started"; exit 1; }
+
+# Single quoted --export string so ladder values with spaces (DEC/PRE) survive as ONE value
+# each (srun splits the list on commas, not spaces).
+EXP="ALL,COLLECTIVEX_IMAGE=$IMAGE,NG=$NG,RUNNER=${RUNNER:?},TOPO=${TOPO:?},TRANSPORT=${TRANSPORT:-nvlink}"
+EXP+=",BACKEND=${BACKEND:-deepep},DEC=${DEC:-1 2 4 8 16 32 64 128},PRE=${PRE:-128 256 512}"
+EXP+=",ITERS=${ITERS:-200},TRIALS=${TRIALS:-3},DO_EPLB=${DO_EPLB:-1},PHASES=${PHASES:-decode prefill}"
+EXP+=",WARMUP=${WARMUP:-32},CX_RUN_TIMEOUT=${CX_RUN_TIMEOUT:-900},DO_LL=${DO_LL:-1}"
+[ -n "${MORI_COMMIT:-}" ] && EXP+=",MORI_COMMIT=$MORI_COMMIT"
+
+srun --jobid="$JID" --container-image="$IMAGE" --container-mounts="$STAGE:/cx" \
+  --no-container-mount-home --container-workdir=/cx --no-container-entrypoint \
+  --export="$EXP" bash "/cx/launchers/$DRIVER" </dev/null 2>&1
+rc=$?
+scancel "$JID" 2>/dev/null || true
+# Surface the driver's exit code in the log; the script's own exit status is unchanged.
+echo "[orch] driver rc=$rc"
+echo "=== ORCH DONE ==="
diff --git a/experimental/CollectiveX/launchers/_v3_mori.sh b/experimental/CollectiveX/launchers/_v3_mori.sh
new file mode 100644
index 000000000..f26d9045c
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_v3_mori.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# MoRI v3 re-run driver (run via srun on 8-GPU MI355X). v3 harness: trials + p99 +
+# routing-identity + layout-and-dispatch-v1 (MoRI's only contract). iters capped (MoRI
+# wedges >=~200 sustained at T>=32); defaults are 2 trials x 40 iters = 80 pooled samples
+# (3 x 50 over [1..128] overran the timeout). Override with TRIALS / ITERS.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-mi355x-8x}"; TOPO="${TOPO:-mi355x-xgmi}"
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+
+run(){  # phase ladder
+  local phase="$1" ladder="$2"
+  local out="results/${RUNNER}_mori_${phase}_bf16_normal_layout-and-dispatch-v1.json"
+  echo "### mori $phase ladder=[$ladder]"
+  # MoRI is slow (combine re-dispatches each iter) + ramps the whole ladder; trials=3 x
+  # iters=50 over [1..128] blew past 700s. 2 trials x 40 iters = 80 pooled samples, fits.
+  timeout -k 30 "${CX_RUN_TIMEOUT:-1100}" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend mori \
+    --phase "$phase" --dispatch-dtype bf16 --mode normal \
+    --measurement-contract layout-and-dispatch-v1 --routing uniform --resource-mode tuned \
+    --tokens-ladder "$ladder" --warmup 8 --iters "${ITERS:-40}" --trials "${TRIALS:-2}" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi --out "$out" 2>&1 | tail -8
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+python3 -c "import mori;print('mori OK')" 2>&1 | tail -1
+run decode  "1 2 4 8 16 32 64 128"
+run prefill "128 256 512"
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_mori_*layout-and-dispatch-v1.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); m=d.get("metrics",{}); ri=d.get("routing_identity",{})
+print(f"{sys.argv[1].split('/')[-1]:58s} {d['status']:7s} routing_ok={ri.get('consistent_across_ranks')} "
+      f"T{m.get('headline_tokens_per_rank')} disp_p50/p99={m.get('dispatch_us_p50',0):.1f}/{m.get('dispatch_us_p99',0):.1f}")
+PY
+done
+echo "=== V3 MORI DONE ==="
diff --git a/experimental/CollectiveX/launchers/_v3_rerun.sh b/experimental/CollectiveX/launchers/_v3_rerun.sh
new file mode 100644
index 000000000..c9fedc718
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_v3_rerun.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# v3 re-run driver (DeepEP): headline matrix with the v3 harness — trials, p50/p90/p99,
+# explicit contracts, routing-identity proof. Reusable across NVIDIA SKUs via env.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-x-8x}"; TOPO="${TOPO:-x}"; TRANSPORT="${TRANSPORT:-nvlink}"
+WARMUP="${WARMUP:-32}"; ITERS="${ITERS:-200}"; TRIALS="${TRIALS:-3}"
+DEC="${DEC:-1 2 4 8 16 32 64 128}"; PRE="${PRE:-128 256 512}"
+DO_LL="${DO_LL:-1}"   # B300-class fabrics that abort LL set DO_LL=0
+
+run(){  # phase dtype mode contract ladder
+  # One DeepEP torchrun for (phase, dtype, mode, contract) over the given ladder, writing
+  # results/<runner>_deepep_<phase>_<dtype>_<mode>_<contract>.json. Shows the last 6 output
+  # lines, then torchrun's (timeout-wrapped) exit code.
+  local phase="$1" dt="$2" mode="$3" contract="$4" ladder="$5"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${mode}_${contract}.json"
+  echo "### $phase dtype=$dt mode=$mode contract=$contract"
+  # Honour CX_RUN_TIMEOUT (exported by _singlenode_orchestrate.sh and already used by
+  # _routing_rerun.sh) instead of a fixed limit; 700 remains the default when it is unset.
+  timeout -k 30 "${CX_RUN_TIMEOUT:-700}" torchrun --nproc_per_node="$NG" tests/run_ep.py --backend deepep \
+    --phase "$phase" --dispatch-dtype "$dt" --mode "$mode" --measurement-contract "$contract" \
+    --routing uniform --resource-mode tuned --tokens-ladder "$ladder" \
+    --warmup "$WARMUP" --iters "$ITERS" --trials "$TRIALS" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport "$TRANSPORT" \
+    --out "$out" 2>&1 | tail -6
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+# Report the installed deep_ep version (last output line only).
+python3 -c "import deep_ep,importlib.metadata as m;print('deep_ep',m.version('deep_ep'))" 2>&1 | tail -1
+# decode normal: both dtypes x both contracts (layout cost made explicit)
+run decode  bf16 normal layout-and-dispatch-v1      "$DEC"
+run decode  fp8  normal layout-and-dispatch-v1      "$DEC"
+run decode  bf16 normal cached-layout-comm-only-v1  "$DEC"
+run decode  fp8  normal cached-layout-comm-only-v1  "$DEC"
+# decode LL (decode-only optimized path) where the fabric supports it
+if [[ "$DO_LL" == "1" ]]; then
+  run decode bf16 ll layout-and-dispatch-v1 "$DEC"
+  run decode fp8  ll layout-and-dispatch-v1 "$DEC"
+fi
+# prefill normal (cross-vendor contract = layout-and-dispatch-v1)
+run prefill bf16 normal layout-and-dispatch-v1 "$PRE"
+run prefill fp8  normal layout-and-dispatch-v1 "$PRE"
+
+printf '%s\n' "=== SUMMARY ==="
+for f in results/${RUNNER}_deepep_*.json; do
+  if [ ! -f "$f" ]; then continue; fi
+  python3 - "$f" <<'PY'
+import json
+import sys
+
+path = sys.argv[1]
+doc = json.load(open(path))
+metrics = doc.get("metrics", {})
+identity = doc.get("routing_identity", {})
+name = path.split("/")[-1]
+p50 = metrics.get("dispatch_us_p50", 0)
+p99 = metrics.get("dispatch_us_p99", 0)
+print(f"{name:62s} {doc['status']:7s} routing_ok={identity.get('consistent_across_ranks')} "
+      f"contract={doc['measurement_contract']:26s} T{metrics.get('headline_tokens_per_rank')} "
+      f"disp_p50/p99={p50:.1f}/{p99:.1f}")
+PY
+done
+printf '%s\n' "=== V3 RERUN DONE ==="
diff --git a/experimental/CollectiveX/launchers/_v3_smoke.sh b/experimental/CollectiveX/launchers/_v3_smoke.sh
new file mode 100644
index 000000000..fd2852fba
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_v3_smoke.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# v3 harness smoke (run via srun on 8 GPUs): validates the NEW code paths on real
+# hardware — pooled trials + p50/p90/p99, routing-identity cross-rank proof, BOTH
+# measurement contracts (incl. DeepEP cached-layout), separated logical bytes, schema 3.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"; RUNNER="${RUNNER:-h100-8x}"; TOPO="${TOPO:-h100-nvlink-island}"
+
+run() {  # $1=contract  $2=dtype
+  local contract="$1" dt="$2"
+  local out="results/_v3smoke_${dt}_${contract}.json"
+  echo "### contract=$contract dtype=$dt"
+  timeout -k 30 400 torchrun --nproc_per_node="$NG" tests/run_ep.py --backend deepep \
+    --mode normal --dispatch-dtype "$dt" --phase decode --routing uniform \
+    --resource-mode tuned --measurement-contract "$contract" \
+    --tokens-ladder "1 4 16 64" --warmup 16 --iters 60 --trials 2 \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \
+    --out "$out" 2>&1 | tail -8
+  echo "### rc=${PIPESTATUS[0]}"
+  python3 - "$out" <<'PY'
+import json,sys
+try:
+    d=json.load(open(sys.argv[1])); r=next(x for x in d["rows"] if x["tokens_per_rank"]==64)
+    ri=d["routing_identity"]; rp=d["reproduction"]
+    print(f"   schema={d['schema_version']} contract={d['measurement_contract']} status={d['status']}")
+    print(f"   routing_consistent={ri['consistent_across_ranks']} trace_sig={ri['trace_signature']}")
+    print(f"   T64 disp p50/p90/p99={r['dispatch_us_p50']:.1f}/{r['dispatch_us_p90']:.1f}/{r['dispatch_us_p99']:.1f} "
+          f"samples={r['samples_pooled']} trials={r['trials']}")
+    print(f"   dispatch_logical_bytes={r['dispatch_logical_bytes']} combine_logical_bytes={r['combine_logical_bytes']} "
+          f"byte_contract={r['byte_contract']}")
+    print(f"   idx_hash={r['routing_hash']} samples_per_point={rp['samples_per_point']}")
+except Exception as e:
+    print("   PARSE FAIL", repr(e))
+PY
+}
+
+python3 -c "import deep_ep,importlib.metadata as m;print('deep_ep',m.version('deep_ep'))" 2>&1 | tail -1
+run layout-and-dispatch-v1 bf16
+run cached-layout-comm-only-v1 bf16
+run layout-and-dispatch-v1 fp8
+echo "=== V3 SMOKE DONE ==="
diff --git a/experimental/CollectiveX/launchers/_v4_all.sh b/experimental/CollectiveX/launchers/_v4_all.sh
new file mode 100644
index 000000000..f2934794d
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_v4_all.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# v4 full re-run for one (single-node) SKU under one allocation: the headline matrix
+# (_v3_rerun.sh: bf16/fp8 x normal{layout,cached}/LL, decode+prefill) followed by the routing
+# sweep (_routing_rerun.sh: balanced/zipf/zipf+eplb). Both invoke the CURRENT v4 harness, so
+# every JSON carries publication_status/validity/measured-roundtrip — overwriting the legacy v3
+# files of the same name. Env (RUNNER/TOPO/TRANSPORT/DEC/PRE/DO_LL/DO_EPLB/ITERS/TRIALS/WARMUP)
+# is provided by _singlenode_orchestrate.sh.
+set -uo pipefail
+echo "=== V4 HEADLINE (_v3_rerun.sh) ==="
+bash /cx/launchers/_v3_rerun.sh || echo "WARN headline returned nonzero"
+echo "=== V4 ROUTING (_routing_rerun.sh) ==="
+bash /cx/launchers/_routing_rerun.sh || echo "WARN routing returned nonzero"
+echo "=== V4 ALL DONE ==="
diff --git a/experimental/CollectiveX/launchers/_validate_deepep.sh b/experimental/CollectiveX/launchers/_validate_deepep.sh
new file mode 100644
index 000000000..4743e1850
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_validate_deepep.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# In-container DeepEP validation driver (run via srun on an 8-GPU node).
+# Exercises the reference (bf16) + optimized (fp8) NORMAL-mode paths on decode and
+# prefill ladders with reduced iters for a fast correctness/artifact gate. Each
+# torchrun writes one provenance-tagged JSON; a per-file summary (status, mode, dtype,
+# max relative error, dispatch p50) is printed at the end.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"
+RUNNER="${RUNNER:-h100-8x}"
+TOPO="${TOPO:-h100-nvlink-island}"
+WARMUP="${WARMUP:-32}"   # B300/Blackwell needs ~30 to reach steady-state clocks
+ITERS="${ITERS:-50}"
+DEC_LADDER="${DEC_LADDER:-1 2 4 8 16 32 64 128}"
+PRE_LADDER="${PRE_LADDER:-128 256 512}"
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-lmsysorg/sglang:v0.5.11-cu130}"
+
+echo "=== nvidia-smi ==="; nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | head -1
+echo "=== deep_ep ==="; python3 -c "import deep_ep,importlib.metadata as m;print('deep_ep',m.version('deep_ep'))" 2>&1 | tail -1
+
+run() {  # $1=phase $2=dtype $3=ladder $4=resource_mode
+  local phase="$1" dt="$2" ladder="$3" rm="$4"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${rm}.json"
+  echo "### RUN phase=$phase dtype=$dt resource=$rm ladder=[$ladder]"
+  timeout -k 30 600 torchrun --nproc_per_node="$NG" tests/run_ep.py \
+    --backend deepep --mode normal --dispatch-dtype "$dt" --phase "$phase" \
+    --routing uniform --resource-mode "$rm" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \
+    --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" \
+    --out "$out" 2>&1 | tail -25
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+run_mode() {  # $1=phase $2=dtype $3=ladder $4=resource_mode $5=mode
+  local phase="$1" dt="$2" ladder="$3" rm="$4" mode="$5"
+  local out="results/${RUNNER}_deepep_${phase}_${dt}_${rm}_${mode}.json"
+  echo "### RUN phase=$phase dtype=$dt resource=$rm mode=$mode ladder=[$ladder]"
+  timeout -k 30 600 torchrun --nproc_per_node="$NG" tests/run_ep.py \
+    --backend deepep --mode "$mode" --dispatch-dtype "$dt" --phase "$phase" \
+    --routing uniform --resource-mode "$rm" \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport nvlink \
+    --tokens-ladder "$ladder" --warmup "$WARMUP" --iters "$ITERS" \
+    --out "$out" 2>&1 | tail -25
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+if [ "${DO_NORMAL:-1}" = "1" ]; then
+  run decode  bf16 "$DEC_LADDER" tuned
+  run decode  fp8  "$DEC_LADDER" tuned
+  run prefill bf16 "$PRE_LADDER" tuned
+  run prefill fp8  "$PRE_LADDER" tuned
+fi
+# Optimized decode path = low-latency (LL). bf16 + fp8 (fp8 cast is in-kernel/timed).
+# Full decode ladder incl. T=128 settles whether num_tokens < or <= num_max.
+if [ "${DO_LL:-1}" = "1" ]; then
+  run_mode decode bf16 "$DEC_LADDER" tuned ll
+  run_mode decode fp8  "$DEC_LADDER" tuned ll
+fi
+# A normalized-regime sample (both resource regimes are required by the goal).
+if [ "${DO_NORM:-1}" = "1" ]; then
+  run_mode decode fp8 "$DEC_LADDER" normalized normal
+fi
+
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_deepep_*.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1]))
+m=d.get("metrics",{}); r=d.get("reproduction",{})
+print(f"{sys.argv[1].split('/')[-1]:52s} status={d['status']:7s} mode={d['mode']:6s} "
+      f"dtype={d['shape']['dispatch_dtype']:4s} fp8_in_timing={str(r.get('fp8_quant_in_timing')):5s} "
+      f"tol={d['correctness']['tolerance']} maxrelerr={d['correctness']['max_rel_error']:.4f} "
+      f"hT={m.get('headline_tokens_per_rank')} disp={m.get('dispatch_us_p50'):.1f}")
+PY
+done
+echo "=== DONE ==="
diff --git a/experimental/CollectiveX/launchers/_validate_mori.sh b/experimental/CollectiveX/launchers/_validate_mori.sh
new file mode 100644
index 000000000..347dc728c
--- /dev/null
+++ b/experimental/CollectiveX/launchers/_validate_mori.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# In-container MoRI validation driver (run via srun on an 8-GPU MI355X node).
+# Re-validates the reference (bf16/normal) decode+prefill with the current harness,
+# then runs the fp8 capability probe (decides whether MoRI gets fp8 caps). LL is not
+# probed (MoRI has no low-latency entrypoint). Each torchrun writes one JSON.
+set -uo pipefail
+cd /cx || exit 2
+mkdir -p results
+NG="${NG:-8}"
+RUNNER="${RUNNER:-mi355x-8x}"
+TOPO="${TOPO:-mi355x-xgmi}"
+export COLLECTIVEX_IMAGE="${COLLECTIVEX_IMAGE:-rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2}"
+
+echo "=== device ==="; rocm-smi --showproductname 2>/dev/null | head -3 || true
+python3 -c "import mori; print('mori import OK')" 2>&1 | tail -2
+
+run() {  # $1=phase $2=ladder
+  local phase="$1" ladder="$2"
+  local out="results/${RUNNER}_mori_${phase}_bf16_tuned_normal.json"
+  echo "### RUN mori phase=$phase ladder=[$ladder]"
+  timeout -k 30 700 torchrun --nproc_per_node="$NG" tests/run_ep.py \
+    --backend mori --mode normal --dispatch-dtype bf16 --phase "$phase" \
+    --routing uniform --resource-mode tuned \
+    --runner "$RUNNER" --topology-class "$TOPO" --transport xgmi \
+    --tokens-ladder "$ladder" --warmup 8 --iters 40 --out "$out" 2>&1 | tail -25
+  echo "### rc=${PIPESTATUS[0]} -> $out"
+}
+
+run decode  "1 2 4 8 16 32 64 128"
+run prefill "128 256 512"
+
+echo "### MoRI fp8 capability probe"
+timeout -k 20 300 torchrun --nproc_per_node="$NG" tests/probe_mori_caps.py 2>&1 | tail -35
+
+echo "=== SUMMARY ==="
+for f in results/${RUNNER}_mori_*.json; do
+  [ -f "$f" ] || continue
+  python3 - "$f" <<'PY'
+import json,sys
+d=json.load(open(sys.argv[1])); m=d.get("metrics",{})
+print(f"{sys.argv[1].split('/')[-1]:46s} status={d['status']:7s} mode={d['mode']:6s} "
+      f"dtype={d['shape']['dispatch_dtype']:4s} maxrelerr={d['correctness']['max_rel_error']:.4f} "
+      f"hT={m.get('headline_tokens_per_rank')} disp={m.get('dispatch_us_p50'):.1f} "
+      f"blocks={d['backend_provenance'].get('block_num')}")
+PY
+done
+echo "=== DONE ==="
diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
new file mode 100644
index 000000000..e560fc987
--- /dev/null
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -0,0 +1,168 @@
+# shellcheck shell=bash
+# CollectiveX — shared launcher helpers (sourced, not executed).
+#
+# Cluster-generic scaffolding only (Slurm/container/build/staging); no
+# model-serving. Logging goes to stderr so functions can `echo` a single
+# result on stdout.
+
+cx_log() { printf '[collectivex] %s\n' "$*" >&2; }
+cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; }
+
+# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
+# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
+# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
+# IMPORT BY TAG, not by digest: enroot's anonymous Docker Hub token scope is built
+# from the tag; a bare `repo@sha256:` ref makes enroot prompt for a password and
+# HANG in non-interactive CI (and a combined `tag@sha256` ref 400s). The expected
+# multi-arch index digest is recorded for provenance/verification:
+# (Recorded only: cx_ensure_squash below imports by tag and does not compare against it.)
+CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
+# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
+# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
+# v0.5.11-cu130 imports cleanly and is pre-staged on GB200.)
+# DeepEP is NOT bundled here -> run_in_container.sh builds it via rebuild-deepep.
+# (The arch-specific deepseek-v4-{blackwell,grace-blackwell} images DO bundle
+# DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.)
+# Returned by cx_default_image for the b200/gb200/b300/gb300/h100/h200 runner prefixes.
+CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"
+
+# AMD (ROCm/CDNA): the multi-arch NVIDIA image above is x86_64+aarch64 CUDA and
+# cannot run on MI355X. AMD uses a separate ROCm image that bundles MoRI (the
+# AMD EP library). Single-arch (linux/amd64 host, ROCm runtime); not digest-
+# pinned yet — pin once validated on the runner. See CONTAINERS.md.
+# Returned by cx_default_image for the mi355x/mi350x/mi325x/mi300x runner prefixes.
+CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+
+cx_default_image() {
+  case "$1" in
+    mi355x*|mi350x*|mi325x*|mi300x*) echo "$CX_IMAGE_AMD_MORI" ;;
+    b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;;
+    *) cx_die "no default image for runner prefix: $1" ;;
+  esac
+}
+
+# cx_ensure_squash <squash_dir> <image>  ->  echoes the squash file path.
+# Imports via enroot only if a valid squash is not already present (flock-guarded,
+# mirroring runners/launch_b200-dgxc.sh). Exits 1 — without echoing a path — when the
+# lock or the import fails and no valid squash exists.
+cx_ensure_squash() {
+  local squash_dir="$1" image="$2"
+  mkdir -p "$squash_dir" 2>/dev/null || true
+  local key sq locks
+  # File-name key: / : @ # each become _. Done with parameter expansion; the previous
+  # sed expression used # both as the s-command delimiter and inside the bracket
+  # expression, which is not a portable way to write the pattern.
+  key="${image//[\/:@#]/_}"
+  sq="$squash_dir/${key}.sqsh"
+  locks="$squash_dir/.locks"; mkdir -p "$locks" 2>/dev/null || true
+  (
+    flock -w 900 9 || cx_die "lock timeout for $sq"
+    if unsquashfs -l "$sq" >/dev/null 2>&1; then
+      cx_log "squash present: $sq"
+    else
+      cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)"
+      rm -f "$sq"
+      # </dev/null: never block on enroot's interactive password prompt (a missing
+      # anonymous token must fail fast, not hang the CI job).
+      enroot import -o "$sq" "docker://$image" </dev/null >&2 \
+        || cx_die "enroot import failed for $image (anonymous auth needs a TAG ref, not a bare digest; or pre-stage the squash)"
+      unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq"
+    fi
+  ) 9>"$locks/${key}.lock" || {
+    # cx_die above only leaves the ( ... ) subshell, so its failure has to be acted on
+    # here; otherwise a path to a missing squash would be echoed with status 0. A squash
+    # that is nevertheless valid (e.g. pre-staged in a directory where the lock file
+    # cannot be created) is still accepted.
+    unsquashfs -l "$sq" >/dev/null 2>&1 || exit 1
+  }
+  echo "$sq"
+}
+
+# cx_stage_repo <repo_root> <stage_dir>  ->  echoes the mount-source root.
+# Some clusters (e.g. GB200/watchtower) do not cross-mount the runner workspace
+# to compute nodes. If CX_STAGE_DIR is set, rsync the CollectiveX tree onto that
+# compute-visible shared FS and mount from there. No-op (echo repo_root) when
+# stage_dir is empty or equals repo_root.
+cx_stage_repo() {
+  local src_root="$1" dest="${2:-}"
+  # No staging requested (or staging onto itself): mount straight from the checkout.
+  if [[ -z "$dest" || "$dest" == "$src_root" ]]; then
+    echo "$src_root"
+    return 0
+  fi
+  if ! mkdir -p "$dest/experimental"; then
+    cx_die "cannot create stage dir $dest"
+  fi
+  cx_log "staging experimental/CollectiveX -> $dest (compute-visible)"
+  # Mirror the tree (deleting extras at the destination) minus build and plot directories;
+  # rsync's own output goes to stderr so stdout carries only the echoed path.
+  local -a skip=(--exclude='.nccl-tests/' --exclude='__pycache__/' --exclude='results/plots/')
+  if ! rsync -a --delete "${skip[@]}" \
+       "$src_root/experimental/CollectiveX" "$dest/experimental/" >&2; then
+    cx_die "rsync to stage dir failed"
+  fi
+  echo "$dest"
+}
+
+# cx_collect_results <mount_src> <repo_root>
+# When the run used a staged (compute-visible) mount, copy result JSONs back to
+# the original checkout's results/ so the workflow's upload-artifact (which reads
+# the checkout, not the stage dir) finds them. No-op when no staging was used.
+cx_collect_results() {
+  local mount_src="$1" repo_root="$2" dst
+  [ "$mount_src" = "$repo_root" ] && return 0
+  dst="$repo_root/experimental/CollectiveX/results"
+  mkdir -p "$dst"
+  cp "$mount_src/experimental/CollectiveX/results/"*.json "$dst/" 2>/dev/null || true
+  cx_log "copied results from stage dir -> $dst (for artifact upload)"
+}
+
+# cx_build_nccl_tests <parent_dir> <mpi 0|1>  ->  echoes the build/ dir.
+# Runs IN-CONTAINER (login nodes have no nvcc). Cached: skips if already built.
+# CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang
+# cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed.
+# All progress/build output goes to stderr; stdout carries only the build path.
+# Dies (cx_die) if the clone or build fails or no all_reduce_perf binary results.
+cx_build_nccl_tests() {
+  local parent="$1" mpi="${2:-0}" dir bin sfx=""
+  # Cache MPI=0 and MPI=1 builds in SEPARATE dirs. A single-node (MPI=0) binary
+  # reused under `srun --mpi=pmix` runs as N standalone world=1 procs (busbw=0);
+  # keying the cache by flavor prevents that cross-contamination.
+  [ "$mpi" = "1" ] && sfx="-mpi"
+  dir="$parent/nccl-tests$sfx"
+  # all_reduce_perf doubles as the "already built" marker for the whole suite.
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ]; then
+    cx_log "nccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  # Clone only when no checkout exists yet; an existing one is reused as-is.
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning nccl-tests -> $dir"
+    git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \
+      || cx_die "git clone nccl-tests failed"
+  fi
+  # MPI=1 needs MPI_HOME. On Debian/Ubuntu OpenMPI the headers live under
+  # /usr/lib/<arch>/openmpi/include (NOT /usr/include), so MPI_HOME=/usr fails;
+  # point it at that openmpi dir (libmpi resolves via the default linker path).
+  # Works for both x86_64 (B200) and aarch64 (GB200). Override with CX_MPI_HOME.
+  local mpi_home="${CX_MPI_HOME:-}"
+  if [ "$mpi" = "1" ] && [ -z "$mpi_home" ]; then
+    mpi_home="$(ls -d /usr/lib/*/openmpi 2>/dev/null | head -n1)"
+  fi
+  cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr}${mpi_home:+, MPI_HOME=$mpi_home})"
+  # ${mpi_home:+...} passes MPI_HOME to make only when a value was found or given.
+  make -C "$dir" -j MPI="$mpi" \
+       CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \
+       NCCL_HOME="${CX_NCCL_HOME:-/usr}" \
+       ${mpi_home:+MPI_HOME="$mpi_home"} >&2 \
+    || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME/CX_MPI_HOME; need nccl.h + libnccl)"
+  [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin"
+  echo "$dir/build"
+}
+
+# cx_build_rccl_tests <parent_dir> <mpi 0|1>  ->  echoes the build/ dir.
+# AMD/ROCm counterpart of cx_build_nccl_tests: ROCm/rccl-tests is a fork of
+# nccl-tests producing the SAME binary names (<op>_perf) and output format, so
+# run_nccl.py parses it unchanged. `make` defaults to ROCm at /opt/rocm
+# (amdclang++ + librccl); validated building in-container on MI355X. Override
+# CX_ROCM_HOME / CX_RCCL_HOME / CX_MPI_HOME if the toolchain lives elsewhere.
+# Progress/build output goes to stderr; stdout carries only the build path.
+# Dies (cx_die) if the clone or build fails or no all_reduce_perf binary results.
+cx_build_rccl_tests() {
+  local parent="$1" mpi="${2:-0}" dir bin sfx=""
+  # Keep MPI=0 and MPI=1 builds in separate cache dirs, as cx_build_nccl_tests does:
+  # with one shared dir the "already built" check would hand back whichever flavor was
+  # built first. The MPI=0 location is unchanged.
+  [ "$mpi" = "1" ] && sfx="-mpi"
+  dir="$parent/rccl-tests$sfx"
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ]; then
+    cx_log "rccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning rccl-tests -> $dir"
+    git clone --depth 1 https://github.com/ROCm/rccl-tests.git "$dir" >&2 \
+      || cx_die "git clone rccl-tests failed"
+  fi
+  cx_log "building rccl-tests (MPI=$mpi, ROCm ${CX_ROCM_HOME:-/opt/rocm})"
+  # Each ${VAR:+...} passes its make variable only when the override is set.
+  make -C "$dir" -j MPI="$mpi" \
+       ${CX_ROCM_HOME:+HIP_HOME="$CX_ROCM_HOME"} \
+       ${CX_RCCL_HOME:+RCCL_HOME="$CX_RCCL_HOME"} \
+       ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \
+    || cx_die "rccl-tests build failed (need ROCm + librccl; try CX_ROCM_HOME)"
+  [ -x "$bin" ] || cx_die "rccl-tests build produced no binary at $bin"
+  echo "$dir/build"
+}
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
new file mode 100644
index 000000000..b7a03b2c1
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# CollectiveX — 2-node B200 SKU adapter (cross CX-7 InfiniBand spine), x86_64.
+#
+# The other half of the headline: the same primitives as single-node B200, but
+# spanning two nodes so the transport is InfiniBand rather than NVLink. Contrast
+# with GB200, where the 2-node-equivalent stays on NVL72 NVLink (MNNVL).
+#
+# Multi-node orchestration differs from single-node, so this adapter does NOT
+# use run_in_container.sh: it builds nccl-tests (MPI=1), runs each op across all
+# ranks (raw capture), then parses on the login node. Currently CX_BENCH=nccl
+# only (multi-node DeepEP/MNNVL is the srt-slurm follow-up).
+#
+# SPIKE CAVEATS: needs `srun --mpi=pmix` wired for pyxis and a compute-visible
+# checkout — set CX_STAGE_DIR to a shared FS (e.g. /home/sa-shared/cx-stage) if
+# the runner workspace is not cross-mounted to compute.
+# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NODES(2) CX_GPUS_PER_NODE(8) CX_TIME(30) CX_SRUN_MPI(pmix) CX_DRYRUN(0)
+# Run: bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+CX_BENCH="${CX_BENCH:-nccl}"
+[ "$CX_BENCH" = "nccl" ] || cx_die "launch_b200-dgxc-slurm.sh supports CX_BENCH=nccl only (got '$CX_BENCH'); multi-node DeepEP is a follow-up"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc-slurm}"   # doubles as the Slurm job name and the result-file prefix
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-8}"
+NODES="${CX_NODES:-2}"
+TIME_MIN="${CX_TIME:-30}"                       # handed to salloc --time as-is
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix                                   # where the checkout appears inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"             # UTC stamp with '-' instead of ':' (embedded in filenames)
+TOPO="b200-nvlink-island+cx7-ib"
+WORLD=$((NODES * GPUS_PER_NODE))                # total ranks; each srun task drives one GPU (-g 1)
+MPI_FLAG="${CX_SRUN_MPI:-pmix}"                 # value for srun --mpi
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance (propagated via --export=ALL).
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+# CollectiveX op name -> nccl-tests binary that implements it.
+declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
+                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
+
+cx_log "runner=$RUNNER_NAME nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+# --no-shell: salloc only allocates; the job ID is then looked up by job name (first match).
+salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+       --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \
+       --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+              --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
+              --no-container-entrypoint)
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"   # written in step 1, read in step 3
+
+# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node).
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source launchers/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+# 2) Per op: run across all ranks (one GPU per task); stdout -> $raw, stderr -> $raw.stderr.
+for op in $OPS; do
+  [ -n "${BIN[$op]:-}" ] || { cx_log "WARN: unknown op '$op' (no nccl-tests binary mapped) — skipped"; continue; }
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG) -> $raw"
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+       --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+       --export=ALL,NCCL_CUMEM_ENABLE=1 \
+       "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-8G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+       > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+  # 3) Parse on the login node (pure stdlib python; no container needed). A failed op is logged, not fatal.
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
+    --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$TOPO" --transport ib \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"   # same post-run hand-off the GB200 multi-node path makes (helper body not in view)
+cx_log "done — JSON artifacts under $CX_DIR/results/"
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
new file mode 100644
index 000000000..42d860975
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# CollectiveX — B200 single-node SKU adapter (8x B200, NVLink island, x86_64).
+#
+# Thin adapter: handles B200-specific allocation/container, then hands off to
+# launchers/run_in_container.sh which runs whichever benchmark CX_BENCH selects
+# (nccl | deepep | all). Mirrors runners/launch_b200-dgxc.sh (salloc + enroot
+# squash + srun --container) with all model-serving stripped.
+#
+# Run from inside the InferenceX checkout on the B200 login node:
+#     bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh           # nccl (default)
+#     CX_BENCH=deepep bash .../launch_b200-dgxc.sh                          # DeepEP (rebuild)
+#
+# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(30)
+#   CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES
+#   CX_NCCL_HOME(/usr) CX_IMAGE_DIGEST CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc}"   # doubles as the Slurm job name
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-30}"                 # handed to salloc --time as-is
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix                             # where the checkout appears inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"       # UTC stamp with '-' instead of ':'
+# Everything exported below reaches run_in_container.sh through `srun --export=ALL`.
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance.
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+# --no-shell: salloc only allocates; the job ID is then looked up by job name (first match).
+salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \
+       --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+# Run the generic dispatcher inside the container; it picks the benchmark from CX_BENCH.
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_b300-nv.sh b/experimental/CollectiveX/launchers/launch_b300-nv.sh
new file mode 100644
index 000000000..7f485480a
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b300-nv.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# CollectiveX — B300 (b300-nv GH runner) adapter. The self-hosted runner is named
+# `b300-nv_NN`, so runner.name's prefix resolves to this file via
+# launch_${RUNNER_NAME%%_*}.sh. Identical B300 settings to launch_b300.sh (the
+# canonical/manual entry point) — delegate so there is a single source of truth.
+set -euo pipefail
+exec bash "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/launch_b300.sh" "$@"   # exec: this shell is replaced; args and environment pass through
diff --git a/experimental/CollectiveX/launchers/launch_b300.sh b/experimental/CollectiveX/launchers/launch_b300.sh
new file mode 100644
index 000000000..6085165d9
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b300.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# CollectiveX — B300 single-node SKU adapter (8x B300 SXM6, NVLink island, x86_64, SM100).
+#
+# Thin adapter: B300-specific allocation/container, then hands off to
+# launchers/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors
+# launch_h200.sh; B300 differs in: partition `batch_1` with a REQUIRED account
+# (`benchmark`), and the compute-visible share is /data (10.3.26.100:/data) — NOT
+# /home and NOT the node-local /scratch, both invisible to compute nodes here. Both
+# the squash AND the staged repo MUST live on /data or pyxis fails "No such file".
+#
+# Run from inside the InferenceX checkout on the B300 login node:
+#     bash experimental/CollectiveX/launchers/launch_b300.sh            # nccl (default)
+#     CX_BENCH=deepep CX_PHASE=both bash .../launch_b300.sh             # DeepEP, decode+prefill
+#
+# Env knobs: CX_PARTITION(batch_1) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(45)
+#   CX_EXCLUDE_NODES(b300-018) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b300}"   # doubles as the Slurm job name
+PARTITION="${CX_PARTITION:-batch_1}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"   # B300 scheduler REQUIRES a valid account/partition combo
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-b300-018}"  # known-bad node (per the serving launcher); an empty override also falls back to it
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"            # handed to salloc --time as-is
+IMAGE="${CX_IMAGE:-$(cx_default_image b300)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/data/sa-shared/containers}"
+export CX_STAGE_DIR="${CX_STAGE_DIR:-/data/sa-shared/cx_stage}"   # staging is always on here: defaults to the /data share
+MOUNT_DIR=/ix                        # where the checkout appears inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"  # UTC stamp with '-' instead of ':'
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b300-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \
+       --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"   # --no-shell allocations are found again by job name
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
new file mode 100644
index 000000000..4863b9c10
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray.
+#
+# Two paths, selected by CX_NODES:
+#   * CX_NODES=1 (default): single tray, 4 GPU, intra-tray MNNVL. Hands off to
+#     run_in_container.sh (CX_BENCH = nccl | deepep | all), -g 4.
+#   * CX_NODES>1: multi-node over the NVL72 NVLink fabric (MNNVL), e.g. CX_NODES=2
+#     = 8 GPU. nccl only — builds nccl-tests (MPI=1), runs each op across all ranks
+#     via `srun --mpi=pmix` (1 GPU/rank), parses on the login node. Same shape that
+#     runs single-node B200 (NVLink island) and multi-node B200 (CX-7 IB) — here it
+#     stays entirely on NVL72 NVLink. Validated 8-GPU (2 trays) on-node.
+#
+# Run from inside the InferenceX checkout on the GB200 login node:
+#     bash experimental/CollectiveX/launchers/launch_gb200-nv.sh             # 4 GPU, nccl
+#     CX_NODES=2 bash .../launch_gb200-nv.sh                                  # 8 GPU MNNVL
+#     CX_BENCH=deepep bash .../launch_gb200-nv.sh                             # 4 GPU, DeepEP
+#
+# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NODES(1)
+#   CX_GPUS_PER_NODE(4) CX_TIME(30) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH
+#   CX_OPS CX_MIN_BYTES(8) CX_MAX_BYTES(2G on the multi-node path) CX_SRUN_MPI(pmix) CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-gb200-nv}"          # doubles as the Slurm job name
+PARTITION="${CX_PARTITION:-batch}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-4}"          # NVL72 compute tray = 4 GPU/node
+NODES="${CX_NODES:-1}"                          # 1 = single-tray path, >1 = multi-node path
+TIME_MIN="${CX_TIME:-30}"                       # handed to salloc --time as-is
+IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}"
+MOUNT_DIR=/ix                                   # where the checkout appears inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"             # UTC stamp with '-' instead of ':'
+WORLD=$((NODES * GPUS_PER_NODE))                # total GPUs; rank count on the multi-node path
+
+export CX_RUNNER="$RUNNER_NAME" CX_TS="$TS"
+export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+# Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded.
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD bench=$CX_BENCH (aarch64)"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# ----------------------------------------------------------------------------
+if [ "$NODES" -le 1 ]; then
+  # Single tray (4 GPU by default): hand off to the generic dispatcher, one process driving all GPUs (-g N).
+  export CX_NGPUS="$GPUS_PER_NODE"   # run_in_container.sh requires CX_NGPUS
+  salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPUS_PER_NODE" \
+         --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+  JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"   # --no-shell allocations are found again by job name
+  [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+  cx_log "JOB_ID=$JOB_ID"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+  srun --jobid="$JOB_ID" \
+    --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+    --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+    --no-container-entrypoint --export=ALL \
+    bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
+  exit 0   # single-tray run is complete; the multi-node code below must not run
+fi
+
+# ----------------------------------------------------------------------------
+# Multi-node MNNVL (nccl only): mirrors launch_b200-dgxc-slurm but stays on the
+# NVL72 NVLink fabric. Build nccl-tests MPI=1, run each op across WORLD ranks
+# (1 GPU/rank) via srun --mpi=pmix, parse on the login node.
+[ "$CX_BENCH" = "nccl" ] || cx_die "GB200 multi-node supports CX_BENCH=nccl only (got '$CX_BENCH')"
+MPI_FLAG="${CX_SRUN_MPI:-pmix}"
+declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
+                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+       --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \
+       --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"   # --no-shell allocations are found again by job name
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+              --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
+              --no-container-entrypoint)
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"   # written in step 1, read in step 3
+
+# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node).
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \
+     --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" </dev/null \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source launchers/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+# 2) Per op: run across all ranks (1 GPU/rank); stdout -> $raw, stderr -> $raw.stderr.
+for op in $OPS; do
+  [ -n "${BIN[$op]:-}" ] || { cx_log "WARN: unknown op '$op' (no nccl-tests binary mapped) — skipped"; continue; }
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG, MNNVL) -> $raw"
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+       --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+       --export=ALL,NCCL_CUMEM_ENABLE=1,NCCL_MNNVL_ENABLE=1,MC_FORCE_MNNVL=1 </dev/null \
+       "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-2G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+       > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+  # 3) Parse on the login node (pure stdlib; no container needed). A failed op is logged, not fatal.
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
+    --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $CX_DIR/results/"
diff --git a/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
new file mode 100644
index 000000000..590ea112d
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# CollectiveX — H100 (DGX Cloud Slurm) single-node SKU adapter (8x H100, NVLink
+# island, x86_64, SM90). Matches the GH self-hosted runner name `h100-dgxc-slurm_NN`
+# (runner.name prefix -> this script via launch_${RUNNER_NAME%%_*}.sh).
+#
+# Thin adapter with the same shape as launch_b200-dgxc.sh but its own cluster
+# defaults (partition hpc-gpu-1, account customer, squash on /mnt/nfs — see below);
+# allocates, then hands off to run_in_container.sh (CX_BENCH = nccl | deepep | all).
+# The DeepEP path runs the full FP8 + low-latency matrix (validated on 8x H100).
+#
+# !!! Treat the first on-runner run as validation (NOTE(review): whether this cluster
+# was reachable over SSH at authoring is stated inconsistently in this file — confirm).
+# If pyxis fails "No such file", set CX_SQUASH_DIR + CX_STAGE_DIR to a compute-visible FS (/mnt/nfs).
+#
+# Env knobs: CX_PARTITION(hpc-gpu-1) CX_ACCOUNT(customer) CX_NGPUS(8) CX_TIME(45)
+#   CX_EXCLUDE_NODES(hpc-gpu-1-7) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+# Cluster identity from runners/launch_h100-dgxc-slurm.sh (the serving launcher):
+# partition hpc-gpu-1, account customer, known-bad node hpc-gpu-1-7 excluded. This
+# is the SAME cluster validated over SSH. CRITICAL: /home is login-local (not
+# compute-visible) — the squash MUST live on /mnt/nfs; the GH runner workspace is
+# already on /mnt/nfs (compute-visible) so the checkout mounts directly (no staging).
+RUNNER_NAME="${RUNNER_NAME:-h100-dgxc-slurm}"   # doubles as the Slurm job name
+PARTITION="${CX_PARTITION:-hpc-gpu-1}"
+ACCOUNT="${CX_ACCOUNT:-customer}"
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-hpc-gpu-1-7}"   # an empty override also falls back to this default
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"                       # handed to salloc --time as-is
+IMAGE="${CX_IMAGE:-$(cx_default_image h100)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/nfs/sa-shared/containers}"
+MOUNT_DIR=/ix                                   # where the checkout appears inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"             # UTC stamp with '-' instead of ':'
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="h100-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \
+       --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"   # --no-shell allocations are found again by job name
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_h200.sh b/experimental/CollectiveX/launchers/launch_h200.sh
new file mode 100644
index 000000000..82bdaccdd
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h200.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# CollectiveX — H200 single-node SKU adapter (8x H200, NVLink island, x86_64, SM90).
+#
+# Thin adapter: H200-specific allocation/container, then hands off to
+# launchers/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors
+# launch_b200-dgxc.sh; H200 differs in: partition `hpc-gpu-1` (20x 8-GPU nodes),
+# NO account (open scheduler), /home is login-local so the squash and the staged repo
+# default to /mnt/nfs, and the sglang image is imported on first use (not pre-staged).
+#
+# Run from inside the InferenceX checkout on the H200 login node:
+#     bash experimental/CollectiveX/launchers/launch_h200.sh             # nccl (default)
+#     CX_BENCH=deepep CX_PHASE=both bash .../launch_h200.sh              # DeepEP, decode+prefill
+#
+# Env knobs: CX_PARTITION(hpc-gpu-1) CX_ACCOUNT() CX_NGPUS(8) CX_TIME(45) CX_IMAGE
+#   CX_SQUASH_DIR CX_STAGE_DIR(/mnt/nfs/sa-shared/cx_stage) CX_BENCH CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-h200}"   # doubles as the Slurm job name
+PARTITION="${CX_PARTITION:-hpc-gpu-1}"
+ACCOUNT="${CX_ACCOUNT:-}"            # H200 scheduler is open; no account needed
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"            # generous: first-use enroot import of the image
+IMAGE="${CX_IMAGE:-$(cx_default_image h200)}"
+# CRITICAL: on this cluster /home is LOGIN-LOCAL (/dev/sdc) — invisible to compute
+# nodes. The compute-visible share is /mnt/nfs (10.0.0.130:/nfs). Both the squash
+# AND the staged repo MUST live there or pyxis fails "No such file or directory".
+SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/nfs/sa-shared/containers}"
+export CX_STAGE_DIR="${CX_STAGE_DIR:-/mnt/nfs/sa-shared/cx_stage}"
+MOUNT_DIR=/ix                        # where the checkout appears inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"  # UTC stamp with '-' instead of ':'
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="h200-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ${ACCOUNT:+account=$ACCOUNT }ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+# ${ACCOUNT:+...} expands to nothing when ACCOUNT is empty, so --account is simply omitted.
+salloc --partition="$PARTITION" ${ACCOUNT:+--account="$ACCOUNT"} --gres=gpu:"$NGPUS" \
+       --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"   # --no-shell allocations are found again by job name
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
new file mode 100644
index 000000000..3a7ceccb3
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# CollectiveX — MI355X (AMD CDNA4, 8 GPU/node) SKU adapter: MoRI dispatch/combine.
+#
+# AMD counterpart to the NVIDIA adapters. Differs from them in ways taken from
+# the real runners/launch_mi355x-amds.sh:
+#   * partition `compute`, no --account (cluster default), --cpus-per-task=128,
+#     and known-bad nodes excluded;
+#   * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on
+#     the allocated node (not on the login node like the shared-FS NVIDIA path);
+#   * pyxis flags --container-writable --container-remap-root for the ROCm image.
+# AMD backends: CX_BENCH=mori (MoRI EP dispatch/combine, default) or nccl
+# (collective primitives via rccl-tests, the ROCm nccl-tests fork).
+#
+# !!! NOT yet validated on hardware (no MI355X cluster access at authoring time).
+# Treat the first on-runner run as validation — like the DeepEP path was on GB200.
+#
+# Run from inside the InferenceX checkout on the MI355X login node:
+#     bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh
+#
+# Env knobs: CX_BENCH(mori) CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(60) CX_IMAGE
+#   CX_SQUASH_DIR(/var/lib/squash) CX_EXCLUDE_NODES CX_NODELIST CX_LOCK_DIR(/tmp) CX_STAGE_DIR CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}"   # doubles as the Slurm job name
+PARTITION="${CX_PARTITION:-compute}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-60}"   # generous: a cold enroot import of the large ROCm image
+IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}"   # node-local on MI355X
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"   # only used when CX_NODELIST is empty
+# Optional node pin. The node-local squash is only staged on some nodes, and on
+# others /var/lib/squash isn't writable (cold-import fails). Pin CI to nodes that
+# already hold the squash via CX_NODELIST (overrides the exclude list).
+NODELIST="${CX_NODELIST:-}"
+MOUNT_DIR=/ix                        # where the checkout appears inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"  # UTC stamp with '-' instead of ':'
+
+# AMD backends wired: mori (MoRI EP dispatch/combine) and nccl (collective
+# primitives via rccl-tests). Default mori; honor an explicit CX_BENCH.
+export CX_BENCH="${CX_BENCH:-mori}"
+case "$CX_BENCH" in
+  mori|nccl) ;;
+  *) cx_log "mi355x: CX_BENCH='$CX_BENCH' unsupported on AMD (want mori|nccl); using mori"; export CX_BENCH=mori ;;
+esac
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH image=$IMAGE"
+# AMD workspace is compute-visible (the serving launcher bind-mounts it directly),
+# so no staging by default (CX_STAGE_DIR is passed through if set); the node-local squash is handled via srun below.
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+SQUASH_KEY="${IMAGE//[\/:@#]/_}"   # each of / : @ # -> _ ; pure bash, since '#' was both sed delimiter and a set member
+SQUASH_FILE="$SQUASH_DIR/${SQUASH_KEY}.sqsh"
+# Lock in a guaranteed-writable per-node dir, NOT next to the squash: on some
+# nodes /var/lib/squash is root/admin-owned, so even a world-readable squash
+# can't get a sibling .lock created (flock -> "Bad file descriptor"). CX_LOCK_DIR
+# overrides. The lock only serializes concurrent imports on the same node.
+LOCK_FILE="${CX_LOCK_DIR:-/tmp}/${SQUASH_KEY}.sqsh.lock"
+cx_log "squash(node-local)=$SQUASH_FILE  lock=$LOCK_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# Pin to specific nodes (CX_NODELIST) when set, else exclude the known-bad ones.
+if [ -n "$NODELIST" ]; then
+  cx_log "node pin: --nodelist=$NODELIST"
+  NODE_SEL=(--nodelist="$NODELIST")
+else
+  NODE_SEL=(--exclude="$EXCLUDE_NODES")
+fi
+salloc --partition="$PARTITION" "${NODE_SEL[@]}" --gres=gpu:"$NGPUS" \
+       --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "${USER:-$(id -un)}" -h -o %A | head -n1)"   # own jobs only: the EXIT trap scancels this ID
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+# Clear stray containers, then enroot-import to the node-local squash (flock; </dev/null so a
+# missing token can't hang). Both run on the allocated node; the import script is double-quoted, so its $VARS expand here.
+# shellcheck disable=SC2016  # $(...) must expand on the remote node, not here
+srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true
+srun --jobid="$JOB_ID" bash -c "
+  mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true
+  exec 9>\"$LOCK_FILE\" || { echo 'cannot open lock $LOCK_FILE' >&2; exit 1; }
+  flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; }
+  if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then
+    echo 'squash present: $SQUASH_FILE'
+  else
+    rm -f \"$SQUASH_FILE\"
+    enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" </dev/null
+  fi
+"
+# Run the generic dispatcher inside the container; it picks the benchmark from CX_BENCH.
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --container-writable --container-remap-root --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+# ROCm can leave gpucore.* dumps in the workdir on a crash; clear them so the
+# next checkout on this runner is clean (mirrors the serving launcher).
+rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh
new file mode 100644
index 000000000..bfbbba845
--- /dev/null
+++ b/experimental/CollectiveX/launchers/run_in_container.sh
@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+# CollectiveX — generic in-container benchmark dispatcher (single-node).
+#
+# Runs INSIDE the container under `srun`, invoked by every per-SKU adapter
+# (launch_<sku>.sh). The SKU adapter handles allocation/container/transport-env;
+# this script decides WHICH benchmark to run from CX_BENCH, so any benchmark can
+# be driven through any SKU's launch script. Writes provenance-tagged JSON to
+# results/.
+#
+# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO
+# Selector:        CX_BENCH = nccl | deepep | mori | all    (default nccl)
+#                  (mori = AMD ROCm EP; nccl/deepep = NVIDIA. `all` = nccl+deepep.)
+# NCCL knobs:      CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME
+# EP knobs (DeepEP/MoRI), all -> tests/run_ep.py:
+#   CX_PHASE = decode | prefill | both (default decode)   <- picks the token sweep
+#   CX_TOKENS_LADDER (space/comma sep; blank = phase default), CX_TOKENS_PER_RANK (legacy single point)
+#   CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE CX_ROUTING CX_MODE(normal|ll)
+#   CX_NUM_SMS (DeepEP comm SMs) CX_SEED CX_ITERS
+set -euo pipefail
+
+# Enter the CollectiveX root. It is derived from this script's own location
+# (the script lives in <root>/launchers/) rather than a hard-coded mount point,
+# so the dispatcher works at whatever path the SKU adapter mounts the repo.
+# Under `set -e` a failed cd aborts the script here.
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+# shellcheck source=common.sh
+source launchers/common.sh
+mkdir -p results
+
+# Required inputs: abort immediately, naming the variable, if the adapter did
+# not export one of them.
+: "${CX_RUNNER:?CX_RUNNER not set}"
+: "${CX_NGPUS:?CX_NGPUS not set}"
+: "${CX_TS:?CX_TS not set}"
+: "${CX_TOPO:?CX_TOPO not set}"
+# Optional inputs: the benchmark selector defaults to nccl; the transport label
+# defaults to empty and is passed through to the runners as-is.
+CX_BENCH="${CX_BENCH:-nccl}"
+CX_TRANSPORT="${CX_TRANSPORT:-}"
+# One environment snapshot per (runner, timestamp); every suite below hands
+# this path to its runner via --env-json.
+ENVJSON="results/env_${CX_RUNNER}_${CX_TS}.json"
+
+cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO"
+python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS"
+
+run_nccl_suite() {
+  # Build the vendor collective-test binaries, then invoke run_nccl.py once per
+  # requested op. A failing op is logged and remembered rather than aborting the
+  # loop, so the remaining ops still run.
+  # Returns 1 if the build or any op failed, 0 otherwise.
+  local tests_dir op_list op failed=0 impl=nccl builder=cx_build_nccl_tests
+  local -a cmd
+  # An /opt/rocm tree or hipcc on PATH selects rccl-tests (AMD/ROCm; same
+  # binaries + output, parsed by run_nccl.py); otherwise nccl-tests
+  # (NVIDIA/CUDA). Both are single-node builds: MPI=0, -g N.
+  if [ -d /opt/rocm ] || command -v hipcc >/dev/null 2>&1; then
+    impl=rccl
+    builder=cx_build_rccl_tests
+  fi
+  tests_dir="$("$builder" "$PWD/.nccl-tests" 0)" || return 1
+  cx_log "collective impl=$impl build=$tests_dir"
+  op_list="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+  # $op_list is intentionally unquoted: CX_OPS is a space-separated list.
+  for op in $op_list; do
+    cmd=(
+      python3 run_nccl.py --op "$op" --nccl-tests-dir "$tests_dir"
+      --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS"
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT"
+      --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json"
+      --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1
+    )
+    "${cmd[@]}" || { cx_log "WARN: $impl $op failed or invalid"; failed=1; }
+  done
+  return "$failed"
+}
+
+# Print the source-tokens-per-rank sweep to hand to tests/run_ep.py.
+# Precedence: a non-empty CX_TOKENS_LADDER; else the legacy single-point
+# CX_TOKENS_PER_RANK (a one-point ladder); else nothing, which leaves
+# tests/run_ep.py to pick the phase default (decode small / prefill large).
+cx_ep_ladder() {
+  printf '%s' "${CX_TOKENS_LADDER:-${CX_TOKENS_PER_RANK:-}}"
+}
+
+# run_ep_suite <backend: deepep|mori>
+# Launches tests/run_ep.py under torchrun once per phase named by CX_PHASE
+# (decode, prefill, or both = decode then prefill; default decode). Dispatch and
+# combine are timed separately inside run_ep.py. Writes one JSON per
+# (backend, phase). A failed or timed-out phase is logged and the next phase
+# still runs; returns 1 if any phase failed, 0 otherwise.
+run_ep_suite() {
+  local backend="$1" phase phase_list ladder status=0
+  local -a cmd
+  ladder="$(cx_ep_ladder)"
+  phase_list="${CX_PHASE:-decode}"
+  if [ "$phase_list" = "both" ]; then
+    phase_list="decode prefill"
+  fi
+  for phase in $phase_list; do
+    cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-<phase-default>}'"
+    # Hard wall-clock guard: a wedged collective (e.g. a backend that hangs at a
+    # shape) must FAIL FAST instead of consuming the whole job timeout.
+    # `timeout -k 30` follows up with SIGKILL after a grace period. Override the
+    # limit with CX_RUN_TIMEOUT (seconds).
+    # The two unquoted ${VAR:+...} words are deliberate: they vanish entirely
+    # when the variable is unset or empty.
+    cmd=(
+      timeout -k 30 "${CX_RUN_TIMEOUT:-900}"
+      torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py --backend "$backend"
+      --phase "$phase" --tokens-ladder "$ladder" --mode "${CX_MODE:-normal}"
+      --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}"
+      --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" --routing "${CX_ROUTING:-uniform}"
+      ${CX_EPLB:+--eplb} ${CX_WORKLOAD_DIR:+--workload-dir "$CX_WORKLOAD_DIR"}
+      --num-sms "${CX_NUM_SMS:-24}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-200}"
+      --trials "${CX_TRIALS:-3}"
+      --measurement-contract "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}"
+      --resource-mode "${CX_RESOURCE_MODE:-normalized}" --sm-fraction "${CX_SM_FRACTION:-0.18}"
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT"
+      --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"
+    )
+    "${cmd[@]}" || {
+      cx_log "WARN: $backend $phase run failed/timed out (CX_RUN_TIMEOUT=${CX_RUN_TIMEOUT:-900}s)"
+      status=1
+    }
+  done
+  return "$status"
+}
+
+run_deepep_suite() {
+  # Run the EP suite with the DeepEP backend.
+  # DeepEP is not bundled in the multi-arch image. Try to import it; if absent,
+  # attempt rebuild-deepep.sh (srt-slurm setup script). Inability to run is a
+  # failure, not a silent skip — the caller asked for deepep.
+  # Returns 1 if DeepEP cannot be made importable, else run_ep_suite's status.
+  if ! python3 -c "import deep_ep" 2>/dev/null; then
+    if ! command -v rebuild-deepep.sh >/dev/null 2>&1; then
+      cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; cannot run deepep"
+      return 1
+    fi
+    cx_log "building DeepEP via rebuild-deepep.sh"
+    rebuild-deepep.sh >&2 || { cx_log "WARN: rebuild-deepep.sh failed"; return 1; }
+    # A rebuild that exits 0 does not prove the module loads. Probe again, this
+    # time with stderr visible so the import error lands in the log, and stop
+    # here rather than launching a multi-rank torchrun that cannot import it.
+    if ! python3 -c "import deep_ep"; then
+      cx_log "WARN: deep_ep still not importable after rebuild-deepep.sh; cannot run deepep"
+      return 1
+    fi
+  fi
+  run_ep_suite deepep
+}
+
+run_mori_suite() {
+  # Run the EP suite with the MoRI backend (AMD ROCm EP), which ships in the
+  # AMD MoRI image. MoRI is not rebuildable here, so a missing module is a
+  # failure rather than a silent skip. Single-node 8x MI355X over XGMI;
+  # torch.cuda maps onto ROCm/HIP.
+  # Returns 1 if mori is not importable, else run_ep_suite's status.
+  python3 -c "import mori" 2>/dev/null || {
+    cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori"
+    return 1
+  }
+  run_ep_suite mori
+}
+
+# Dispatch on CX_BENCH. A failing suite only records rc=1, so the summary
+# below still runs; an unrecognised selector goes to cx_die.
+rc=0
+case "$CX_BENCH" in
+  nccl)
+    if ! run_nccl_suite; then rc=1; fi
+    ;;
+  deepep)
+    if ! run_deepep_suite; then rc=1; fi
+    ;;
+  mori)
+    if ! run_mori_suite; then rc=1; fi
+    ;;
+  all)
+    # `all` = nccl followed by deepep; deepep still runs if nccl failed.
+    if ! run_nccl_suite; then rc=1; fi
+    if ! run_deepep_suite; then rc=1; fi
+    ;;
+  *)
+    cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|mori|all)"
+    ;;
+esac
+
+# Summary table for the log; also fails the job if no valid results were produced.
+if ! python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS"; then
+  rc=1
+fi
+exit "$rc"
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
new file mode 100644
index 000000000..d62bb7746
--- /dev/null
+++ b/experimental/CollectiveX/plan.md
@@ -0,0 +1,940 @@
+# CollectiveX — Plan
+
+> **How to read this.** This is the single canonical plan. It is **spike-first** and **scoped to `experimental/CollectiveX/`** on a branch — nothing in the production serving path changes until a promotion decision is made later. Part 1 is background (what CollectiveX is, reconstructed from team discussion). Part 2 is the implementation plan. Where this plan says "now," it means the Milestone 0 spike; "later" items (GitHub workflow, database, app frontend) are deliberately deferred. All repository references (runners, launchers, workflows, matrix logic, the `experimental/` charter) were verified against the live InferenceX repo — see References.
+
+---
+
+# Part 1 — Background
+
+## What it is
+
+CollectiveX is a benchmarking workstream under the InferenceX umbrella. It measures **collective communication** and **MoE dispatch/combine**, and performs **apples-to-apples, cross-vendor comparison of expert-parallel (EP) libraries** across NVIDIA and AMD (TPU later). The intended deliverables are an **OSS benchmark project** and a **public explainer article** — a credible cross-vendor collective benchmark plus the story around it.
+
+## Why
+
+Existing public benchmarks don't offer trustworthy, like-for-like collective/EP comparison across vendors. CollectiveX fills that gap by reusing InferenceX's runner and cluster infrastructure to produce reproducible, provenance-tagged results.
+
+## Current state
+
+- An initial MVP exists: it collected collective and kernel shapes and produced MoE dispatch/combine results on NVIDIA.
+- **Normal mode works; low-latency (LL) mode is blocked** on IBGDA enablement — a direct GPU↔NIC data-and-control path over PCIe that removes CPU coordination and simplifies MoE dispatch/combine collectives — which depends on cluster-networking work outside this project.
+- The main near-term enabler is NVIDIA networking / IBGDA; the AMD EP stack and AMD networking (Ultra Ethernet) are the cross-vendor counterpart.
+
+---
+
+# Part 2 — Implementation plan
+
+## Implementation status (built)
+
+The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that:
+
+- **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference for both arches; DeepEP via `rebuild-deepep.sh`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`.
+- **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`.
+- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
+- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `tests/ep_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `tests/ep_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`).
+
+This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
+
+## Scope and placement
+
+CollectiveX starts as an **experimental project on its own branch**, fully contained under `experimental/CollectiveX/`:
+
+```bash
+git switch main
+git pull --ff-only
+git switch -c collectivex
+mkdir -p experimental/CollectiveX
+```
+
+This matches the repository's intent: `experimental/` is explicitly non-core ("experimental WIP code that is mostly Claude Code generated… not intended for production use or as part of the official InferenceMAX results").
+
+For the experimental phase, **everything stays inside `experimental/CollectiveX/**`**. Do **not** modify:
+
+```text
+benchmarks/
+runners/
+utils/
+.github/configs/
+perf-changelog.yaml
+InferenceX-app
+```
+
+The only eventual exception is a minimal workflow dispatcher under `.github/workflows/` (because executable workflows must live there); all real CollectiveX logic, schemas, launchers, and processing stay under `experimental/CollectiveX/`.
+
+**This supersedes any notion of CollectiveX becoming a top-level InferenceX subsystem or extending the production serving matrix up front.** Promotion — into core InferenceX, into a dedicated repo, or into InferenceX-app's database/frontend — is an explicit *later* decision (Milestone 4), made only after the benchmark contract has stabilized on real hardware.
+
+### What InferenceX already gives us
+
+InferenceX's existing execution model is almost exactly the control plane CollectiveX needs:
+
+1. Generate and strictly validate a matrix on a GitHub-hosted runner.
+2. Fan jobs out to named or labelled self-hosted runners.
+3. Those listeners submit work to Slurm (or launch Docker locally).
+4. Normalize outputs.
+5. Upload artifacts.
+6. Aggregate and dispatch ingestion to the dashboard.
+
+`e2e-tests.yml` already divides generated configs into job families and invokes reusable single-node and multi-node workflows; `benchmark-tmpl.yml` cleans up resources, checks out the selected ref, **derives the launcher from the runner name**, launches the job, validates outputs, and uploads normalized results. Runner listeners live on cluster login/controller nodes while jobs run on compute nodes via Slurm; runner names/labels are load-bearing — the name prefix selects the launcher and exact names/SKU labels control scheduling.
+
+CollectiveX reuses all of this, but enters through **CollectiveX-specific launchers** rather than threading fake models through the serving launchers (see Cluster reuse).
+
+## Architecture
+
+Four planes, cleanly separated:
+
+- **Control plane:** scheduling, runners, cleanup, artifact movement, workflow metadata (reused from InferenceX).
+- **Benchmark plane:** collective semantics, backend invocation, correctness, timing.
+- **Data plane:** canonical result records, raw per-rank samples, topology and provenance.
+- **Presentation plane:** comparable subsets, charts, history, diagnostics.
+
+Data flow within the experimental directory:
+
+```text
+Portable shape definitions
+          +
+Backend definitions
+          +
+Target/cluster definitions
+          ↓
+CollectiveX matrix resolver
+          ↓
+Resolved shards
+          ↓
+Existing InferenceX self-hosted runner
+          ↓
+experimental/CollectiveX/launchers/*
+          ↓
+Backend adapter  (NCCL / RCCL / DeepEP / AITER / MoRI / …)
+          ↓
+Versioned result bundle
+          ↓
+Aggregator + regression checker
+          ↓
+Static experimental report   →  (later) InferenceX-app ingestion → Postgres → /collectives
+```
+
+### Target structure at promotion (Milestone 4)
+
+This packaged layout is the **promotion target**, not the spike. Milestone 0 uses the light layout in the rollout section below (`run_nccl.py` / `tests/run_ep.py` / `env_capture.py` / `plot.py` + flat `results/`); the structure here is what CollectiveX grows into *if* it is promoted out of `experimental/`.
+
+```text
+InferenceX/
+├── experimental/
+│   ├── README.md
+│   └── CollectiveX/
+│       ├── README.md
+│       ├── DESIGN.md
+│       ├── ROADMAP.md
+│       ├── pyproject.toml
+│       ├── Makefile
+│       │
+│       ├── src/
+│       │   └── collectivex/
+│       │       ├── __init__.py
+│       │       ├── cli.py
+│       │       ├── config/
+│       │       │   ├── models.py
+│       │       │   ├── loader.py
+│       │       │   ├── resolver.py
+│       │       │   └── matrix.py
+│       │       ├── benchmark/
+│       │       │   ├── harness.py
+│       │       │   ├── timing.py
+│       │       │   ├── correctness.py
+│       │       │   ├── routing.py
+│       │       │   └── metrics.py
+│       │       ├── backends/
+│       │       │   ├── base.py
+│       │       │   ├── fake.py
+│       │       │   ├── nccl_tests.py
+│       │       │   ├── rccl_tests.py
+│       │       │   ├── deepep.py
+│       │       │   └── framework_ep.py
+│       │       ├── cluster/
+│       │       │   ├── inventory.py
+│       │       │   ├── capabilities.py
+│       │       │   ├── environment.py
+│       │       │   └── launcher.py
+│       │       ├── results/
+│       │       │   ├── models.py
+│       │       │   ├── writer.py
+│       │       │   ├── aggregate.py
+│       │       │   ├── compare.py
+│       │       │   └── redact.py
+│       │       └── report/
+│       │           ├── build.py
+│       │           └── templates/
+│       │
+│       ├── configs/
+│       │   ├── suites/
+│       │   │   ├── smoke.yaml
+│       │   │   ├── primitives.yaml
+│       │   │   ├── moe-decode.yaml
+│       │   │   ├── moe-prefill.yaml
+│       │   │   └── full.yaml
+│       │   ├── shapes/
+│       │   │   ├── synthetic/
+│       │   │   └── traced/
+│       │   ├── backends/
+│       │   ├── targets/
+│       │   └── clusters.yaml
+│       │
+│       ├── launchers/
+│       │   ├── common.sh
+│       │   ├── launch_b200-dgxc.sh         # B200 single node
+│       │   ├── launch_b200-dgxc-slurm.sh   # B200 multinode
+│       │   └── launch_gb200-nv.sh          # GB200 NVL72
+│       │
+│       ├── schemas/
+│       │   ├── case-v1.schema.json
+│       │   ├── result-v1.schema.json
+│       │   ├── manifest-v1.schema.json
+│       │   └── environment-v1.schema.json
+│       │
+│       ├── scripts/
+│       │   ├── bootstrap.sh
+│       │   ├── run_suite.sh
+│       │   ├── run_shard.sh
+│       │   └── build_report.sh
+│       │
+│       ├── tests/
+│       │   ├── fixtures/
+│       │   ├── test_config.py
+│       │   ├── test_matrix.py
+│       │   ├── test_parsers.py
+│       │   ├── test_correctness.py
+│       │   └── test_comparability.py
+│       │
+│       └── docs/
+│           ├── BENCHMARK_CONTRACT.md
+│           ├── BACKEND_ADAPTER.md
+│           ├── SHAPE_REGISTRY.md
+│           ├── RESULT_FORMAT.md
+│           ├── FRONTEND.md
+│           └── PROMOTION_CRITERIA.md
+│
+└── .github/workflows/
+    └── collectivex-experimental.yml   # Added only when cluster CI begins (Milestone 2)
+```
+
+> Note: launcher names mirror the real runner-name prefixes. The spike adds the three NVIDIA launchers above; AMD (`launch_mi355x-amds.sh`) and others follow.
+
+## Benchmark model — keep four concepts separate
+
+CollectiveX needs its **own** schema. Do **not** reuse or extend the serving matrix, which is built around model / ISL / OSL / framework / TP / EP / concurrency and lives in `utils/matrix_logic/generate_sweep_configs.py`. Representing collectives with fake model names, `ISL=0`, or overloaded concurrency fields would create permanent technical debt. CollectiveX gets its own matrix logic (in the packaged layout, `src/collectivex/config/matrix.py`) — introduced with the workflow at Milestone 2, not the spike — rather than touching `utils/matrix_logic/generate_sweep_configs.py`.
+
+The model keeps four concepts independent:
+
+**Shape** — the logical communication workload:
+
+```text
+operation, message size, tokens per rank, hidden size, top-k,
+expert count, routing distribution, dtype, phase
+```
+
+**Backend** — the implementation under test:
+
+```text
+NCCL, RCCL, DeepEP, AITER, MoRI, framework-native EP, reference implementation
+```
+
+**Target** — where and how it runs:
+
+```text
+runner type, cluster, nodes, GPUs per node, rank placement,
+fabric, container image, transport capabilities
+```
+
+**Suite** — a curated selection of shape × backend × target combinations. Keeping these separate prevents copying the same DeepSeek/MiniMax shape into every NVIDIA and AMD configuration.
+
+### Portable definitions
+
+Shape:
+
+```yaml
+schema-version: 1
+shape-id: moe.decode.h7168.top8.e256.t64.uniform.v1
+
+kind: moe
+phase: decode
+operation: dispatch-combine
+
+shape:
+  tokens-per-rank: 64
+  hidden-size: 7168
+  top-k: 8
+  num-experts: 256
+  dispatch-dtype: fp8
+  combine-dtype: bf16
+  routing:
+    distribution: uniform
+    seed: 67
+  expert-alignment: 16
+```
+
+Backend:
+
+```yaml
+backend-id: deepep-normal
+backend: deepep
+mode: normal
+
+source:
+  repository: deepseek-ai/DeepEP
+  ref: pinned-commit
+
+settings:
+  async-overlap: false
+  num-comm-sms: standardized
+  qp-count: auto
+```
+
+Target:
+
+```yaml
+target-id: b200-dgxc-4n
+runner-type: b200-multinode
+cluster-id: b200-dgxc
+
+resources:
+  nodes: 4
+  gpus-per-node: 8
+  exclusive: true
+
+placement:
+  ranks-per-node: 8
+  rank-order: contiguous
+
+capabilities:
+  rdma: true
+  ibgda: experimental
+  nvshmem: true
+```
+
+Suite:
+
+```yaml
+suite-id: moe-decode-smoke
+
+shapes:
+  - moe.decode.h7168.top8.e256.t64.uniform.v1
+
+backends:
+  - deepep-normal
+  - deepep-low-latency
+
+targets:
+  - b200-dgxc-2n
+
+measurement:
+  warmup-iterations: 20
+  measured-iterations: 200
+  trials: 3
+  correctness: full
+```
+
+### Case identity
+
+A **case** is one immutable, versioned point: the natural key composes the three concepts —
+
+```text
+case-id = <backend-id> __ <shape-id> __ <target-id>
+e.g.  deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n
+      nccl__allreduce.fp16.logsweep.v1__b200-dgxc-2n
+```
+
+A shape must never silently change; a newly extracted distribution gets a new versioned `shape-id`.
+
+**Required shape fields — primitives:** operation; logical element count; datatype; input/output bytes; in-place vs out-of-place; reduction op (where applicable); world size; rank placement; host-driven vs device-driven launch; blocking/synchronization semantics.
+
+**Required shape fields — MoE (additional):** tokens per rank; hidden size; top-k; number of experts; EP size; dispatch and combine dtypes; routing distribution; expert alignment/padding; capacity constraints; quantization scale representation; cached vs recomputed routing layout; communication-SM count; async-overlap mode. DeepEP shows why these must be first-class — its interface takes tokens/rank, hidden size, top-k, expert count, FP8 mode and comm-SM settings, and exposes async dispatch/combine.
+
+### Shape registry
+
+Two independent shape sources:
+
+**Synthetic** — for continuous curves and hardware characterization (logarithmic byte sweep for primitives; token-count sweep for MoE; EP-scaling sweep; uniform and controlled-skew routing; intranode and internode placements; decode-oriented and prefill-oriented regimes). Don't build every Cartesian combination; define named suites (`primitive-latency-v1`, `primitive-bandwidth-v1`, `moe-decode-v1`, `moe-prefill-v1`, `moe-skew-v1`, `scaleout-v1`).
+
+**Trace-derived** — extracted from real InferenceX runs/profiles:
+
+```text
+models/deepseek-v4/decode/<shape-id>
+models/minimax-m3/decode/<shape-id>
+models/kimi-k2.7/prefill/<shape-id>
+```
+
+Each traced shape retains: source workflow run; model/config; phase; layer/layer-group; observed token histogram; routing skew; concurrent collective count; framework version; extraction-tool version. InferenceX already has a targeted profiling workflow (`profile.yml`) with optional MoE debug output and a separate trace-storage path — a natural source for real shapes rather than only guessed synthetic inputs.
+
+## Benchmark layers and comparison classes
+
+| Layer | Purpose | Examples |
+|---|---|---|
+| **L0 Environment** | Prove the cluster is benchmarkable | topology, NIC/GPU state, peer access, RDMA, IBGDA capability, version capture |
+| **L1 Primitive collectives** | Characterize the raw communication substrate | send/recv, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv |
+| **L2 MoE communication** | Compare real EP libraries | dispatch, combine, dispatch+combine round trip, normal and low-latency modes |
+| **L3 Integrated pipelines** | Communication in realistic operator sequences | route → permute → dispatch → grouped GEMM → combine → unpermute |
+| **L4 E2E correlation** | Explain InferenceX serving performance | isolated CollectiveX result linked to the corresponding InferenceX run/profile |
+
+The MVP concentrates on **L1 and L2**. L3 overlaps OperatorX and comes after the contracts are stable; L4 is the eventual tie-back to serving.
+
+**L0 — Environment validation** (before measuring anything): GPU count/identity; GPU/NIC topology; CUDA/ROCm version; driver version; NCCL/RCCL version; RDMA device visibility; peer-access matrix; IBGDA/SHMEM capability; container digest; clock/power state; selected network interfaces. A failed probe yields one clear `environment-invalid` result, not dozens of misleading backend failures.
+
+**L1 — Primitives:** send/receive, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv. Use vendor test programs where possible rather than rewriting primitives. Measure two regions separately: latency (bytes→low KiB) and bandwidth (MiB→GiB).
+
+**L2 — MoE collectives:** dispatch, combine, dispatch+combine. Dimensions: tokens/rank, hidden size, top-k, expert count, EP size, dispatch dtype, combine dtype, routing skew, normal vs low-latency, comm-SM count, node count.
+
+### Three comparison classes
+
+Every result is tagged with exactly one, and they must never be silently mixed on one chart:
+
+| Class | Meaning |
+|---|---|
+| `standardized` | Matched logical shape **and** fixed resource budget — same shape, topology, dtype, correctness contract, allowed comm-SMs, and timing boundaries. The main apples-to-apples comparison. |
+| `backend-optimized` | Same logical output, but each library uses its recommended comm-SMs / protocols / QP count / buffer sizing / graph capture / tuning. Answers "what is the best each stack can do?" |
+| `framework-integrated` | The actual path used by SGLang / vLLM / TensorRT-LLM / Dynamo. Connects to InferenceX; not a pure microbenchmark. |
+
+### Comparability key
+
+Every result gets a machine-generated comparison key; rows with different keys are not connected on the same curve by default:
+
+```text
+operation, shape ID, dtype, world size, node count, rank placement,
+routing distribution, comparison class, measurement contract version, topology class
+```
+
+## Measurement and correctness
+
+### Timing boundaries
+
+Record separately — never report one latency that sometimes includes JIT and sometimes doesn't:
+
+```text
+1. communicator creation
+2. buffer allocation and registration
+3. first invocation / JIT
+4. warmed steady-state invocation
+5. host launch time
+6. GPU completion time
+7. optional end-to-end framework-visible time
+```
+
+Per measured iteration: synchronize before starting (unless explicitly testing queued execution); use GPU events for device duration and host monotonic time for API/launch duration; retain per-rank measurements; aggregate only after rank-level data is stored; report the **slowest rank** as well as the average.
+
+### Correctness as a hard gate
+
+A result is `valid` only after correctness passes. A fast result that fails correctness stays visible as `invalid` — never silently dropped.
+
+Primitive checks: deterministic input; expected reduction result; guard regions around buffers; in-place and out-of-place checks; dtype-specific tolerances.
+
+MoE checks: token conservation; correct expert assignment; correct routing weights; valid permutation metadata; dispatch output vs reference; combine output vs reference; no padded-token leakage; deterministic routing hash.
+
+Failed results remain in artifacts, e.g.:
+
+```json
+{
+  "status": "invalid",
+  "correctness_passed": false,
+  "error": "combine result exceeded bf16 tolerance"
+}
+```
+
+### Routing distributions
+
+At minimum: uniform; single-hot/worst-case concentration; Zipf-like skew; bounded imbalance; replayed real histogram. Store the routing seed and the generated assignment hash.
+
+### Metrics
+
+| Category | Metrics |
+|---|---|
+| Latency | p50, p90, p95, p99, min, max |
+| Rank behavior | slowest-rank latency, rank spread, coefficient of variation |
+| Primitive throughput | algorithm bandwidth, bus bandwidth, effective bytes/s |
+| MoE throughput | tokens/s, logical payload GB/s, dispatch and combine separately |
+| Efficiency | bandwidth relative to declared topology bottleneck |
+| Host overhead | API launch time, CPU utilization where available |
+| GPU overhead | communication SM count, GPU active time, optional power |
+| Memory | persistent buffer bytes, peak temporary bytes |
+| Overlap | standalone comm, standalone compute, overlapped duration, overlap efficiency |
+| Reliability | initialization failures, hangs, retries, correctness failures |
+| Provenance | all software, image, driver, firmware and topology identifiers |
+
+### Bandwidth definitions
+
+NCCL `algbw`/`busbw` are stored but not treated as universal (NCCL applies operation-specific correction factors). MoE libraries often report **logical bottleneck bandwidth** (may include local-rank traffic or exclude metadata/padding; DeepEP explicitly publishes logical bandwidth). Store separate fields, and use `null` rather than a deceptive inference when a backend can't expose physical bytes:
+
+```text
+logical_payload_bytes
+allocated_payload_bytes
+estimated_link_bytes
+metadata_bytes
+padding_bytes
+```
+
+## Result and artifact format
+
+Each shard emits a versioned bundle:
+
+```text
+output/
+├── manifest.json
+├── cases.json
+├── results.jsonl
+├── rank-samples.jsonl.gz
+├── summary.json
+├── environment/
+│   ├── gpu.json
+│   ├── network.json
+│   ├── topology.json
+│   └── software.json
+├── raw/
+│   ├── stdout.log
+│   ├── stderr.log
+│   └── backend-output/
+├── commands/
+│   └── reproduce.sh
+└── profiles/
+```
+
+**Manifest** (invariant run-level metadata): schema version; workflow run + attempt; source SHA/ref; cluster ID; runner; Slurm job ID; node count; topology fingerprint; image digest; backend commit/build; start/end timestamps; redaction version.
+
+**Result row:**
+
+```json
+{
+  "schema_version": 1,
+  "case_id": "deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n",
+  "status": "valid",
+  "trial": 1,
+  "backend": "deepep",
+  "mode": "normal",
+  "comparison_class": "standardized",
+  "metrics": {
+    "latency_us_p50": 0,
+    "latency_us_p99": 0,
+    "slowest_rank_us_p50": 0,
+    "logical_bandwidth_gbps": 0,
+    "tokens_per_second": 0,
+    "rank_spread_pct": 0,
+    "persistent_buffer_bytes": 0
+  },
+  "correctness": { "passed": true, "max_abs_error": 0, "max_rel_error": 0 }
+}
+```
+
+Use an explicit `schema_version` from the beginning — do not repeat the app's historical need to infer schema version from whether a field happens to exist.
+
+## Backend adapters
+
+Each adapter implements a small contract:
+
+```python
+class CollectiveBackend:
+    def probe(self, environment) -> CapabilityReport: ...
+    def prepare(self, case, workdir) -> PreparedCommand: ...
+    def run(self, prepared, launcher) -> RawRun: ...
+    def parse(self, raw_run) -> list[RankSample]: ...
+    def validate(self, case, raw_run) -> CorrectnessReport: ...
+    def describe(self) -> BackendProvenance: ...
+```
+
+**Tier 0 — communication baselines:** NVIDIA `nccl-tests`, ROCm `rccl-tests`, optionally PyTorch distributed as a common-API baseline. Don't rewrite primitives from scratch — `nccl-tests` already supports multi-node, warmups, correctness checking (`-c 1`), per-rank aggregation, device-driven implementations, and separate CPU-time reporting. *(Confirm whether the installed build emits JSON; if not, parse the text table.)*
+
+**Tier 1 — MoE dispatch/combine:** upstream DeepEP, ROCm DeepEP, and the NVIDIA/AMD EP paths already used by the InferenceX serving stacks. **Version pins are first-class.** Upstream DeepEP V2 changed NVSHMEM→NCCL, unified high-throughput and low-latency APIs, changed buffer behavior, and removed a previous zero-SM LL mode; ROCm's port has different maturity, NIC variants, rocSHMEM dependencies. DeepEP is **built at job setup** (via `rebuild-deepep.sh`, resolved by srt-slurm), not shipped in the image — its build time and `aarch64` (GB200) feasibility are tracked spike risks. A chart labelled only "DeepEP" is therefore ambiguous — store:
+
+```text
+backend name, upstream/fork, git commit, API generation,
+transport backend, build flags, runtime library versions, container digest
+```
+
+**Tier 2 — additional optimized stacks (later):** MSCCL++, AITER comm/fusion paths, MoRI/Pollara, NVSHMEM/rocSHMEM microbenchmarks, framework-native fused collectives.
+
+## Rollout — spike-first
+
+**Spike-first.** No schema, Pydantic model, or comparison contract is frozen until one real, correctness-gated number exists on real hardware. The first milestone is a single end-to-end spike on **two NVIDIA topologies, B200 and GB200**, chosen because they exercise the two transport regimes that matter: B200 is an 8-GPU NVLink island with CX-7 InfiniBand between nodes; GB200 is an NVL72 multi-node-NVLink (MNNVL) domain. Running the same collective across both is itself the first headline result, and it forces the provenance and comparison-class machinery to be real from line one. The schema is the spike's *output*, extracted from the artifacts it produces — not its input. AMD and all platform work (workflow, DB, frontend) follow.
+
+### Milestone 0 — NVIDIA B200 + GB200 spike
+
+One milestone, NVIDIA-only, end to end. This collapses the former "design contract," "CPU framework," "primitive NVIDIA baseline," and the NVIDIA half of "MoE MVP" into a single vertical slice that produces real numbers on real fabric.
+
+Scaffolding — deliberately light, matching `experimental/` convention (bare scripts + flat JSON + a plot; no package / Pydantic / JSON-schemas yet — those arrive at the contract freeze):
+
+```text
+experimental/CollectiveX/
+  README.md
+  run_nccl.py        # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON)
+  tests/run_ep.py    # EP dispatch/combine sweep (DeepEP/MoRI); dispatch & combine timed separately
+  env_capture.py     # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json
+  plot.py            # matplotlib, like token_position_decode_slo/*/plot_*.py
+  launchers/
+    common.sh
+    launch_b200-dgxc.sh         # B200 single node  (b200-dgxc runner → 8-GPU NVLink island, x86_64)
+    launch_b200-dgxc-slurm.sh   # B200 multinode    (b200-multinode runner → CX-7 IB spine)
+    launch_gb200-nv.sh          # GB200             (gb200 runner → NVL72 MNNVL, aarch64, 4 GPU/node)
+  results/*.json     # flat, hand-verifiable
+```
+
+Reuse existing patterns rather than reinventing: `experimental/dsv32/bench.py` for `torch.cuda.Event` timing and stdout environment capture, and `experimental/token_position_decode_slo/glm-5/{bmk_*_sbatch.sh,plot_sla_frontier.py}` for Slurm orchestration + plotting. Mirror the runner→launcher routing convention (`bash ./launchers/launch_${RUNNER_NAME%%_*}.sh`) so the runner name selects the CollectiveX launcher as the serving path does.
+
+**DeepEP is not prebuilt in any image.** The serving recipes build it at job setup via `setup_script: rebuild-deepep.sh` (resolved by srt-slurm; see `benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`). The spike reuses that same rebuild path — on B200 (x86_64) first. Pin images by digest from `.github/configs/nvidia-master.yaml`: B200 `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b`; GB200 `lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc` (an unpinned nightly today — capture its digest before relying on it).
+
+What it measures:
+
+```text
+Primitives (stock nccl-tests, -c 1 for correctness) — on BOTH B200 and GB200:
+  all-reduce, all-gather, reduce-scatter, all-to-all
+  latency regime (bytes→KiB) and bandwidth regime (MiB→GiB)
+  B200  : 8 GPU/node (x86_64); 1 node (NVLink island) and 2 nodes (cross CX-7 IB)
+  GB200 : 4 GPU/node (aarch64); 1 node and 2+ nodes — all still inside the NVL72 NVLink (MNNVL) domain
+
+MoE (DeepEP, normal mode only — LL mode is the known-broken/blocked path, out of scope):
+  one decode-shaped dispatch+combine: tokens-per-rank=64, hidden=7168,
+  top-k=8, experts=256, dispatch fp8
+  correctness: token conservation + combine vs a reference implementation
+  B200 (x86_64) first; GB200 DeepEP is a fast-follow once the aarch64 rebuild-deepep path is proven
+```
+
+The headline is the **same NCCL primitive shape on both topologies**: B200's 2-node path crosses CX-7 InfiniBand, while GB200's stays on NVL72 NVLink (MNNVL). That IB-vs-MNNVL contrast at a matched logical shape is the result worth publishing. (nccl-tests and DeepEP must be built for `aarch64` on GB200 — the reason DeepEP is B200-first.)
+
+Provenance captured on every row from the first run — non-negotiable even in a spike, because it is what makes the B200-vs-GB200 number defensible:
+
+```text
+topology-class       b200-nvlink-island(+cx7-ib)  |  gb200-nvl72-mnnvl
+transport actually used   (NVLink / IB / NVSHMEM-IBGDA), derived from flags + measured behavior
+transport env set/recorded:
+  B200  : NCCL_CUMEM_ENABLE=1
+  GB200 : NCCL_CUMEM_ENABLE=1, NCCL_MNNVL_ENABLE=1, MC_FORCE_MNNVL=1
+  (also seen in serving: NCCL_P2P_LEVEL=NVL, SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK)
+comm-SM count, QP count where applicable
+backend commit + API generation + build flags
+container digest, CUDA / driver / NCCL versions
+comparison-class tag (standardized where shape, dtype and SM budget match)
+```
+
+These flags come from validated GB200 serving recipes (`…/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`); MNNVL is GB200/GB300-only, which is exactly what makes the transport differ from B200.
+
+Output: a result bundle on disk (`manifest.json`, `results.jsonl`, `environment/`, `raw/`, `commands/reproduce.sh`). Hand-verify the first rows; do not build a generated Pydantic contract yet.
+
+Exit criteria:
+
+* real NCCL latency + bandwidth curves on **both** B200 and GB200, correctness-passed (the headline)
+* one DeepEP dispatch+combine number (normal mode) on **B200**, correctness-passed; GB200 DeepEP as the immediate fast-follow
+* every row carries topology-class, transport, comparison-class and full provenance
+* a B200-vs-GB200 side-by-side that the comparison key permits **and labels as topology-class-differing** — that labeled comparison is the intended result, not an accident
+* **only now** freeze the schema (`CollectiveCase` / `CollectiveResult` / manifest), extracted from these artifacts
+
+Explicitly out of scope for the spike: AMD, IBGDA low-latency mode, GitHub Actions, database, frontend, trace-derived shapes, and the fake backend as a deliverable (keep a trivial one only if it speeds offline tests).
+
+### Milestone 1 — AMD parity
+
+Bring the AMD side up against the schema the spike froze — not in parallel with it:
+
+```text
+RCCL-tests adapter (mirror the nccl-tests text-table parser)
+one AMD launcher (launch_mi355x-amds.sh)
+one AMD MoE dispatch/combine backend (DeepEP ROCm / AITER / MoRI)
+equivalent shapes + identical result contract
+first cross-vendor (NVIDIA vs AMD) comparison
+```
+
+Record the AMD transport stack (rocSHMEM, MoRI-IO / Pollara, NIC variant) with the same provenance rigor the spike established. An unlabeled "DeepEP" row compared across vendors is meaningless.
+
+### Milestone 2 — GitHub workflow
+
+Add (orchestration only; see GitHub workflow design below):
+
+```text
+collectivex-experimental.yml
+preflight
+canary
+matrix sharding
+artifact collection
+regression comparison
+static report artifact
+```
+
+Do not connect it to `perf-changelog.yaml`.
+
+### Milestone 3 — Trace-derived shapes
+
+Extract representative shapes from InferenceX profiles (DeepSeek V4, MiniMax M3, Kimi). Every traced shape must retain: source workflow run; source configuration; framework version; model phase; extraction-tool version; routing-histogram hash.
+
+### Milestone 4 — Promotion decision
+
+Only then decide whether to: keep CollectiveX permanently experimental; move it into core InferenceX; extract it into a dedicated repository; or integrate its data into InferenceX-app (database + `/collectives` frontend).
+
+### First PRs (the spike)
+
+The spike lands as a few small PRs, each producing something runnable — not a docs-and-schema PR:
+
+```text
+1. Scaffold + NCCL on B200 single node
+   run_nccl.py (text-table parser), env_capture.py, plot.py,
+   launchers/launch_b200-dgxc.sh, results/*.json
+   → lands when it emits a real all-reduce curve with provenance from an 8-GPU B200
+
+2. B200 multinode + GB200
+   launchers/launch_b200-dgxc-slurm.sh, launchers/launch_gb200-nv.sh
+   → lands when the same primitive runs on 2-node B200 (cross-IB) and on GB200 NVL72 (MNNVL),
+     each tagged with topology-class and transport (aarch64 build for GB200)
+
+3. DeepEP dispatch+combine — B200 first
+   tests/ep_deepep.py, routing generator + reference combine for correctness,
+   reusing rebuild-deepep at job setup
+   → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow
+
+4. Freeze the contract
+   extract the case / result / manifest schema from the bundles produced in 1–3;
+   add fixtures captured from real output — this is where the packaged structure begins
+```
+
+The first objective is a real, provenance-tagged, correctness-gated number on two NVIDIA topologies — the contract is the spike's output, not its foundation.
+
+## Cluster reuse and capability inventory
+
+### What to reuse
+
+Existing self-hosted runner registrations; exact runner labels; Slurm access from runner hosts; checkout and artifact patterns; resource-cleanup strategy; repository secrets; container caches where appropriate. The runner inventory (`.github/configs/runners.yaml`) already enumerates H100, H200, B200, B300, GB200, GB300, MI300X, MI325X, MI355X fleets and groups such as `h200-multinode`, `b200-multinode`, individual nodes, etc. CollectiveX **reads** this file rather than duplicating runner names.
+
+### What not to reuse directly
+
+Do not call the serving launchers (`runners/launch_${RUNNER_NAME%%_*}.sh`) — they carry model-serving assumptions (model paths, framework setup, result naming). Mirror the **selection convention** with CollectiveX launchers instead:
+
+```bash
+bash experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh
+```
+
+Each CollectiveX launcher handles only: Slurm allocation; container image; mounts; network environment; rank launch; result copy-back; cleanup. There are **two launch paths**, mirroring the serving side: **single-node** B200 mirrors the `salloc … --gres=gpu:N --exclusive … && srun --container-image=<enroot squash>` pattern in `runners/launch_b200-dgxc.sh`; **multi-node** B200/GB200 drives **srt-slurm** (`srtctl apply -f <recipe>`), which already knows how to rebuild DeepEP and set the MNNVL env — so the CollectiveX GB200 launcher is a thin wrapper handing srt-slurm a CollectiveX recipe, not a from-scratch sbatch. (Later, common Slurm/container functions can be factored into a shared lib used by both systems.)
+
+> Runner-name subtlety to handle in `inventory.py`: one physical cluster can appear under multiple prefixes — `b200-dgxc_NN` routes to `launch_b200-dgxc.sh` (single-node) while `b200-dgxc-slurm_N` (label `b200-multinode`) routes to `launch_b200-dgxc-slurm.sh`. One fabric domain can therefore span several runner labels.
+
+### Capability overlay
+
+`inventory.py` loads `../../../.github/configs/runners.yaml` and combines it with a CollectiveX capability overlay — one source of truth for runner names, CollectiveX metadata kept isolated:
+
+```yaml
+b200-multinode:
+  launcher: b200-dgxc-slurm
+  vendor: nvidia
+  hardware: b200
+  topology-class: b200-nvlink-cx7
+  fabric-domain: b200-dgxc-main
+  gpus-per-node: 8
+  arch: x86_64
+  max-nodes: 16
+  scheduler: slurm
+  container-runtime: enroot-pyxis
+  capabilities:
+    nccl: true
+    deepep: true                # built at job setup via rebuild-deepep, not prebuilt
+    rdma: true
+    nvshmem: true
+    ibgda: experimental         # capability present ≠ currently validated
+  scheduling:
+    exclusive-nodes: true
+    max-parallel-shards: 1
+
+gb200:
+  launcher: gb200-nv
+  vendor: nvidia
+  hardware: gb200
+  topology-class: gb200-nvl72-mnnvl
+  gpus-per-node: 4              # NVL72 compute tray
+  arch: aarch64                 # nccl-tests + DeepEP must build for aarch64
+  scheduler: srt-slurm
+  transport-env: { NCCL_CUMEM_ENABLE: 1, NCCL_MNNVL_ENABLE: 1, MC_FORCE_MNNVL: 1 }
+  capabilities:
+    nccl: true
+    deepep: true                # rebuilt at setup; aarch64 path is a tracked risk
+    mnnvl: true                 # GB200/GB300 only
+    ibgda: experimental
+```
+
+`fabric-domain` is essential: two jobs on separate compute nodes may still contend for the same leaf/spine network, so **GitHub concurrency is keyed by fabric domain, not GPU SKU**. The inventory distinguishes hardware capability, software currently installed, and feature state (known-good vs experimental vs temporarily broken) — IBGDA support and "IBGDA low-latency currently validated" are different properties.
+
+**Operational coexistence with the serving sweep.** `b200-multinode` has only three runners (`b200-dgxc-slurm_7/8/9`), **shared with the production serving sweeps**, and srt-slurm allocations are long. Exclusive nodes + `max-parallel-shards: 1` + fabric-domain serialization together mean that CollectiveX and the serving sweep contend for the same scarce runners. Decide the scheduling/coexistence policy (off-hours windows? a dedicated runner?) before enabling any recurring CollectiveX suite, rather than discovering the contention in CI.
+
+## GitHub workflow design (Milestone 2)
+
+When cluster CI begins, add one small orchestration-only file — `.github/workflows/collectivex-experimental.yml` — with no benchmarking logic:
+
+```text
+validate → resolve matrix → preflight canaries → benchmark shards
+→ aggregate → compare against baseline → build static report → upload artifacts
+```
+
+Triggers while on the branch:
+
+```yaml
+on:
+  push:
+    branches: [ collectivex ]
+    paths:
+      - experimental/CollectiveX/**
+      - .github/workflows/collectivex-experimental.yml
+  pull_request:
+    paths:
+      - experimental/CollectiveX/**
+      - .github/workflows/collectivex-experimental.yml
+```
+
+Later, after a minimal dispatcher exists on `main`, add `workflow_dispatch` with inputs: `ref, suite, target, backend, shape, profile` (and comparison class / normal-LL-both / dry-run).
+
+Jobs:
+
+1. **Validate** — install the package; validate all suite/shape/backend/cluster YAML; confirm runner references exist in `runners.yaml`; reject unknown fields; emit the resolved run plan as an artifact. (Match InferenceX's strict Pydantic practice — models reject extra fields.)
+2. **Compile and shard** — **do not** generate one job per benchmark point. Group cases by `cluster, node count, GPU placement, container image, backend build, transport mode, fabric domain, profiler requirement`. A shard runs many compatible points under one Slurm allocation (avoids thousands of matrix jobs, repeated communicator init, queue latency, repeated container import). Bounded runtime; record per-case failures unless the cluster itself is unhealthy.
+3. **Preflight** — confirm GPU count; validate peer access; enumerate NICs; test RDMA/device visibility; verify backend libraries; run a tiny correctness case; capture topology/software. A failed preflight marks the whole shard `environment-invalid` rather than manufacturing dozens of backend failures.
+4. **Canary** — for each `(cluster, backend, mode)` group, run one small representative case; launch the larger matrix only after it passes (mirrors InferenceX's canary-before-full-sweep).
+5. **Benchmark** (`collectivex-benchmark-tmpl.yml`) — run on the resolved runner label; unique Slurm job name from workflow/attempt/shard; exclusive nodes; serialize/limit by `fabric-domain`; call the CollectiveX launcher; upload results even on partial failure; always upload environment+logs; fail the job only after artifact creation.
+6. **Aggregate and regress** — validate every result against JSON schema; reject duplicate natural keys; merge rank samples and summaries; compute trial aggregates; compare against the most recent compatible baseline; publish a step summary; upload one `results_collectivex` bundle.
+7. **Dispatch ingestion** (only once promoted to feed the app) — repository-dispatch the InferenceX-app repo with `{ "benchmark-family": "collectivex", "run-id": "...", "run-attempt": "..." }`.
+
+Use a separate `collectivex-changelog.yaml`: a CollectiveX backend change must not trigger the expensive serving sweep through `perf-changelog.yaml`, and a serving change must not launch every collective suite.
+
+## Regression policy (Milestone 2+)
+
+A compatible baseline requires exact matches on: case ID; cluster ID; topology fingerprint (or approved topology class); backend; comparison class; normal/LL mode; node and rank placement; dtype and shape; measurement-contract version. **Do not compare "same GPU SKU" across materially different fabrics.**
+
+```text
+regression if:
+  correctness changed pass → fail
+  OR median latency degradation exceeds max(fixed floor, cluster noise threshold)
+  OR bandwidth degradation exceeds max(fixed floor, cluster noise threshold)
+```
+
+Derive each cluster's noise threshold from repeated baseline measurements via median absolute deviation — don't hard-code a universal 3% before knowing each fabric's noise. Retain failed, timed-out, and invalid results; reliability is part of the benchmark.
+
+## Reporting, database, and frontend
+
+**Now (spike / Milestone 2): a static, artifact-driven report.** Do not begin by changing InferenceX-app.
+
+```bash
+python -m collectivex.report --results output/aggregate.json --output output/report/
+```
+
+```text
+report/
+├── index.html
+├── data.json
+├── assets/
+└── runs/
+    └── <case-id>.html
+```
+
+Report views: **Overview** (supported clusters/backends, latest run, correctness failures, recent regressions, coverage matrix); **Primitive explorer** (latency / algbw / busbw / rank-spread vs payload size; single-node vs multinode); **MoE explorer** (dispatch & combine latency vs tokens/rank; tokens/s vs EP size; uniform vs skewed; normal vs LL; comm-SMs vs performance); **Case details** (exact shape, backend commit, container digest, topology fingerprint, environment, command, correctness report, rank-level distribution, raw logs). A **comparison warning** must visibly reject invalid comparisons:
+
+```text
+Not directly comparable:
+- different routing distribution
+- different topology class
+- different communication-SM budget
+- standardized versus backend-optimized mode
+```
+
+**Later (Milestone 4 / promotion into InferenceX-app):** add `/collectives` to the app (Next.js, React Query, raw API rows, client-side transforms, D3 charts; tab metadata/routing are centralized). Avoid a single global "CollectiveX score" at launch. Port the report views, plus Library Comparison, Scale-and-topology, and Historical-regression views, and a run-detail drawer. The frontend computes the `comparison-key` and refuses to connect rows with differing keys by default — **this guard matters more than any individual chart.**
+
+API routes (app):
+
+```text
+/api/v1/collectives
+/api/v1/collectives/availability
+/api/v1/collectives/history
+/api/v1/collectives/runs/:id
+/api/v1/collectives/artifacts/:id
+```
+
+Continue the app convention: API returns raw DB rows; the frontend does chart-specific transforms.
+
+**Database (app, later).** Do not put CollectiveX rows in `benchmark_results` (its identity is serving configs + ISL/OSL/concurrency). Reuse `workflow_runs`, then add:
+
+```sql
+collective_workloads(id, case_id, schema_version, family, operation, shape jsonb)
+collective_environments(id, cluster_id, hardware, topology_class, topology_hash, software jsonb, capabilities jsonb)
+collective_configs(id, workload_id, environment_id, backend, backend_version, comparison_class, mode, nodes, gpus_per_node, world_size, settings jsonb)
+collective_results(id, workflow_run_id, config_id, trial, date, status, metrics jsonb,
+                   latency_p50_us, latency_p99_us, logical_bandwidth_gbps, bus_bandwidth_gbps,
+                   tokens_per_second, rank_skew_pct, error)
+collective_artifacts(result_id, artifact_type, storage_url, metadata jsonb)
+collective_availability(date, hardware, cluster_id, backend, family, operation, mode)
+```
+
+Follow the app's hybrid design (JSONB for evolving metrics; indexed "hot" columns for common filters; idempotent ingestion; natural unique keys; denormalized date; latest-results materialized view). Keep raw per-rank samples in artifacts/object storage, not in Postgres.
+
+## Future expansions
+
+The spike de-risks the path to the actual deliverable — a public OSS collective benchmark and an explainer article. Expansion axes, roughly near → far, with dependencies:
+
+**Hardware breadth.** B300 / GB300 next (GB300 is also MNNVL, with known disagg KV-transfer wins) → H100 / H200 as a cheaper, more-available **InfiniBand baseline** ideal for characterizing per-fabric noise → AMD MI300X / MI325X / MI355X (this is Milestone 1) → TPU (far; a separate stack and toolchain).
+
+**Backend breadth.** Framework-native EP (the `framework-integrated` class — ties numbers back to the SGLang/vLLM serving paths) → MSCCL++, NVSHMEM / rocSHMEM microbenchmarks, AITER comm/fusion, MoRI / Pollara (AMD).
+
+**IBGDA low-latency mode.** The recurring strategic blocker and the original "LL is broken" story; gated on the NVIDIA SRE maintenance window for B200/B300. Highest narrative value — add as an experimental suite the moment it unblocks.
+
+**Scale-out.** 2 → 4 → 8 → 16 nodes; on GB200, intra-NVL72 vs cross-rack scaling-efficiency curves (where MNNVL ends and the inter-rack fabric begins).
+
+**L3 integrated operator path.** route → permute → dispatch → grouped-GEMM → combine → unpermute — the bridge to OperatorX.
+
+**L4 e2e correlation.** Link an isolated dispatch/combine number to the same shape's cost inside a real serving run via `profile.yml` traces — the "explain serving performance" payoff and the tie-back to the core product.
+
+**Trace-derived shapes (Milestone 3).** DeepSeek V4 / MiniMax M3 / Kimi token-histogram and routing-skew extraction, so the synthetic shapes are anchored to real workloads.
+
+**AMD Ultra Ethernet (UEC).** The AMD networking path; pairs with the MoRI / Pollara backends.
+
+**Productization (north star).** Static report (Milestone 2) → public OSS benchmark site + the explainer article; promotion into InferenceX-app (`/collectives` + Postgres + nightly suite + regression alerts) is decided at Milestone 4.
+
+## Continuous benchmark — vision & scope
+
+Goal: a continuous benchmark that reproduces the spike automatically and grows into a credible cross-vendor EP/collective comparison. **Start with balanced DeepSeek shapes, intranode EP**, then venture to advanced cases. Target **≥1 EP library per platform** first — DeepEP on NVIDIA, MoRI on AMD.
+
+### EP library landscape
+- MoRI (AMD) — https://github.com/ROCm/mori
+- DeepEP / DeepEPv2 / Hybrid-EP — https://github.com/deepseek-ai/DeepEP (hybrid: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep)
+- NVIDIA NCCL EP — https://github.com/NVIDIA/nccl/tree/master/contrib/nccl_ep
+- UCCL — https://github.com/uccl-project/uccl
+- NVLink One-Sided AllToAll EP (mainly NVL72) — TensorRT-LLM blog18 (Optimizing MoE Communication with One-Sided AllToAll over NVLink)
+- NIXL EP — https://github.com/ai-dynamo/nixl/tree/main/examples/device/ep
+
+### Shapes & axes
+- **Classic DeepSeek V3:** hidden 7168, top-8, 256 routable experts.
+- **Prefill vs decode** (# tokens).
+- **Normal EP vs low-latency (LL) EP.**
+- **Dispatch precision:** NVFP4, MXFP4, MXFP8, BF16.
+- **Combine precision:** MXFP8, direct-cast FP8, BF16, NVFP4 — see MoRI #311, flashinfer #3643 / #3376.
+- **Balanced vs unbalanced vs EPLB.**
+- **Realistic shapes from InferenceX models** — collect hidden sizes / routing (Qwen3.5 has an unusual top-k).
+
+### Other inference collectives (later)
+- KV-cache transfer: MoRI-IO, NIXL, Mooncake; CPU↔GPU offload — `experimental/kvcache_transfer_DtoH_HtoD/benchmark.py`.
+- Low-latency one-shot / two-shot all-reduce (SGLang & vLLM in-tree kernels + AITER / FlashInfer variants) — e.g. sglang `sgl-kernel/csrc/allreduce/quick_all_reduce.cuh`.
+
+### Reference benchmark scripts to draw from
+- flashinfer PR #3000; ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`.
+
+### Learning resources
+- arXiv 2511.15076, 2603.13606, 2512.19849, 2412.19437.
+
+## Things not to do
+
+* Do not add collective fields to the existing serving matrix.
+* Do not make one GitHub Actions job per payload size.
+* Do not call all logical-bandwidth figures "bus bandwidth."
+* Do not compare different topology fingerprints as though GPU SKU were sufficient.
+* Do not silently discard failed or incorrect results.
+* Do not let a backend choose undocumented tuning parameters (in `standardized` mode).
+* Do not make low-latency mode the only reported result.
+* Do not publish one overall ranking before coverage and comparison contracts are stable.
+* Do not start with every EP library, TPU, UEC, and every model shape.
+* Do not store full raw rank samples indefinitely in Postgres.
+* Do not expose internal hostnames, paths, NIC GUIDs, IP addresses, or private image references in public artifacts.
+* Do not freeze the schema before the spike has produced a real artifact to freeze it from.
+
+## References (verified against the live InferenceX repo)
+
+- `experimental/README.md` — the non-core / "not official results" charter this project lives under.
+- `.github/configs/runners.yaml` — runner labels and exact names (H100…GB300, AMD MI3xx).
+- `.github/workflows/benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`, `profile.yml`, `speedbench-al.yml` — the `bash ./runners/launch_${RUNNER_NAME%%_*}.sh` selection convention.
+- `runners/launch_*.sh` — existing per-cluster launchers (`launch_b200-dgxc.sh`, `launch_b200-dgxc-slurm.sh`, `launch_gb200-nv.sh`, `launch_mi355x-amds.sh`, …).
+- `utils/matrix_logic/generate_sweep_configs.py`, `validation.py` — the serving matrix CollectiveX must **not** extend.
+- `.github/workflows/e2e-tests.yml`, `collect-results.yml` — the validate → fan-out → collect control plane being reused.
+- `perf-changelog.yaml` — the additions-only serving gate CollectiveX must **not** trigger.
+- NVIDIA Magnum IO NVSHMEM + GPUDirect Async (IBGDA): `https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/`
diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py
new file mode 100644
index 000000000..c24136ebc
--- /dev/null
+++ b/experimental/CollectiveX/plot.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — plot NCCL primitive curves, B200 vs GB200.
+
+Loads run_nccl.py result JSONs from results/, and for each operation draws two
+panels: latency-vs-size and bus-bandwidth-vs-size, overlaying one curve per
+(runner, topology-class, world-size). The B200(IB)-vs-GB200(MNNVL) contrast at
+a matched shape is the intended overlay and the spike's headline.
+
+Comparison guard (plan §Comparability): curves are only overlaid when they
+share op + dtype + comparison-class + measurement-contract. Anything else is
+reported as "not directly comparable" and skipped rather than silently mixed.
+
+    python plot.py --results-dir results --out-dir results/plots
+
+Requires matplotlib only. Run on a workstation/laptop over the JSON
+artifacts; no GPU needed.
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+from collections import defaultdict
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+
+def _human(nbytes: int) -> str:
+    """Format a byte count as a whole number of B, KiB, MiB or GiB (1024-based)."""
+    units = ("B", "KiB", "MiB", "GiB")
+    while len(units) > 1 and not nbytes < 1024:  # GiB is the ceiling; never step past it
+        nbytes, units = nbytes / 1024, units[1:]
+    return f"{nbytes:.0f}{units[0]}"
+
+
+def load_nccl_results(results_dir: str) -> list[dict]:
+    """Load family == "nccl" result dicts with rows from results_dir/*.json, skipping unusable files."""
+    docs = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
+        try:
+            with open(path) as _f:
+                d = json.load(_f)
+        except (ValueError, OSError):  # malformed JSON, undecodable bytes, or unreadable file
+            continue
+        if isinstance(d, dict) and d.get("family") == "nccl" and d.get("rows"):
+            docs.append({**d, "_path": path})  # remember which file each curve came from
+    return docs
+
+
+def curve_label(d: dict) -> str:  # legend text: "<runner> · <topology-class> · ws<world-size>"
+    return "{} · {} · ws{}".format(d["runner"], d["topology_class"], d["world_size"])
+
+
+def overlay_signature(d: dict) -> tuple:
+    """Key a result by what two curves must share to sit on one chart; topology and
+    world-size are left out on purpose because they are the axis being compared."""
+    return (d["op"], *map(d.get, ("dtype", "comparison_class", "measurement_contract")))
+
+
+def plot_op(op: str, docs: list[dict], out_dir: str) -> str | None:
+    """Plot latency and bus bandwidth against message size for one op as a two-panel PNG.
+
+    Only the most common overlay signature in ``docs`` is drawn (the first one seen wins
+    a tie); every other document is named in a "[guard]" message on stdout and left out.
+    Returns the path of the written PNG, or None when ``docs`` is empty.
+    """
+    if not docs:
+        return None
+    groups = defaultdict(list)
+    for doc in docs:
+        groups[overlay_signature(doc)].append(doc)
+    main_sig = max(groups, key=lambda sig: len(groups[sig]))
+    for sig, members in groups.items():
+        for doc in () if sig == main_sig else members:
+            print(f"  [guard] skipping {curve_label(doc)} for op={op}: not directly "
+                  f"comparable (dtype/class/contract differs: {sig} vs {main_sig})")
+
+    fig, (ax_lat, ax_bw) = plt.subplots(1, 2, figsize=(14, 5))
+    for doc in sorted(groups[main_sig], key=curve_label):
+        points = sorted(doc["rows"], key=lambda row: row["size_bytes"])
+        sizes = [row["size_bytes"] for row in points]
+        lat = [row["out_of_place"]["time_us"] for row in points]
+        bw = [row["busbw_gbps"] for row in points]
+        style = dict(linewidth=2, markersize=4, label=curve_label(doc))
+        ax_lat.plot(sizes, lat, "o-", **style)
+        ax_bw.plot(sizes, bw, "o-", **style)
+
+    panels = ((ax_lat, "Latency (µs, out-of-place)", f"{op}: latency vs size"),
+              (ax_bw, "Bus bandwidth (GB/s)", f"{op}: bus bandwidth vs size"))
+    for ax, ylabel, title in panels:
+        ax.set_xscale("log", base=2)
+        ax.set_xlabel("Message size (bytes)")
+        ax.set_ylabel(ylabel)
+        ax.set_title(title)
+        ax.grid(True, alpha=0.3)
+        ax.legend(fontsize=9)
+    ax_lat.set_yscale("log")  # only the latency panel gets a log y-axis
+    _, dtype, klass, _ = main_sig
+    fig.suptitle(f"CollectiveX · {op} · dtype={dtype} · class={klass}  "
+                 f"(topology is the comparison axis)", fontsize=11)
+    fig.tight_layout()
+    os.makedirs(out_dir, exist_ok=True)
+    out = os.path.join(out_dir, f"nccl_{op}.png")
+    fig.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close(fig)
+    return out
+
+
+def main() -> int:
+    """CLI entry point: plot each op in --results-dir (or only --op); 0 if a PNG was written, else 1."""
+    ap = argparse.ArgumentParser(description="CollectiveX primitive plots")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--out-dir", default="results/plots")
+    ap.add_argument("--op", help="only plot this op")
+    args = ap.parse_args()
+    # A result without "op" cannot be grouped; drop it rather than die on a KeyError.
+    docs = [d for d in load_nccl_results(args.results_dir) if "op" in d]
+    if not docs:
+        print(f"no nccl result JSONs found in {args.results_dir}/")
+        return 1
+    by_op = defaultdict(list)
+    for d in docs:
+        by_op[d["op"]].append(d)
+    made = []
+    for op in ([args.op] if args.op else sorted(by_op)):
+        out = plot_op(op, by_op.get(op, []), args.out_dir)
+        if out:
+            made.append(out)
+            # plot_op draws only the largest same-signature group, so report that size
+            # instead of counting the curves its comparison guard skipped.
+            sigs = [overlay_signature(d) for d in by_op[op]]
+            print(f"wrote {out}  ({max(map(sigs.count, sigs))} curve(s))")
+    if not made:
+        print("nothing plotted")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/experimental/CollectiveX/plot_ep.py b/experimental/CollectiveX/plot_ep.py
new file mode 100644
index 000000000..403775a9d
--- /dev/null
+++ b/experimental/CollectiveX/plot_ep.py
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+"""CollectiveX — render EP dispatch/combine sweeps to a self-contained HTML.
+
+Reads the family=moe result JSONs (tests/run_ep.py output) and emits ONE
+dependency-free HTML file (inline SVG, no CDN — opens offline) with:
+
+  * an interactive explorer: operation (dispatch | combine | round-trip) x
+    phase (decode | prefill) x x-axis (tokens/rank | global tokens) x y-axis
+    (latency | tokens/s | alg bandwidth), one colored line per SKU/backend/EP;
+  * a static small-multiples grid (phase x operation) of latency vs tokens/rank.
+
+Only source-tokens-per-rank varies along a line; everything else (backend, EP
+degree, phase, precision, top-k/experts/hidden, routing) is fixed and identifies
+the line — per the CollectiveX EP framework.
+
+    python3 plot_ep.py --results-dir results --out results/plots/collectivex_ep.html
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+
+# SKU -> color (matches the matplotlib convention used for the NCCL plots). Provisional: load_series() reassigns it.
+COLORS = {"b200": "#1f77b4", "gb200": "#2ca02c", "mi355x": "#d62728",
+          "b300": "#9467bd", "gb300": "#8c564b", "h100": "#ff7f0e", "h200": "#e377c2"}
+
+# Per-SKU color FAMILIES: each config key (sku|backend|dtype|mode|resource|contract|ep|routing)
+# takes the next shade of its SKU's hue family, so same-SKU lines differ while the SKU stays
+# readable at a glance. Shades are picked modulo the family size, so they repeat once exhausted.
+SKU_FAMILY = {
+    "h100":  ["#ff7f0e", "#d6a72b", "#ffbb78", "#8c6d1f", "#e8a33d"],  # oranges / golds
+    "h200":  ["#e377c2", "#b04a8f", "#f4b6df"],                        # pinks
+    "b200":  ["#1f77b4", "#0d3d66", "#4a90d9", "#7fb2e0"],             # blues
+    "b300":  ["#9467bd", "#6b3fa0", "#c5b0d5", "#7b4fa0"],             # purples
+    "gb200": ["#2ca02c", "#1a661a", "#7bc77b"],                        # greens
+    "gb300": ["#8c564b", "#5e372f", "#c49c94"],                        # browns
+    "mi355x": ["#d62728", "#a30000", "#ff9896", "#e34a4a"],            # reds
+}
+PALETTE = ["#17becf", "#bcbd22", "#7f7f7f", "#393b79", "#637939"]      # fallback for unknown SKUs
+
+
+def load_series(results_dir: str, legacy: str = "all") -> list[dict]:
+    series = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "**", "*.json"), recursive=True)):
+        try:
+            d = json.load(open(path))
+        except (json.JSONDecodeError, OSError):
+            continue
+        if d.get("family") != "moe" or not d.get("rows"):
+            continue
+        # legacy = a v3 doc with no machine-derived publication_status. exclude -> v4-only main
+        # plot; only -> the legacy.html archive.
+        is_legacy = "publication_status" not in d
+        if (legacy == "exclude" and is_legacy) or (legacy == "only" and not is_legacy):
+            continue
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        rows = []
+        for r in d["rows"]:
+            # v4 carries nested {p50,p90,p95,p99} dicts for dispatch/combine/roundtrip/isolated_sum.
+            # Fall back to v3 flat *_us_p* (serial -> isolated_sum) so legacy docs still load.
+            def pcts(k, flat):
+                if isinstance(r.get(k), dict) and r[k].get("p50") is not None:
+                    o = dict(r[k]); o.setdefault("p95", o.get("p90"))
+                    return o
+                p50 = r.get(f"{flat}_us_p50")
+                return {"p50": p50, "p90": r.get(f"{flat}_us_p90") or p50,
+                        "p95": r.get(f"{flat}_us_p95") or r.get(f"{flat}_us_p90") or p50,
+                        "p99": r.get(f"{flat}_us_p99") or p50}
+            dop, cop = pcts("dispatch", "dispatch"), pcts("combine", "combine")
+            iso = pcts("isolated_sum", "serial")                       # renamed from "serial"
+            rtp = pcts("roundtrip", "roundtrip")                       # MEASURED round trip (v4)
+            if not (dop["p50"] and cop["p50"]):
+                continue
+            if rtp["p50"] is None:                                     # legacy: no measured RT
+                rtp = iso
+            rows.append({
+                "t": r["tokens_per_rank"], "gt": r.get("global_tokens"),
+                "dispatch": dop, "combine": cop, "roundtrip": rtp, "isolated_sum": iso,
+                "fanout": r.get("fanout_mean"),
+                "dbytes": r.get("dispatch_logical_bytes") or r.get("routed_bytes_total") or 0,
+                "cbytes": r.get("combine_logical_bytes") or 0,
+                "recv": r.get("recv_tokens_max") or r.get("recv_tokens") or 0,
+                "straggler": (r.get("per_rank_dispatch_us") or {}).get("slowest_rank"),
+                "correct": bool(r.get("correct")),
+            })
+        if not rows:
+            continue
+        sh = d.get("shape", {})
+        mode = d.get("mode", "normal")
+        dtype = sh.get("dispatch_dtype", "?")
+        rmode = d.get("resource_mode", "")
+        ll = " LL" if mode == "ll" else ""
+        # resource suffix: tuned is the default (omit); flag the others so a normalized
+        # or default-budget line is never confused with the tuned one.
+        rs = {"normalized": " (norm)", "default": " (def)"}.get(rmode, "")
+        contract = d.get("measurement_contract", "?")
+        cl = " [cl]" if contract == "cached-layout-comm-only-v1" else ""   # cached-layout flag
+        backend = d.get("backend")
+        ep = d.get("ep_size")
+        # Routing axis: base distribution + EPLB. "zipf+eplb" is the balanced-by-replication
+        # variant of zipf; uniform is the baseline (omitted from the label to keep it short).
+        eplb_doc = d.get("eplb") or {}
+        routing_disp = f'{sh.get("routing", "?")}+eplb' if eplb_doc.get("enabled") else sh.get("routing", "?")
+        rt = "" if routing_disp == "uniform" else f' ·{routing_disp}'
+        # FULL per-line label: SKU·EP·backend·dtype[·LL][·resource][·cached-layout][·routing].
+        # EP is explicit because a SKU can span EP degrees (GB300 EP4 on one NVL72 tray, EP8
+        # across two); routing is explicit so balanced/zipf/zipf+eplb don't collide with uniform.
+        label = f'{sku.upper()} EP{ep} · {backend} · {dtype}{ll}{rs}{cl}{rt}'
+        repro = d.get("reproduction", {})
+        gr = repro.get("git_run") or {}
+        rid = d.get("routing_identity", {})
+        wl = d.get("workload") or {}
+        # publication status (v4) gates the default view; legacy v3 docs -> "legacy".
+        pub = d.get("publication_status") or "legacy"
+        # workload signature: prefer the v4 workload block, fall back to routing_identity (v3).
+        wsig = wl.get("trace_signature") or rid.get("trace_signature")
+        series.append({
+            "sku": sku, "backend": backend, "ep": ep,
+            "pub": pub, "wsig": wsig, "wid": wl.get("workload_id"),
+            "phase": d.get("phase", "decode"), "mode": mode,
+            "dtype": dtype, "resource": rmode or "tuned", "contract": contract,
+            # comparison class: best-stack (tuned/default) vs resource-constrained
+            # (normalized) — kept distinct so they're never read as one fair contest.
+            "suite": "resource-constrained" if rmode == "normalized" else "backend-default",
+            "routing": routing_disp,
+            # eplb per-rank load imbalance removed (the headline of zipf vs zipf+eplb).
+            "eplb_before": eplb_doc.get("imbalance_before"), "eplb_after": eplb_doc.get("imbalance_after"),
+            # ep + routing in the key so EP4/EP8 and uniform/balanced/zipf/zipf+eplb of one SKU
+            # get distinct colors/lines (sku stays ckey.split("|")[0] for the family lookup).
+            "ckey": f"{sku}|{backend}|{dtype}|{mode}|{rmode}|{contract}|ep{ep}|{routing_disp}",  # config identity (color)
+            "label": label,
+            "dash": "" if dtype == "bf16" else "6 4",   # bf16 solid, fp8 dashed (2nd cue)
+            "color": COLORS.get(sku, "#555"),           # provisional; reassigned below
+            "topo": d.get("topology_class"), "transport": d.get("transport"),
+            "fp8_in_timing": repro.get("fp8_quant_in_timing"),
+            "run_id": gr.get("run_id"), "source_sha": (gr.get("source_sha") or "")[:10],
+            "repo": gr.get("repo"), "image_digest": (repro.get("image_digest") or "")[:19],
+            "routing_consistent": rid.get("consistent_across_ranks"),
+            "trace_sig": rid.get("trace_signature"),
+            "samples": (rows and d["rows"][0].get("samples_pooled")) or None,
+            "prov": d.get("backend_provenance", {}),
+            "shape": sh, "rows": rows,
+        })
+    # NOTE (goal Part 1, "plot/artifact integrity"): raw series are IMMUTABLE after loading.
+    # An earlier version injected each config's decode-range points into its prefill series so
+    # prefill panels spanned the full token axis — that COPIED observations between series and
+    # is removed. Each phase now plots only its own measured points; the x-axis simply spans
+    # whatever a series measured. (A shaded decode/prefill regime is the cosmetic alternative.)
+
+    # Assign a DISTINCT color per config key, grouped by SKU family (stable across the
+    # decode/prefill panels so a line keeps its color everywhere).
+    by_sku: dict[str, list[str]] = {}
+    for ck in sorted({s["ckey"] for s in series}):
+        by_sku.setdefault(ck.split("|")[0], []).append(ck)
+    ckcolor: dict[str, str] = {}
+    fb = 0
+    for sku, cks in by_sku.items():
+        fam = SKU_FAMILY.get(sku)
+        for j, ck in enumerate(cks):
+            if fam:
+                ckcolor[ck] = fam[j % len(fam)]
+            else:
+                ckcolor[ck] = PALETTE[fb % len(PALETTE)]; fb += 1
+    for s in series:
+        s["color"] = ckcolor[s["ckey"]]
+    return series
+
+
+HEAD = """<!doctype html><html lang="en"><head><meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>CollectiveX — EP dispatch / combine</title>
+<style>
+:root{--bg:#0f1115;--panel:#171a21;--ink:#e6e9ef;--mut:#9aa4b2;--line:#2a2f3a;--accent:#5b8def}
+*{box-sizing:border-box}
+body{margin:0;background:var(--bg);color:var(--ink);font:14px/1.45 -apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,Helvetica,Arial,sans-serif}
+.wrap{max-width:1080px;margin:0 auto;padding:24px 18px 64px}
+h1{font-size:20px;margin:0 0 4px} h2{font-size:15px;color:var(--mut);font-weight:600;margin:28px 0 10px;border-bottom:1px solid var(--line);padding-bottom:6px}
+.sub{color:var(--mut);font-size:12.5px;margin:0 0 18px}
+.controls{display:flex;flex-wrap:wrap;gap:14px;background:var(--panel);border:1px solid var(--line);border-radius:10px;padding:12px 14px;margin-bottom:14px}
+.grp{display:flex;flex-direction:column;gap:5px}
+.grp .lab{font-size:11px;letter-spacing:.04em;text-transform:uppercase;color:var(--mut)}
+.seg{display:inline-flex;border:1px solid var(--line);border-radius:8px;overflow:hidden}
+.seg button{background:transparent;color:var(--mut);border:0;padding:6px 11px;font-size:12.5px;cursor:pointer}
+.seg button:hover{color:var(--ink)}
+.seg button.on{background:var(--accent);color:#fff}
+.card{background:var(--panel);border:1px solid var(--line);border-radius:10px;padding:10px}
+.legend{display:flex;flex-wrap:wrap;gap:16px;margin:6px 2px 0;color:var(--mut);font-size:12.5px}
+.guard{background:#3a2a14;border:1px solid #6b4f1f;color:#f0c674;border-radius:6px;padding:6px 10px;margin:6px 2px;font-size:12px}
+table.cov{border-collapse:collapse;font-size:12px;width:100%;margin:4px 0 18px}
+table.cov th,table.cov td{border:1px solid var(--line);padding:3px 8px;text-align:left}
+table.cov th{color:var(--mut)}
+.badge{color:#0f1115;border-radius:4px;padding:1px 6px;font-size:11px;font-weight:600}
+.legend .it{display:flex;align-items:center;gap:7px}
+.legend .sw{width:22px;height:3px;border-radius:2px;display:inline-block}
+.grid{display:grid;grid-template-columns:repeat(3,1fr);gap:12px}
+.gtit{font-size:12.5px;color:var(--ink);margin:0 0 2px;font-weight:600}
+.note{color:var(--mut);font-size:12px;margin-top:10px}
+svg{display:block;width:100%;height:auto}
+.ax{stroke:var(--line);stroke-width:1}.gl{stroke:var(--line);stroke-width:1;opacity:.45}
+.tk{fill:var(--mut);font-size:11px}.axl{fill:var(--mut);font-size:11.5px}
+.ttl{fill:var(--ink);font-size:13px;font-weight:600}
+circle.pt{stroke:#0f1115;stroke-width:1}
+@media(max-width:760px){.grid{grid-template-columns:1fr}}
+</style></head><body><div class="wrap">
+<h1>CollectiveX — EP dispatch / combine</h1>
+<p class="sub" id="prov"></p>
+"""
+
+TAIL = "</div></body></html>"
+
+JS = r"""
+const SKUS = [...new Set(DATA.map(s=>s.sku))];
+// roundtrip = INDEPENDENTLY MEASURED chained latency (v4). isolated_sum = Σ of isolated
+// dispatch+combine percentiles — NOT a measured op (no throughput/SLO use). serial(v3)->isolated_sum.
+const OPS = {dispatch:"Dispatch", combine:"Combine", roundtrip:"Round trip (measured)", isolated_sum:"Isolated sum (Σp, not measured)"};
+// NOT algorithmic/bus bandwidth: logical routed payload (recv copies x hidden x dtype)
+// over latency; dispatch & combine count their OWN bytes. Excludes scales/idx/meta/padding.
+const YK  = {lat:"Latency (µs)", tps:"Tokens / s", bw:"Logical routed payload rate (GB/s)"};
+const XK  = {t:"Source tokens / rank", gt:"Global source tokens"};
+const PCT = {p50:"p50", p90:"p90", p99:"p99"};
+const SUITE = {all:"All", "backend-default":"Backend-default", "resource-constrained":"Resource-constrained"};
+// Routing distributions present in the data (+ "all"): uniform (baseline) / balanced /
+// zipf (skewed) / zipf+eplb (skew rebalanced by EPLB replication). Default to uniform so the
+// initial view matches the headline sweep; switch to compare zipf vs zipf+eplb.
+const ROUTING = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.routing))].sort().forEach(r=>{o[r]=r;}); return o; })();
+// Prefill panels show only the real large-T prefill range. MoRI ramps its prefill sweep from 1
+// (cold-jump wedge) and records decode-scale points; the intended prefill floor is the DeepEP
+// prefill ladder min. So every SKU's prefill panel starts there — the sub-floor MoRI points are
+// ramp-warmup (same kernel as decode) and live in the decode panel, not fabricated/duplicated here.
+const _dpf = DATA.filter(s=>s.phase==="prefill"&&s.backend==="deepep").flatMap(s=>s.rows.map(r=>r.t));
+const PREFILL_MIN = _dpf.length? Math.min(..._dpf) : 128;
+// Publication-status filter (goal P1): default hides diagnostic/invalid/failed so the first
+// view is publication-valid; "publishable" = official + comparable-experimental + legacy v3.
+const PUB = {publishable:"Publishable", official:"Official only", all:"All (incl. diagnostic)"};
+function pubOk(s){ return ST.pub==="all" || (ST.pub==="official" ? s.pub==="official"
+                   : !["diagnostic","invalid","failed"].includes(s.pub)); }
+// Default to ONE suite (not all) + publishable results (goal P1).
+const ST  = {op:"dispatch", phase:"decode", x:"t", y:"lat", xlog:true, ylog:true, pct:"p50",
+             suite:"backend-default", routing:"uniform", pub:"publishable"};
+
+function xval(r,xk){ return xk==="t"? r.t : r.gt; }
+function metric(r,op,yk,pct){
+  const us=(r[op] && r[op][pct]!=null)? r[op][pct] : (r[op]? r[op].p50 : 0);
+  if(yk==="lat") return us;
+  if(yk==="tps") return r.gt/(us*1e-6);
+  const b = op==="dispatch"? r.dbytes : op==="combine"? r.cbytes : (r.dbytes + r.cbytes);
+  return us>0 ? b/(us*1e3) : 0;   // logical routed payload rate (GB/s), per-op bytes
+}
+function fmt(v){
+  if(v>=1e9) return (v/1e9).toFixed(v<1e10?2:0)+"G";
+  if(v>=1e6) return (v/1e6).toFixed(v<1e7?2:0)+"M";
+  if(v>=1e3) return (v/1e3).toFixed(v<1e4?1:0)+"k";
+  if(v>=10)  return v.toFixed(0);
+  if(v>=1)   return v.toFixed(v<3?1:0);
+  return v.toFixed(2);
+}
+function logTicks(mn,mx){
+  const t=[]; let e=Math.floor(Math.log10(mn));
+  for(;Math.pow(10,e)<=mx*1.0001;e++) for(const m of [1,2,5]){const v=m*Math.pow(10,e); if(v>=mn*0.999&&v<=mx*1.001)t.push(v);}
+  return t.length?t:[mn,mx];
+}
+function linTicks(mn,mx){
+  const span=mx-mn||1, step=Math.pow(10,Math.floor(Math.log10(span))); const t=[];
+  let s=step; if(span/step>6)s=step*2; if(span/step<3)s=step/2;
+  for(let v=Math.ceil(mn/s)*s; v<=mx*1.0001; v+=s) t.push(+v.toFixed(6));
+  return t.length?t:[mn,mx];
+}
+const mapLog=(v,a,b,p,q)=>p+(Math.log(v)-Math.log(a))/(Math.log(b)-Math.log(a))*(q-p);
+const mapLin=(v,a,b,p,q)=>p+(v-a)/(b-a)*(q-p);
+
+// Build one SVG chart. opts: {op,phase,x,y,ylog,title,legend,w,h}
+function chart(o){
+  const W=o.w||900, H=o.h||520, m={l:64,r:16,t:34,b:46};
+  const pct=o.pct||"p99", suite=o.suite||"all", routing=o.routing||"all";
+  const sl = DATA.filter(s=>s.phase===o.phase && (o.ep==null || s.ep===o.ep)
+                            && (suite==="all" || s.suite===suite)
+                            && (routing==="all" || s.routing===routing) && pubOk(s));
+  const pts = sl.map(s=>({s, P:s.rows.map(r=>({x:xval(r,o.x), y:metric(r,o.op,o.y,pct), r}))
+                                     .filter(p=>p.x>0 && (o.ylog? p.y>0 : p.y>=0)
+                                                && (o.phase!=="prefill" || p.r.t>=PREFILL_MIN))}));
+  let xs=[], ys=[]; pts.forEach(g=>g.P.forEach(p=>{xs.push(p.x);ys.push(p.y);}));
+  if(!xs.length) return '<svg viewBox="0 0 '+W+' '+H+'"><text x="'+(W/2)+'" y="'+(H/2)+'" class="axl" text-anchor="middle">no data</text></svg>';
+  const xmn=Math.min(...xs), xmx=Math.max(...xs);
+  let ymn=Math.min(...ys), ymx=Math.max(...ys);
+  if(o.ylog){ ymn=Math.min(...ys.filter(v=>v>0)); } else { ymn=Math.min(0,ymn); }
+  if(ymx===ymn) ymx=ymn+1;
+  const X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t;
+  const xlog = o.xlog!==false;                              // x defaults to log (geometric sweep)
+  const xv=v=>xlog?mapLog(v,xmn,xmx,X0,X1):mapLin(v,xmn,xmx,X0,X1);
+  const yv=v=>o.ylog?mapLog(Math.max(v,ymn),ymn,ymx,Y0,Y1):mapLin(v,ymn,ymx,Y0,Y1);
+  let s='<svg viewBox="0 0 '+W+' '+H+'" role="img">';
+  s+='<text x="'+X0+'" y="20" class="ttl">'+o.title+'</text>';
+  // y grid + ticks
+  const yt=o.ylog?logTicks(ymn,ymx):linTicks(ymn,ymx);
+  yt.forEach(v=>{const y=yv(v); s+='<line class="gl" x1="'+X0+'" y1="'+y+'" x2="'+X1+'" y2="'+y+'"/>'+
+    '<text class="tk" x="'+(X0-7)+'" y="'+(y+3.5)+'" text-anchor="end">'+fmt(v)+'</text>';});
+  // x grid + ticks (label the actual sweep points)
+  const xt=[...new Set(xs)].sort((a,b)=>a-b);
+  xt.forEach(v=>{const x=xv(v); s+='<line class="gl" x1="'+x+'" y1="'+Y0+'" x2="'+x+'" y2="'+Y1+'"/>'+
+    '<text class="tk" x="'+x+'" y="'+(Y0+16)+'" text-anchor="middle">'+fmt(v)+'</text>';});
+  // axes
+  s+='<line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X1+'" y2="'+Y0+'"/><line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X0+'" y2="'+Y1+'"/>';
+  s+='<text class="axl" x="'+((X0+X1)/2)+'" y="'+(H-6)+'" text-anchor="middle">'+XK[o.x]+(xlog?'  (log)':'')+'</text>';
+  s+='<text class="axl" transform="translate(15,'+((Y0+Y1)/2)+') rotate(-90)" text-anchor="middle">'+YK[o.y]+(o.ylog?'  (log)':'')+'</text>';
+  // lines + points
+  pts.forEach(g=>{ if(!g.P.length) return;
+    const d=g.P.map((p,i)=>(i?'L':'M')+xv(p.x).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' ');
+    const dash=g.s.dash?' stroke-dasharray="'+g.s.dash+'"':'';
+    s+='<path d="'+d+'" fill="none" stroke="'+g.s.color+'" stroke-width="2"'+dash+'/>';
+    g.P.forEach(p=>{ const D=p.r.dispatch, C=p.r.combine, R=p.r.roundtrip;
+      // artifact links (goal P1): the workflow run + source SHA + image digest + workload id
+      // that produced this point. (Result JSON / manifest / raw-samples live alongside by name.)
+      const run=g.s.run_id? ('\nrun '+g.s.run_id+(g.s.source_sha?' @'+g.s.source_sha:'')) : '';
+      const art='\nworkload='+(g.s.wid||g.s.wsig||'?')+(g.s.image_digest?'  ·  image '+g.s.image_digest:'')
+                +(g.s.repo?'  ·  '+g.s.repo:'');
+      s+='<circle class="pt" cx="'+xv(p.x).toFixed(1)+'" cy="'+yv(p.y).toFixed(1)+'" r="3.2" fill="'+g.s.color+'">'+
+      '<title>'+g.s.label+'  ['+pct+']  ('+g.s.pub+')'+
+      '\nT/rank='+p.r.t+'  ·  global='+p.r.gt+
+      '\n'+YK[o.y]+' = '+fmt(p.y)+(o.y==='lat'?' µs':o.y==='bw'?' GB/s':'')+
+      '\ndispatch  µs p50/p90/p99 = '+D.p50.toFixed(1)+'/'+D.p90.toFixed(1)+'/'+D.p99.toFixed(1)+
+      '\ncombine   µs p50/p90/p99 = '+C.p50.toFixed(1)+'/'+C.p90.toFixed(1)+'/'+C.p99.toFixed(1)+
+      '\nroundtrip µs p50/p90/p99 = '+R.p50.toFixed(1)+'/'+R.p90.toFixed(1)+'/'+R.p99.toFixed(1)+' (measured)'+
+      '\nfan-out='+(p.r.fanout!=null?p.r.fanout.toFixed(2):'?')+'  ·  recv(max)='+p.r.recv
+      +(p.r.straggler!=null?'  ·  straggler=r'+p.r.straggler:'')+(p.r.correct?'':'  ✗')+
+      '\ncontract='+g.s.contract+'  ·  suite='+g.s.suite+run+art+
+      '</title></circle>'; });
+  });
+  s+='</svg>'; return s;
+}
+// Comparison guard (goal P1): flag when overlaid lines are NOT a direct comparison —
+// differing topology at one EP, or differing realized workload signature within one routing.
+function guardNote(vis){
+  if(!vis.length) return '';
+  const w=[];
+  const topos=[...new Set(vis.map(s=>s.topo).filter(Boolean))];
+  if(topos.length>1) w.push('mixed topology ('+topos.join(', ')+')');
+  const byRt={}; vis.forEach(s=>{ (byRt[s.routing]=byRt[s.routing]||new Set()).add(s.wsig||'?'); });
+  const split=Object.entries(byRt).filter(([k,v])=>v.size>1).map(([k])=>k);
+  if(split.length) w.push('different workload trace within routing ['+split.join(',')+'] — NOT identical workloads');
+  const eps=[...new Set(vis.map(s=>s.ep))];
+  if(eps.length>1) w.push('mixed EP degree '+eps.join('/')+' — compare only on the global-tokens x-axis');
+  return w.length? '<div class="guard">⚠ not a direct comparison: '+w.join('; ')+'</div>' : '';
+}
+function legend(phase, ep, suite, routing){
+  return '<div class="legend">'+DATA.filter(s=>s.phase===phase && (ep==null||s.ep===ep)
+                                              && (!suite||suite==="all"||s.suite===suite)
+                                              && (!routing||routing==="all"||s.routing===routing) && pubOk(s)).map(s=>{
+    const sw = s.dash ? 'background:repeating-linear-gradient(90deg,'+s.color+' 0 5px,transparent 5px 9px)'
+                      : 'background:'+s.color;   // dashed swatch = fp8 (matches the line)
+    return '<span class="it"><span class="sw" style="'+sw+'"></span>'+s.label+'</span>';
+  }).join('')+'</div>';
+}
+function seg(name,opts,cur){
+  return '<div class="seg">'+Object.entries(opts).map(([k,v])=>
+    '<button data-grp="'+name+'" data-val="'+k+'" class="'+(k===cur?'on':'')+'">'+v+'</button>').join('')+'</div>';
+}
+function renderControls(){
+  document.getElementById('controls').innerHTML =
+    '<div class="grp"><span class="lab">Operation</span>'+seg('op',OPS,ST.op)+'</div>'+
+    '<div class="grp"><span class="lab">Phase</span>'+seg('phase',{decode:"Decode",prefill:"Prefill"},ST.phase)+'</div>'+
+    '<div class="grp"><span class="lab">Percentile</span>'+seg('pct',PCT,ST.pct)+'</div>'+
+    '<div class="grp"><span class="lab">Suite</span>'+seg('suite',SUITE,ST.suite)+'</div>'+
+    '<div class="grp"><span class="lab">Routing</span>'+seg('routing',ROUTING,ST.routing)+'</div>'+
+    '<div class="grp"><span class="lab">Publication</span>'+seg('pub',PUB,ST.pub)+'</div>'+
+    '<div class="grp"><span class="lab">X-axis</span>'+seg('x',XK,ST.x)+'</div>'+
+    '<div class="grp"><span class="lab">X scale</span>'+seg('xlog',{true:"Log",false:"Linear"},String(ST.xlog))+'</div>'+
+    '<div class="grp"><span class="lab">Y-axis</span>'+seg('y',YK,ST.y)+'</div>'+
+    '<div class="grp"><span class="lab">Y scale</span>'+seg('ylog',{true:"Log",false:"Linear"},String(ST.ylog))+'</div>';
+  document.querySelectorAll('#controls button').forEach(b=>b.onclick=()=>{
+    const g=b.dataset.grp, v=b.dataset.val; ST[g]= (g==='ylog'||g==='xlog')? v==='true' : v;
+    renderControls(); renderMain(); renderGrid(); });  // grid also reflects pct/suite/scale toggles
+}
+function renderMain(){
+  document.getElementById('chart').innerHTML = chart({op:ST.op,phase:ST.phase,x:ST.x,y:ST.y,xlog:ST.xlog,ylog:ST.ylog,
+    pct:ST.pct, suite:ST.suite, routing:ST.routing,
+    title:OPS[ST.op]+' — '+ST.phase+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+' ('+YK[ST.y].toLowerCase()+' vs '+XK[ST.x].toLowerCase()+')'});
+  const vis=DATA.filter(s=>s.phase===ST.phase && (ST.suite==="all"||s.suite===ST.suite)
+                           && (ST.routing==="all"||s.routing===ST.routing) && pubOk(s));
+  document.getElementById('mlegend').innerHTML = guardNote(vis)+legend(ST.phase, null, ST.suite, ST.routing);
+}
+function renderGrid(){
+  // SEPARATE panels per (phase, EP degree); within a panel, the SUITE selector keeps
+  // backend-default and resource-constrained lines from being read as one fair contest.
+  const phases=[...new Set(DATA.map(s=>s.phase))].sort();
+  const eps=[...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b);
+  let h='';
+  phases.forEach(ph=>{ eps.forEach(ep=>{
+    const panelVis=DATA.filter(s=>s.phase===ph && s.ep===ep && (ST.suite==="all"||s.suite===ST.suite)
+                     && (ST.routing==="all"||s.routing===ST.routing) && pubOk(s));
+    if(!panelVis.length) return;
+    const scale=(ST.xlog?'log':'lin')+'–'+(ST.ylog?'log':'lin');
+    h+='<h2>'+ph[0].toUpperCase()+ph.slice(1)+' · EP'+ep+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+' — latency vs source tokens/rank (µs, '+scale+')</h2>'+
+       guardNote(panelVis)+legend(ph,ep,ST.suite,ST.routing)+'<div class="grid">';
+    ['dispatch','combine','roundtrip'].forEach(op=>{ h+='<div class="card"><div class="gtit">'+OPS[op]+'</div>'+
+      chart({op,phase:ph,ep,x:'t',y:'lat',xlog:ST.xlog,ylog:ST.ylog,pct:ST.pct,suite:ST.suite,routing:ST.routing,title:'',w:340,h:260})+'</div>'; });
+    h+='</div>'; }); });
+  document.getElementById('grid').innerHTML=h;
+}
+// Coverage table (goal P2): publication status per measured config (validated=official,
+// experimental=comparable/legacy, failed=invalid/failed). Supported/unsupported come from
+// generate_matrix.py (capability), which records omissions with reasons.
+function renderCoverage(){
+  const cls={official:'#2ca02c','comparable-experimental':'#d6a72b',legacy:'#7f7f7f',
+             diagnostic:'#9467bd',invalid:'#d62728',failed:'#a30000'};
+  const by={}; DATA.forEach(s=>{ (by[s.sku]=by[s.sku]||[]).push(s); });
+  let h='<table class="cov"><tr><th>SKU</th><th>EP</th><th>config</th><th>phase</th><th>routing</th><th>status</th><th>correct pts</th></tr>';
+  Object.keys(by).sort().forEach(sku=>{
+    by[sku].sort((a,b)=>(a.ep-b.ep)||a.label.localeCompare(b.label)).forEach(s=>{
+      const ok=s.rows.filter(r=>r.correct).length;
+      const cfg=(s.dtype||'?')+'/'+s.mode+'/'+(s.contract||'?').replace('-v1','');
+      h+='<tr><td>'+sku+'</td><td>'+s.ep+'</td><td>'+cfg+'</td><td>'+s.phase+'</td><td>'+s.routing+'</td>'
+        +'<td><span class="badge" style="background:'+(cls[s.pub]||'#555')+'">'+s.pub+'</span></td>'
+        +'<td>'+ok+'/'+s.rows.length+'</td></tr>';
+    });
+  });
+  document.getElementById('coverage').innerHTML=h+'</table>';
+}
+(function(){
+  const sh=(DATA[0]||{shape:{}}).shape||{};
+  const provs=[...new Set(DATA.map(s=>s.backend+' '+(s.prov.deepep_version||s.prov.mori_commit||'?')))];
+  const fo=[...new Set(DATA.map(s=>(s.rows[0]&&s.rows[0].fanout!=null)?s.rows[0].fanout.toFixed(1):'?'))].join('/');
+  const contracts=[...new Set(DATA.map(s=>s.contract))].join(' / ');
+  const dtypes=[...new Set(DATA.map(s=>s.dtype))].join('+');
+  const suites=[...new Set(DATA.map(s=>s.suite))].join(' + ');
+  const samp=[...new Set(DATA.map(s=>s.samples).filter(Boolean))].join('/');
+  const allconsistent=DATA.every(s=>s.routing_consistent!==false);
+  const routings=[...new Set(DATA.map(s=>s.routing))].sort().join(' / ');
+  const ez=DATA.find(s=>s.eplb_after!=null);
+  const eplbNote=ez? ' EPLB (routing=zipf+eplb) replicates hot experts to rebalance per-rank load — imbalance '+ez.eplb_before.toFixed(1)+'x→'+ez.eplb_after.toFixed(1)+'x (vs raw zipf).' : '';
+  document.getElementById('prov').textContent=
+    'Deterministic shared routing trace (seed-fixed; routings: '+routings+' — Routing selector; mean fan-out ≈'+fo+
+    ' dest-ranks/token; cross-rank identity '+(allconsistent?'PROVEN (SHA-256 of topk_idx+weights agrees on every rank)':'NOT proven on some series')+
+    '). Fixed: hidden='+(sh.hidden||'?')+', top-k='+(sh.topk||'?')+', experts='+(sh.experts||'?')+
+    '. dtype/mode/resource/contract vary PER LINE — read the label (dtypes shown: '+dtypes+'). '+
+    'Contract(s): '+contracts+' (layout-and-dispatch times routing-layout INSIDE dispatch; cached-layout [cl] hoists it out). '+
+    'Latency = percentile (selector; p99 default) over POOLED per-iteration cross-rank-MAX samples'+(samp?(' (~'+samp+'/point)'):'')+
+    '. ROUND TRIP is INDEPENDENTLY MEASURED (dispatch→sync→no-op expert→combine, raw per-iter samples); ISOLATED_SUM is Σ of isolated dispatch+combine percentiles, NOT a measured op (no throughput/SLO use). Publication filter defaults to publishable (diagnostic/invalid hidden); status is machine-derived from validity. The bandwidth axis is a LOGICAL routed-payload rate '+
+    '(recv copies x hidden x dtype / latency; per-op bytes; excludes scales/idx/meta/padding) — NOT algBW/busBW/wire utilization. '+
+    'Suites ('+suites+') are kept distinct (Suite selector): backend-default = best stack; resource-constrained = ~fixed SM/CU fraction — '+
+    'do not read across suites as one contest. Correctness = round-trip reconstruction smoke check (NOT a full per-token routing proof).'+eplbNote+' '+
+    'Backends: '+provs.join(', ')+'. Hover a point for p50/p90/p99, contract, suite, and its workflow run.';
+  renderControls(); renderMain(); renderGrid(); renderCoverage();
+})();
+"""
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX EP HTML plotter")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--out", default="results/plots/collectivex_ep.html")
+    ap.add_argument("--legacy", choices=["all", "exclude", "only"], default="all",
+                    help="exclude -> v4-only main plot; only -> the legacy v3 archive")
+    args = ap.parse_args()
+
+    series = load_series(args.results_dir, args.legacy)
+    if not series:
+        print(f"no family=moe results with rows under {args.results_dir} (legacy={args.legacy})")
+        return 1
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    html = HEAD + '<div class="controls" id="controls"></div>' \
+        + '<div class="card"><div id="chart"></div></div><div id="mlegend"></div>' \
+        + '<div id="grid"></div>' \
+        + '<h2>Coverage</h2><div id="coverage"></div>' \
+        + '<p class="note">Self-contained (inline SVG, no external scripts). Generated from ' \
+        + f'{len(series)} EP sweeps. Latency (p50/p90/p99 selector) is the primary metric; the ' \
+        + 'bandwidth axis is a LOGICAL routed-payload rate (per-op bytes ÷ latency), not bus/alg ' \
+        + 'bandwidth. dtype/mode/resource/contract vary per line — see labels + provenance.</p>' \
+        + "<script>\nconst DATA = " + json.dumps(series) + ";\n" + JS + "\n</script>\n" + TAIL
+    with open(args.out, "w") as fh:
+        fh.write(html)
+    phases = sorted({s["phase"] for s in series})
+    print(f"wrote {args.out}  ({len(series)} series across SKUs={sorted({s['sku'] for s in series})}, phases={phases})")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt
new file mode 100644
index 000000000..574afb1f0
--- /dev/null
+++ b/experimental/CollectiveX/requirements.txt
@@ -0,0 +1,9 @@
+# CollectiveX spike dependencies.
+#
+# run_nccl.py + env_capture.py : Python standard library only (run anywhere).
+# run_deepep.py                : torch + deep_ep — provided by the benchmark
+#                                container; DeepEP is built at job setup
+#                                (rebuild-deepep), NOT pinned here.
+# plot.py                      : the only thing worth a local venv:
+matplotlib
+numpy
diff --git a/experimental/CollectiveX/results/.gitkeep b/experimental/CollectiveX/results/.gitkeep
new file mode 100644
index 000000000..8940934a2
--- /dev/null
+++ b/experimental/CollectiveX/results/.gitkeep
@@ -0,0 +1,3 @@
+# CollectiveX result bundles land here as flat *.json (one per runner×op),
+# plus plots/ and raw_*.txt captures (gitignored). Keep this file so the dir
+# exists before the first run.
diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py
new file mode 100644
index 000000000..c22654c59
--- /dev/null
+++ b/experimental/CollectiveX/run_nccl.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — NCCL primitive benchmark wrapper.
+
+Runs stock `nccl-tests` binaries (built in-container at job time — the login
+nodes have no nvcc), parses the text table (NOT JSON — we do not assume the
+build emits JSON), and writes a flat, provenance-tagged JSON result the plot
+script and the eventual schema-freeze can consume.
+
+Standard library only, so it runs in any minimal container.
+
+Run (inside the container, after building nccl-tests):
+    python run_nccl.py --op all_reduce \\
+        --nccl-tests-dir /tmp/nccl-tests/build \\
+        --world-size 8 --min-bytes 8 --max-bytes 8G \\
+        --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\
+        --env-json results/env.json --out results/b200_all_reduce.json
+
+Verify the parser offline (no GPU needed):
+    python run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \\
+        --world-size 8 --runner b200-dgxc --topology-class b200-nvlink-island \\
+        --out /tmp/parsed.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import subprocess
+import sys
+
+SCHEMA_VERSION = 1  # emitted as "schema_version" in every result document
+MEASUREMENT_CONTRACT = "nccl-tests-v1"  # stored in the result and hashed into the comparison key
+
+# op -> nccl-tests binary name (the keys are also the accepted --op choices)
+OP_BINARY = {
+    "all_reduce": "all_reduce_perf",
+    "all_gather": "all_gather_perf",
+    "reduce_scatter": "reduce_scatter_perf",
+    "alltoall": "alltoall_perf",
+    "all_to_all": "alltoall_perf",  # second spelling, same binary as "alltoall"
+    "broadcast": "broadcast_perf",
+    "sendrecv": "sendrecv_perf",
+}
+
+
+def _f(tok: str):
+    """Convert one table cell to float; None for 'N/A' / 'n/a' / '-' (nccl-tests prints 'N/A' for #wrong when -c 0) or non-numeric text."""
+    is_placeholder = tok in ("N/A", "n/a", "-")
+    try:
+        value = None if is_placeholder else float(tok)
+    except ValueError:
+        value = None
+    return value
+
+
+def parse_nccl_table(text: str) -> tuple[list[dict], dict]:
+    """Turn captured nccl-tests stdout into (per-size rows, run summary).
+
+    Lines starting with '#' only feed the summary: the 'Avg bus bandwidth' value
+    and the 'Out of bounds values' verdict (check_passed = the text ends in 'OK').
+    A data line has at least 11 tokens and an all-digit first token (the byte
+    size). The op-specific middle columns vary, so only the first token, the
+    third token (dtype) and the 8 trailing columns are read: out-of-place
+    (time, algbw, busbw, #wrong) followed by the same four in-place.
+    Any other line is skipped.
+    Rows are returned in input order.
+    """
+    def _placement(cells: list[str]) -> dict:
+        # cells are the four columns of one placement, in table order
+        return {
+            "time_us": _f(cells[0]),
+            "algbw_gbps": _f(cells[1]),
+            "busbw_gbps": _f(cells[2]),
+            "wrong": _f(cells[3]),
+        }
+
+    parsed: list[dict] = []
+    run_summary: dict = {"avg_busbw_gbps": None, "out_of_bounds": None, "check_passed": None}
+    for raw in text.splitlines():
+        line = raw.strip()
+        if line.startswith("#"):
+            value = line.split(":")[-1].strip()  # text after the last ':'
+            if "Avg bus bandwidth" in line:
+                run_summary["avg_busbw_gbps"] = _f(value)
+            elif "Out of bounds values" in line:
+                run_summary["out_of_bounds"] = value
+                run_summary["check_passed"] = value.endswith("OK")
+            continue
+
+        fields = line.split()
+        # Blank lines and header text fall out here: too few tokens or a non-numeric size.
+        if len(fields) < 11 or not fields[0].isdigit():
+            continue
+
+        # Last 8 columns: 4 out-of-place metrics, then 4 in-place metrics.
+        metrics = fields[-8:]
+        oop = _placement(metrics[:4])
+        ip = _placement(metrics[4:])
+        # "correct" is tri-state: None = no verdict, True = zero mismatches, False = some.
+        wrongs = (oop["wrong"], ip["wrong"])
+        if all(w is None for w in wrongs):
+            verdict = None  # neither placement reported a #wrong count
+        else:
+            verdict = all((w or 0) == 0 for w in wrongs)
+        measured_busbw = [bw for bw in (oop["busbw_gbps"], ip["busbw_gbps"]) if bw is not None]
+        parsed.append(
+            {
+                "size_bytes": int(fields[0]),
+                "dtype": fields[2],
+                "out_of_place": oop,
+                "in_place": ip,
+                # convenience: best (max) busbw across the two placements
+                "busbw_gbps": max(measured_busbw, default=None),
+                "correct": verdict,
+            }
+        )
+    return parsed, run_summary
+
+
+def comparison_key(meta: dict) -> str:
+    """Return the 16-hex-character key gating which rows may share a curve (plan §Comparability).
+
+    The key is the truncated SHA-256 of op, dtype, world_size, nodes,
+    topology_class, comparison_class and measurement_contract joined by '|'.
+    Topology-class is deliberately part of the key, so B200(IB) and GB200(MNNVL)
+    get distinct keys rather than being silently overlaid.
+    """
+    fields = ("op", "dtype", "world_size", "nodes",
+              "topology_class", "comparison_class", "measurement_contract")
+    # str() every part: dtype is None when a run parsed no rows, and a bare None
+    # would make "|".join raise TypeError. String parts hash exactly as before.
+    joined = "|".join(str(meta[name]) for name in fields)
+    digest = hashlib.sha256(joined.encode()).hexdigest()[:16]
+    return digest
+
+
+def build_command(args, binary_path: str) -> list[str]:
+    """Assemble the argv to execute: launch prefix, binary, nccl-tests flags, extra args.
+
+    --launch-prefix and --extra-args are split with shell rules (shlex), so a
+    quoted argument containing spaces stays a single argv entry.
+    """
+    import shlex  # function-scope import; used only here
+
+    flags = (("-b", args.min_bytes), ("-e", args.max_bytes), ("-f", args.factor),
+             ("-g", args.gpus_per_proc), ("-c", args.check),
+             ("-w", args.warmup), ("-n", args.iters))
+    cmd: list[str] = shlex.split(args.launch_prefix or "")  # empty prefix -> []
+    cmd.append(binary_path)
+    for flag, value in flags:
+        cmd += [flag, str(value)]
+    cmd += shlex.split(args.extra_args or "")
+    return cmd
+
+
+def main() -> int:
+    """Run one nccl-tests op (or re-parse a capture) and write the result JSON; exit code 0 = valid, 1 = invalid, 2 = binary missing."""
+    ap = argparse.ArgumentParser(description="CollectiveX NCCL primitive runner")
+    ap.add_argument("--op", required=True, choices=sorted(OP_BINARY))
+    ap.add_argument("--nccl-tests-dir", help="dir containing <op>_perf binaries (build/)")
+    ap.add_argument("--parse-only", help="parse this captured stdout file instead of running")
+    # nccl-tests knobs
+    ap.add_argument("--min-bytes", default="8")
+    ap.add_argument("--max-bytes", default="8G")
+    ap.add_argument("--factor", type=int, default=2, help="size step factor")
+    ap.add_argument("--gpus-per-proc", type=int, default=8,
+                    help="-g: GPUs per process (single-node multi-GPU). Use 1 under MPI.")
+    ap.add_argument("--check", type=int, default=1, help="-c: 1 enables correctness check")
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=20)
+    ap.add_argument("--extra-args", default="", help="extra args appended to the binary")
+    ap.add_argument("--launch-prefix", default="",
+                    help="e.g. 'mpirun -np 16 --hostfile hf' for multi-node; empty for single-node -g mode")
+    # provenance
+    ap.add_argument("--runner", required=True, help="runner label, e.g. b200-dgxc")
+    ap.add_argument("--world-size", type=int, required=True, help="total ranks/GPUs in the run")
+    ap.add_argument("--nodes", type=int, default=1)
+    ap.add_argument("--topology-class", required=True,
+                    help="e.g. b200-nvlink-island, b200-nvlink-island+cx7-ib, gb200-nvl72-mnnvl")
+    ap.add_argument("--transport", default="", help="observed transport label: nvlink | ib | mnnvl")
+    ap.add_argument("--comparison-class", default="standardized",
+                    choices=["standardized", "backend-optimized", "framework-integrated"])
+    ap.add_argument("--env-json", help="path to env_capture.py output to embed")
+    ap.add_argument("--timestamp", help="ISO timestamp (default now)")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+    binary = OP_BINARY[args.op]
+    command = None  # stays None in --parse-only mode
+    if args.parse_only:
+        with open(args.parse_only) as fh:
+            stdout = fh.read()
+        ran_ok = True  # nothing was executed, so there is no exit code to check
+    else:
+        if not args.nccl_tests_dir:
+            ap.error("--nccl-tests-dir is required unless --parse-only is given")
+        binary_path = os.path.join(args.nccl_tests_dir, binary)
+        if not os.path.exists(binary_path):
+            print(f"ERROR: binary not found: {binary_path}", file=sys.stderr)
+            return 2
+        command = build_command(args, binary_path)
+        print("running:", " ".join(command), file=sys.stderr)
+        proc = subprocess.run(command, capture_output=True, text=True, check=False)  # non-zero exit handled below
+        stdout = proc.stdout
+        ran_ok = proc.returncode == 0
+        if not ran_ok:
+            print(stdout, file=sys.stderr)
+            print(proc.stderr, file=sys.stderr)
+            print(f"ERROR: {binary} exited {proc.returncode}", file=sys.stderr)
+
+    rows, summary = parse_nccl_table(stdout)  # parsed even after a failed run
+    dtype = rows[0]["dtype"] if rows else None  # dtype of the first data row; None when nothing parsed
+
+    meta = {
+        "op": args.op,
+        "dtype": dtype,
+        "world_size": args.world_size,
+        "nodes": args.nodes,
+        "topology_class": args.topology_class,
+        "comparison_class": args.comparison_class,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+    }
+
+    env = None  # stays None when --env-json is absent or the file does not exist
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    # All-zero busbw means the benchmark didn't actually communicate — e.g. an
+    # MPI=0 binary launched under srun --mpi=pmix runs as N standalone world=1
+    # procs (busbw formula -> 0). Don't let that pass the gate as "valid".
+    peak_busbw = max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)
+
+    doc = {
+        "schema_version": SCHEMA_VERSION,
+        "family": "nccl",
+        "generated_by": "run_nccl.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner,
+        "binary": binary,
+        "command": " ".join(command) if command else f"<parse-only {args.parse_only}>",
+        "transport": args.transport,
+        # valid = rows parsed, clean exit, non-zero peak busbw, and the check passed
+        # (or --check 0 was given and no check verdict was printed).
+        "status": ("valid" if (rows and ran_ok and peak_busbw > 0.0
+                   and (summary.get("check_passed") is True
+                   or (args.check == 0 and summary.get("check_passed") is None))) else "invalid"),
+        "comparison_key": comparison_key(meta),
+        **meta,
+        "summary": summary,
+        "num_rows": len(rows),
+        "rows": rows,
+        "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(
+        f"{args.op}: parsed {len(rows)} sizes -> {args.out} "
+        f"(status={doc['status']}, avg_busbw={summary.get('avg_busbw_gbps')} GB/s, "
+        f"key={doc['comparison_key']})"
+    )
+    return 0 if doc["status"] == "valid" else 1  # the result file is written either way
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/schemas/ep-result-v4.schema.json b/experimental/CollectiveX/schemas/ep-result-v4.schema.json
new file mode 100644
index 000000000..11828a8bb
--- /dev/null
+++ b/experimental/CollectiveX/schemas/ep-result-v4.schema.json
@@ -0,0 +1,122 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://semianalysis/collectivex/schemas/ep-result-v4.schema.json",
+  "title": "CollectiveX EP dispatch/combine result (v4)",
+  "description": "One (backend, phase, dtype, mode, contract, routing) sweep. v4 adds multi-dimensional validity + machine-derived publication_status, measured roundtrip, dual byte contracts, per-rank diagnostics, raw-sample histograms, and workload identity. v3 docs load via compatibility (publication_status absent => treated as legacy/experimental).",
+  "type": "object",
+  "required": ["schema_version", "family", "runner", "backend", "mode", "phase",
+               "ep_size", "measurement_contract", "shape", "rows",
+               "validity", "publication_status", "workload", "reproduction",
+               "backend_provenance", "comparison_key"],
+  "properties": {
+    "schema_version": {"type": "integer", "minimum": 3},
+    "family": {"const": "moe"},
+    "runner": {"type": "string"},
+    "backend": {"type": "string", "enum": ["deepep", "mori", "aiter"]},
+    "mode": {"type": "string", "enum": ["normal", "ll"]},
+    "phase": {"type": "string", "enum": ["decode", "prefill"]},
+    "ep_size": {"type": "integer", "minimum": 1},
+    "world_size": {"type": "integer", "minimum": 1},
+    "nodes": {"type": "integer", "minimum": 1},
+    "topology_class": {"type": "string"},
+    "transport": {"type": "string"},
+    "resource_mode": {"type": "string", "enum": ["normalized", "tuned", "default"]},
+    "measurement_contract": {"type": "string",
+      "enum": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"]},
+    "publication_status": {"type": "string",
+      "enum": ["official", "comparable-experimental", "diagnostic", "invalid", "failed"]},
+    "validity": {
+      "type": "object",
+      "required": ["execution_status", "semantic_correctness", "workload_identity",
+                   "measurement_conformance", "resource_conformance", "provenance_complete"],
+      "properties": {
+        "execution_status": {"type": "string", "enum": ["complete", "failed"]},
+        "semantic_correctness": {"type": "string", "enum": ["pass", "fail"]},
+        "workload_identity": {"type": "string"},
+        "workload_source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "measurement_conformance": {"type": "string", "enum": ["conformant", "nonconformant"]},
+        "resource_conformance": {"type": "string"},
+        "provenance_complete": {"type": "boolean"}
+      }
+    },
+    "workload": {
+      "type": "object",
+      "required": ["source", "trace_signature", "cross_rank_consistent"],
+      "properties": {
+        "source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "workload_id": {"type": ["string", "null"]},
+        "manifest_checksums": {"type": ["object", "null"]},
+        "trace_signature": {"type": "string"},
+        "distinct_per_T_hashes": {"type": "array", "items": {"type": "string"}},
+        "cross_rank_consistent": {"type": "boolean"}
+      }
+    },
+    "shape": {
+      "type": "object",
+      "required": ["hidden", "topk", "experts", "experts_per_rank", "dispatch_dtype", "routing"],
+      "properties": {
+        "hidden": {"type": "integer"}, "topk": {"type": "integer"},
+        "experts": {"type": "integer"}, "experts_per_rank": {"type": "integer"},
+        "dispatch_dtype": {"type": "string", "enum": ["bf16", "fp8"]},
+        "routing": {"type": "string"},
+        "eplb": {"type": "boolean"}, "num_logical_experts": {"type": "integer"}
+      }
+    },
+    "reproduction": {
+      "type": "object",
+      "required": ["command", "seed", "warmup", "iters", "trials", "measurement_contract"],
+      "properties": {
+        "command": {"type": "string"},
+        "image": {"type": ["string", "null"]},
+        "image_digest": {"type": ["string", "null"]},
+        "image_arch": {"type": ["string", "null"]},
+        "squash_sha256": {"type": ["string", "null"]},
+        "git_run": {"type": ["object", "null"]},
+        "fp8_quant_in_timing": {"type": ["boolean", "null"]}
+      }
+    },
+    "backend_provenance": {"type": "object"},
+    "rows": {
+      "type": "array", "minItems": 1,
+      "items": {
+        "type": "object",
+        "required": ["tokens_per_rank", "global_tokens", "dispatch", "combine", "roundtrip",
+                     "isolated_sum", "samples_pooled", "byte_contracts", "correct"],
+        "properties": {
+          "tokens_per_rank": {"type": "integer", "minimum": 1},
+          "global_tokens": {"type": "integer", "minimum": 1},
+          "dispatch": {"$ref": "#/definitions/percentiles"},
+          "combine": {"$ref": "#/definitions/percentiles"},
+          "roundtrip": {"$ref": "#/definitions/percentiles"},
+          "isolated_sum": {"type": "object"},
+          "samples_pooled": {"type": "integer", "minimum": 1},
+          "percentile_interpolation": {"type": "string"},
+          "per_rank_dispatch_us": {"type": "object"},
+          "raw_samples": {"type": "object"},
+          "byte_contracts": {
+            "type": "object",
+            "required": ["token_rank_payload_copies", "token_expert_payload_copies",
+                         "dispatch_bytes", "combine_bytes"],
+            "properties": {
+              "token_rank_payload_copies": {"type": "integer"},
+              "token_expert_payload_copies": {"type": "integer"},
+              "dispatch_bytes": {"type": "integer"}, "combine_bytes": {"type": "integer"}
+            }
+          },
+          "roundtrip_tokens_per_second": {"type": ["number", "null"]},
+          "correct": {"type": "boolean"}
+        }
+      }
+    }
+  },
+  "definitions": {
+    "percentiles": {
+      "type": "object",
+      "required": ["p50", "p90", "p95", "p99"],
+      "properties": {
+        "p50": {"type": "number"}, "p90": {"type": "number"},
+        "p95": {"type": "number"}, "p99": {"type": "number"}
+      }
+    }
+  }
+}
diff --git a/experimental/CollectiveX/schemas/workload-v1.schema.json b/experimental/CollectiveX/schemas/workload-v1.schema.json
new file mode 100644
index 000000000..285f56ad2
--- /dev/null
+++ b/experimental/CollectiveX/schemas/workload-v1.schema.json
@@ -0,0 +1,46 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://semianalysis/collectivex/schemas/workload-v1.schema.json",
+  "title": "CollectiveX canonical MoE routing workload manifest",
+  "description": "Manifest for a serialized routing trace (tests/workload.py). The <workload_id>.npz holds topk_idx/topk_weights; this manifest carries the identity, dimensions, routing profile, and SHA-256 checksums that gate cross-hardware comparison.",
+  "type": "object",
+  "additionalProperties": false,
+  "required": ["schema_version", "workload_id", "generator_version", "gate_weight_format",
+               "dims", "routing_profile", "seed", "checksums"],
+  "properties": {
+    "schema_version": {"const": 1},
+    "workload_id": {"type": "string", "pattern": "^[0-9a-f]{16}$",
+      "description": "Immutable id = sha256(generator|routing|hidden|topk|experts|gt|seed)[:16]."},
+    "generator_version": {"type": "string",
+      "description": "Routing generator identity; bump when numerics change so stale files can't masquerade."},
+    "gate_weight_format": {"type": "string"},
+    "dims": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["hidden", "topk", "experts", "global_tokens", "experts_per_rank"],
+      "properties": {
+        "hidden": {"type": "integer", "minimum": 1},
+        "topk": {"type": "integer", "minimum": 1},
+        "experts": {"type": "integer", "minimum": 1},
+        "global_tokens": {"type": "integer", "minimum": 1},
+        "experts_per_rank": {"type": "integer", "minimum": 1}
+      }
+    },
+    "routing_profile": {"type": "string",
+      "enum": ["uniform", "balanced", "balanced-rank-local", "zipf",
+               "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single"]},
+    "seed": {"type": "integer"},
+    "checksums": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["topk_idx", "topk_weights", "trace"],
+      "properties": {
+        "topk_idx": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
+        "topk_weights": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
+        "trace": {"type": "string", "pattern": "^[0-9a-f]{64}$"}
+      }
+    },
+    "routing_stats": {"type": "object",
+      "description": "Realized fan-out / load / locality stats (advisory; not identity-defining)."}
+  }
+}
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
new file mode 100644
index 000000000..2d71a87e1
--- /dev/null
+++ b/experimental/CollectiveX/summarize.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+"""CollectiveX — summarize a run's results.
+
+Two output modes over the same data:
+  (default)    a plain-text table for the Slurm/container log; ALSO the result
+               gate — exits non-zero if no valid results were produced, so a
+               failed/skipped benchmark doesn't get reported as a green job.
+  --markdown   GitHub-flavored markdown for a GitHub Actions job summary
+               (https://github.blog/.../supercharging-github-actions-with-job-summaries/);
+               reporting only, always exits 0. A workflow step appends this to
+               $GITHUB_STEP_SUMMARY so the run page shows a rendered table.
+
+    python summarize.py --results-dir results --runner gb200-nv_1 --ts <ts>
+    python summarize.py --results-dir results --markdown >> "$GITHUB_STEP_SUMMARY"
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+
+
+def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[dict]:
+    """Load nccl/moe result docs from results_dir/*.json (path order), filtered by runner prefix and ts substring."""
+    docs = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
+        base = os.path.basename(path)
+        if base.startswith("env_") or (runner and not base.startswith(f"{runner}_")):
+            continue
+        if ts and ts not in base:
+            continue
+        try:
+            with open(path) as fh:
+                d = json.load(fh)
+        except (ValueError, OSError):  # unreadable, malformed JSON, or undecodable bytes
+            continue
+        # A top-level list/scalar has no .get(); skip it instead of crashing the summary.
+        if isinstance(d, dict) and d.get("family") in ("nccl", "moe"):
+            docs.append(d)
+    return docs
+
+
+def _peak_busbw(rows):  # highest busbw_gbps over rows; missing/None counts as 0.0, no rows -> 0.0
+    return max(map(lambda row: row.get("busbw_gbps") or 0.0, rows), default=0.0)
+
+
+_OP_ORDER = ["all_reduce", "reduce_scatter", "all_gather", "alltoall"]  # display order; other ops are placed after these
+
+
+def _row_lat(r):
+    """Lowest time_us across the row's out-of-place / in-place results, or None if neither has one."""
+    times = ((r.get(side) or {}).get("time_us") for side in ("out_of_place", "in_place"))
+    return min((t for t in times if t is not None), default=None)
+
+
+def _lat_floor(rows):
+    # Small-message latency floor = time at the smallest message with size > 0
+    # (0-byte rows are a ~1 us no-op, not a real latency). nan when unavailable.
+    sized = [row for row in rows if (row.get("size_bytes") or 0) > 0]
+    if not sized:
+        return float("nan")
+    lat = _row_lat(min(sized, key=lambda row: row["size_bytes"]))
+    return float("nan") if lat is None else lat
+
+
+def _at_size(rows, size, fn):
+    """Apply fn to the first row whose size_bytes equals size; None when no row matches."""
+    hit = next((row for row in rows if row.get("size_bytes") == size), None)
+    # A matching row is never None (.get() was just called on it), so None means "no match".
+    return None if hit is None else fn(hit)
+
+
+def _fmt_bytes(b):
+    """Show b in whole GiB/MiB/KiB when it is an exact multiple of that unit, else as 'N B'."""
+    units = ((2**30, "GiB"), (2**20, "MiB"), (2**10, "KiB"))
+    exact = next((f"{b // size} {label}" for size, label in units if b >= size and b % size == 0), None)
+    return f"{b} B" if exact is None else exact
+
+
+def _ops_sorted(nccl):
+    """Ops present in the docs: those listed in _OP_ORDER first (in that order), the rest sorted after."""
+    seen = {doc.get("op") for doc in nccl}
+    return [op for op in _OP_ORDER if op in seen] + sorted(seen.difference(_OP_ORDER))
+
+
+def _ladder(nccl):
+    """Table sizes: measured candidate sizes below the largest measured size, then that largest size ([] if none)."""
+    measured = {row["size_bytes"] for doc in nccl for row in doc.get("rows", [])
+                if (row.get("size_bytes") or 0) > 0}
+    if not measured:
+        return []
+    largest = max(measured)
+    rungs = (16384, 262144, 4194304, 67108864, 268435456, 1073741824, 4294967296)
+    return [s for s in rungs if s in measured and s < largest] + [largest]  # set is built once, not per rung
+
+
+def _sweep_table(nccl, title, rowfn, fmt):
+    """Markdown table lines: one row per ladder size, one column per op; a cell is rowfn(row)
+    formatted with fmt, or an em dash when missing/non-numeric. Returns [] without a ladder."""
+    sizes = _ladder(nccl)
+    if not sizes:
+        return []
+    ops = _ops_sorted(nccl)
+    # If several docs share an op, the last one wins (dict overwrite).
+    by_op = {doc.get("op"): doc.get("rows", []) for doc in nccl}
+    header = "| bytes/rank | " + " | ".join(f"`{o}`" for o in ops) + " |"
+    lines = [f"\n**{title}**\n", header, "|---" + "|--:" * len(ops) + "|"]
+    for size in sizes:
+        values = [_at_size(by_op.get(op, []), size, rowfn) for op in ops]
+        cells = [format(v, fmt) if isinstance(v, (int, float)) else "—" for v in values]
+        lines.append(f"| {_fmt_bytes(size)} | " + " | ".join(cells) + " |")
+    return lines
+
+
+def _fnum(x, fmt):  # numbers are formatted with fmt; anything else renders as an em dash
+    return "—" if not isinstance(x, (int, float)) else format(x, fmt)
+
+
+def _moe_sorted(moe):  # EP docs ordered by backend, then phase, then ep_size ("" / 0 when absent)
+    return sorted(moe, key=lambda doc: tuple(doc.get(k, dflt) for k, dflt in (("backend", ""), ("phase", ""), ("ep_size", 0))))
+
+
+def _moe_sweep_table(d):
+    """Markdown sweep table for one EP doc: a bold heading line, then one table row
+    per entry of d["rows"] (one per source-tokens-per-rank point). Returns [] for
+    docs without rows (old single-point results)."""
+    points = d.get("rows")
+    if not points:
+        return []
+    shape = d.get("shape", {})
+    table = [(f"\n**`{d.get('backend')}` · {d.get('phase')} · ep{d.get('ep_size')} · "
+              f"H{shape.get('hidden')} top{shape.get('topk')} E{shape.get('experts')} "
+              f"{shape.get('dispatch_dtype')} {shape.get('routing')}** — latency vs source tokens/rank\n"),
+             "| tokens/rank | fan-out | dispatch µs | combine µs | serial µs (D+C) | tokens/s | recv max | correct |",
+             "|--:|--:|--:|--:|--:|--:|--:|:--:|"]
+    for p in points:
+        serial = p.get('serial_us_p50', p.get('roundtrip_us_p50'))  # falls back to roundtrip_us_p50
+        cells = [str(p.get('tokens_per_rank')), _fnum(p.get('fanout_mean'), '.2f'),
+                 _fnum(p.get('dispatch_us_p50'), '.2f'), _fnum(p.get('combine_us_p50'), '.2f'),
+                 _fnum(serial, '.2f'), _fnum(p.get('tokens_per_second'), '.3e'),
+                 str(p.get('recv_tokens_max', p.get('recv_tokens', '—'))), '✅' if p.get('correct') else '❌']
+        table.append("| " + " | ".join(cells) + " |")
+    return table
+
+
+def render_plain(nccl, moe, n_valid, total) -> str:
+    """Plain-text report for the job log: banner, NCCL per-op table, MoE headline table (n_valid/total are not used here)."""
+    nan = float("nan")
+    banner = "CollectiveX results"
+    if nccl or moe:
+        lead = (nccl + moe)[0]  # the first doc supplies the run-level labels
+        banner += f" — runner={lead.get('runner')} topology={lead.get('topology_class')} transport={lead.get('transport')}"
+    lines = ["=" * len(banner), banner, "=" * len(banner)]
+    if nccl:
+        lines += [f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):",
+                  f"  {'op':<16}{'status':<9}{'peak busbw':>12}{'lat floor':>10}{'avg busbw':>11}"]
+        for doc in sorted(nccl, key=lambda x: x["op"]):
+            avg = (doc.get("summary") or {}).get("avg_busbw_gbps")
+            lines.append(f"  {doc['op']:<16}{doc.get('status',''):<9}{_peak_busbw(doc.get('rows', [])):>12.1f}"
+                         f"{_lat_floor(doc.get('rows', [])):>10.2f}{(avg if avg is not None else nan):>11.1f}")
+    if moe:
+        lines += ["\nMoE EP dispatch/combine (DeepEP / MoRI) — headline (* = headline tokens/rank):",
+                  f"  {'backend':<9}{'phase':<8}{'ep':>3} {'status':<9}{'T*':>5}{'disp_p50':>10}{'comb_p50':>10}{'serial':>9}  correct"]
+        for doc in sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""))):
+            m, c = doc.get("metrics", {}), doc.get("correctness", {})
+            serial = m.get("serial_us_p50", m.get("roundtrip_us_p50"))
+            lines.append(f"  {doc.get('backend',''):<9}{doc.get('phase',''):<8}{str(doc.get('ep_size','')):>3} {doc.get('status',''):<9}"
+                         f"{str(m.get('headline_tokens_per_rank','')):>5}"
+                         f"{(m.get('dispatch_us_p50') or nan):>10.1f}{(m.get('combine_us_p50') or nan):>10.1f}"
+                         f"{(serial or nan):>9.1f}   {c.get('passed')}")
+    return "\n".join(lines)
+
+
+def _emoji(status) -> str:  # "✅ valid" for status "valid", otherwise "❌ <status>"
+    return f"❌ {status}" if status != "valid" else "✅ valid"
+
+
+def render_markdown(nccl, moe, n_valid, total) -> str:  # GitHub-flavored markdown report; n_valid is not used here
+    out = []  # markdown lines, joined with "\n" at the end
+    if nccl or moe:
+        d0 = (nccl + moe)[0]  # the first doc supplies the heading labels
+        out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}")
+    if nccl:
+        out.append(f"\n### NCCL/RCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n")
+        out.append("| op | status | peak busbw (GB/s) | lat floor (µs) |")
+        out.append("|---|---|--:|--:|")
+        for d in sorted(nccl, key=lambda x: _OP_ORDER.index(x["op"]) if x["op"] in _OP_ORDER else 99):  # unlisted ops sort last
+            rows = d.get("rows", [])
+            out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | {_lat_floor(rows):.2f} |")
+        out += _sweep_table(nccl, "Bus bandwidth vs bytes/rank (GB/s)", lambda r: r.get("busbw_gbps"), ".1f")
+        out += _sweep_table(nccl, "Latency vs bytes/rank (µs)", _row_lat, ".2f")
+        out.append("\n> bytes/rank = nccl/rccl-tests message size (= per-rank for all-reduce / "
+                   "reduce-scatter / all-to-all; all-gather input/rank = size ÷ #GPUs). Small "
+                   "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.")
+    if moe:
+        out.append("\n### MoE EP dispatch / combine (DeepEP / MoRI)\n")
+        out.append("Headline = the reference point (tokens/rank shown as `T*`); the per-line "
+                   "sweep tables below carry the full source-tokens-per-rank curve.\n")
+        out.append("| backend | phase | mode | dtype | resource | ep | routing (fan-out) | status | T\\* | dispatch p50 (µs) | combine p50 (µs) | serial p50 (µs) | tokens/s | correct |")
+        out.append("|---|---|---|---|---|--:|---|---|--:|--:|--:|--:|--:|:--:|")
+        for d in _moe_sorted(moe):  # headline table: one row per EP doc
+            m, c = d.get("metrics", {}), d.get("correctness", {})
+            rp = d.get("routing_profile", {})
+            ser = m.get("serial_us_p50", m.get("roundtrip_us_p50"))  # falls back to roundtrip_us_p50
+            sh = d.get("shape") or {}
+            fo = f"{sh.get('routing','?')} ({_fnum(rp.get('fanout_mean'), '.1f')})"
+            # dtype shows whether the fp8 cast was inside the timed dispatch (LL) or not.
+            dt = sh.get("dispatch_dtype", "?")
+            fit = (d.get("reproduction") or {}).get("fp8_quant_in_timing")
+            dt += "*" if fit else ("⁺" if fit is False else "")  # None/missing -> no marker
+            out.append(f"| `{d.get('backend')}` | {d.get('phase','')} | {d.get('mode','')} | {dt} | "
+                       f"{d.get('resource_mode','')} | {d.get('ep_size','')} | {fo} | {_emoji(d.get('status'))} | "
+                       f"{m.get('headline_tokens_per_rank','—')} | {_fnum(m.get('dispatch_us_p50'), '.1f')} | "
+                       f"{_fnum(m.get('combine_us_p50'), '.1f')} | {_fnum(ser, '.1f')} | "
+                       f"{_fnum(m.get('tokens_per_second'), '.3e')} | {'✅' if c.get('passed') else '❌'} |")
+        for d in _moe_sorted(moe):  # per-doc sweep tables follow the headline table
+            out += _moe_sweep_table(d)
+        out.append("\n> EP sweep: only source tokens/rank varies along a line. **fan-out** = mean "
+                   "destination ranks/token (representativeness — top-k spread, not a permutation). "
+                   "Dispatch & combine timed **separately** (staging untimed); **serial = dispatch + "
+                   "combine** (a sum, not an independently-measured chained op). dtype `fp8*` = fp8 cast "
+                   "IS inside the timed dispatch (LL kernel); `fp8⁺` = cast is untimed preprocessing "
+                   "(normal mode). `mode` ll = DeepEP low-latency; `resource` = comm SM/CU regime.")
+    if not total:  # total is the number of loaded result docs
+        out.append("\n> No result files found — the benchmark produced nothing.")
+    return "\n".join(out)
+
+
+def main() -> int:
+    """CLI entry: print the summary. Exit 0 with --markdown; otherwise act as the result gate:
+    1 when no result files were found or any result is not "valid", else 0."""
+    parser = argparse.ArgumentParser(description="CollectiveX result summary")
+    parser.add_argument("--results-dir", default="results")
+    parser.add_argument("--runner", default=None)
+    parser.add_argument("--ts", default=None)
+    parser.add_argument("--markdown", action="store_true",
+                        help="emit GitHub job-summary markdown (reporting only; always exits 0)")
+    opts = parser.parse_args()
+
+    docs = load_results(opts.results_dir, opts.runner, opts.ts)
+    nccl, moe = ([d for d in docs if d["family"] == fam] for fam in ("nccl", "moe"))
+    total = len(docs)
+    n_valid = len([d for d in docs if d.get("status") == "valid"])
+    render = render_markdown if opts.markdown else render_plain
+    print(render(nccl, moe, n_valid, total))
+    if opts.markdown:
+        return 0  # reporting step — never fail the job here
+    if total == 0:
+        problem = "ERROR: no result files found — benchmark produced nothing."
+    elif n_valid < total:
+        problem = f"ERROR: {total - n_valid} result(s) invalid — failing the job."
+    else:
+        return 0
+    print(problem)
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/_gb300_ep_probe.py b/experimental/CollectiveX/tests/_gb300_ep_probe.py
new file mode 100644
index 000000000..3889c98f5
--- /dev/null
+++ b/experimental/CollectiveX/tests/_gb300_ep_probe.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""GB300 EP8 GO/NO-GO probe — does DeepEP work across 2 NVL72 trays (8 ranks / 2 nodes)?
+
+Read-only spike (no artifacts). One PATH per process (CX_PROBE_PATH), because NVSHMEM
+inits once per process and the internode/LL buffers each bootstrap it. Reports, on rank 0,
+which Buffer construction + a 1-shot dispatch/combine round-trip actually runs on this fabric:
+
+  intranode  Buffer(group, nvl, 0)                         (MNNVL-as-one-NVLink-domain hope)
+  internode  Buffer(group, nvl, rdma>0)                    (DeepEP NVSHMEM path, over NVLink/IB)
+  ll         Buffer(group, 0, rdma, low_latency_mode=True) (decode path; nvlink-LL allowed)
+
+Env (set per-rank by the srun wrapper): RANK WORLD_SIZE LOCAL_RANK MASTER_ADDR MASTER_PORT
+                                        CX_PROBE_PATH=intranode|internode|ll
+"""
+import os
+import socket
+import sys
+import traceback
+
+import torch
+import torch.distributed as dist
+
+RANK = int(os.environ["RANK"])
+WORLD = int(os.environ["WORLD_SIZE"])
+LR = int(os.environ["LOCAL_RANK"])
+PATH = os.environ.get("CX_PROBE_PATH", "intranode")
+HOST = socket.gethostname()
+H = 7168
+TOPK = 8
+EXPERTS = WORLD * 32          # 256 at world=8 — same as the real sweep
+T = 8                          # tiny: this is a does-it-run probe, not a timing run
+
+
+def log(msg):  # print msg tagged with this rank, host and probe path; flushed immediately
+    print(f"[r{RANK}@{HOST} {PATH}] {msg}", flush=True)
+
+
+def main():
+    """Run the single probe path named by CX_PROBE_PATH and return the exit code.
+
+    Returns 0 when Buffer construction plus one dispatch/combine round trip succeeds,
+    1 when any step raises (every rank logs its own exception), and 2 when
+    CX_PROBE_PATH is not one of intranode/internode/ll.
+    """
+    torch.cuda.set_device(LR)
+    dev = torch.device(f"cuda:{LR}")
+    dist.init_process_group("nccl", rank=RANK, world_size=WORLD)
+
+    import deep_ep
+    from deep_ep import Buffer
+    if RANK == 0:
+        import inspect
+        try:
+            import importlib.metadata as md
+            ver = md.version("deep_ep")
+        except Exception:
+            ver = getattr(deep_ep, "__version__", "?")
+        log(f"deep_ep={ver} torch={torch.__version__} cuda={torch.version.cuda}")
+        log(f"Buffer.__init__{inspect.signature(Buffer.__init__)}")
+        log(f"caps: internode_dispatch={hasattr(Buffer,'internode_dispatch')} "
+            f"get_dispatch_config={hasattr(Buffer,'get_dispatch_config')} "
+            f"low_latency_dispatch={hasattr(Buffer,'low_latency_dispatch')} "
+            f"ll_rdma_hint={hasattr(Buffer,'get_low_latency_rdma_size_hint')}")
+
+    # Gather every rank's hostname so rank 0 can report how many nodes the world spans.
+    hosts = [None] * WORLD
+    dist.all_gather_object(hosts, HOST)
+    if RANK == 0:
+        uniq = sorted(set(hosts))
+        log(f"world={WORLD} over {len(uniq)} node(s): {uniq}")
+
+    # Tiny per-rank problem: T tokens, each routed to TOPK distinct experts (randperm).
+    group = dist.group.WORLD
+    x = torch.randn(T, H, dtype=torch.bfloat16, device=dev)
+    g = torch.Generator(device=dev).manual_seed(1234 + RANK)
+    idx = torch.stack([torch.randperm(EXPERTS, device=dev, generator=g)[:TOPK]
+                       for _ in range(T)]).to(torch.int64)
+    w = torch.rand(T, TOPK, device=dev, generator=g).to(torch.float32)
+
+    dist.barrier()
+    try:
+        if PATH in ("intranode", "internode"):
+            # The two normal-mode paths run the same calls and differ only in the RDMA
+            # buffer size handed to Buffer: 0 for intranode, 1 GiB (rdma>0) for internode.
+            rdma_bytes = 0 if PATH == "intranode" else 1 * 1024**3
+            buf = Buffer(group, 1 * 1024**3, rdma_bytes)
+            try:
+                Buffer.set_num_sms(24)
+            except Exception as sm_exc:
+                # Best-effort knob: keep probing with the library default, but say so.
+                if RANK == 0:
+                    log(f"WARN: set_num_sms(24) failed, keeping library default: {sm_exc!r}")
+            ntr, ntrr, ntpe, itir, _ = buf.get_dispatch_layout(idx, EXPERTS)
+            rx, _ri, rw, _nre, h, _ev = buf.dispatch(
+                x, topk_idx=idx, topk_weights=w, num_tokens_per_rank=ntr,
+                num_tokens_per_rdma_rank=ntrr, is_token_in_rank=itir,
+                num_tokens_per_expert=ntpe)
+            cx, _, _ = buf.combine(rx, h, topk_weights=rw)
+            rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape
+            log(f"RESULT {PATH} OK: recv={tuple(rxs)} combine={tuple(cx.shape)} "
+                f"rdma_rank_layout={'present' if ntrr is not None else 'None'}")
+
+        elif PATH == "ll":
+            # Low-latency path: RDMA buffer sized by DeepEP's own hint; nq = EXPERTS // WORLD.
+            num_max = 128
+            rdma = Buffer.get_low_latency_rdma_size_hint(num_max, H, WORLD, EXPERTS)
+            nq = max(1, EXPERTS // WORLD)
+            buf = Buffer(group, 0, rdma, low_latency_mode=True, num_qps_per_rank=nq,
+                         allow_nvlink_for_low_latency_mode=True)
+            rx, rc, h, _ev, _hook = buf.low_latency_dispatch(
+                x, idx, num_max, EXPERTS, use_fp8=False, return_recv_hook=False)
+            cx, _ev2, _hook2 = buf.low_latency_combine(rx, idx, w, h)
+            rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape
+            log(f"RESULT ll OK: recv={tuple(rxs)} combine={tuple(cx.shape)}")
+        else:
+            log(f"unknown CX_PROBE_PATH={PATH}")
+            return 2
+        # Pairs with the barrier in the except path, so a failing rank does not strand this one.
+        dist.barrier()
+    except Exception as exc:
+        if RANK == 0:
+            log(f"RESULT {PATH} FAIL: {exc!r}")
+            tb = traceback.format_exc().strip().splitlines()
+            for ln in tb[-8:]:
+                log(f"  | {ln}")
+        # let other ranks print their error too (often the real one is rank-specific)
+        else:
+            log(f"FAIL(non0): {exc!r}")
+        try:
+            dist.barrier()
+        except Exception:
+            pass
+        return 1
+    finally:
+        # Always tear the process group down, whether the probe passed, failed or bailed out.
+        try:
+            dist.destroy_process_group()
+        except Exception:
+            pass
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/experimental/CollectiveX/tests/capability.py b/experimental/CollectiveX/tests/capability.py
new file mode 100644
index 000000000..fc10780c0
--- /dev/null
+++ b/experimental/CollectiveX/tests/capability.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""CollectiveX capability resolver (stdlib-only — runs on a login node, no torch).
+
+A workflow that exposes backend x SKU x mode x dtype x contract can request combinations
+no backend supports, and 'all' is not the same backend set across vendors. This static
+table mirrors the adapters' SUPPORTED_* sets so the matrix compiler / a pre-flight step
+can REJECT or OMIT invalid combinations BEFORE consuming a runner (review #3). The
+adapters still reject at runtime — this just fails fast and keeps the matrix honest.
+
+  python3 tests/capability.py --sku b300 --backend deepep --mode ll --dtype fp8 \
+      --contract layout-and-dispatch-v1            # exit 0 if valid, 3 + reason if not
+  python3 tests/capability.py --list               # dump the table
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+
+# SKU -> vendor. The runner label's SKU prefix selects the launcher; vendor gates backend.
+SKU_VENDOR = {
+    "h100": "nvidia", "h200": "nvidia", "b200": "nvidia", "b300": "nvidia",
+    "gb200": "nvidia", "gb300": "nvidia", "h100-dgxc": "nvidia", "b200-dgxc": "nvidia",
+    "mi355x": "amd", "mi350x": "amd", "mi325x": "amd", "mi300x": "amd",
+}
+
+# Backend capability table — MIRRORS the adapter SUPPORTED_* sets (the runtime source of
+# truth). Keep in sync with ep_deepep.py / ep_mori.py. LL is decode-only; cached-layout is
+# normal-only; MoRI is bf16/normal/layout-and-dispatch only.
+CAP = {
+    "deepep": {
+        "vendors": ["nvidia"],
+        "modes": ["normal", "ll"],
+        "dtypes": ["bf16", "fp8"],
+        "contracts": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1"],
+        "transports": ["nvlink", "rdma"],
+    },
+    "mori": {
+        "vendors": ["amd"],
+        "modes": ["normal"],
+        "dtypes": ["bf16"],
+        "contracts": ["layout-and-dispatch-v1"],
+        "transports": ["xgmi", "rdma"],
+    },
+}
+# nccl/rccl are collective primitives, not EP dispatch/combine — phase is meaningless.
+COLLECTIVE = {"nccl": ["nvidia"], "rccl": ["amd"]}
+
+# 'all' resolves to a DEFINED per-vendor backend set (not the same across vendors).
+VENDOR_BACKENDS = {"nvidia": ["nccl", "deepep"], "amd": ["rccl", "mori"]}
+
+
+def resolve(sku, backend, mode="normal", dtype="bf16",
+            contract="layout-and-dispatch-v1"):
+    """Return (ok: bool, reason: str) for one SKU/backend/mode/dtype/contract request.
+
+    `sku` may carry a runner suffix ('b300_0' -> 'b300'); a pool variant missing from
+    SKU_VENDOR ('b200-multinode') falls back to its base SKU ('b200')."""
+    sku = (sku or "").split("_")[0]
+    vendor = SKU_VENDOR.get(sku) or SKU_VENDOR.get(sku.split("-")[0])
+    if vendor is None:
+        return False, f"unknown SKU '{sku}'"
+    if backend in COLLECTIVE:
+        if vendor not in COLLECTIVE[backend]:
+            return False, f"{backend} is not the {vendor} collective backend"
+        return True, "collective primitive (phase/dtype/mode/contract not applicable)"
+    cap = CAP.get(backend)
+    if cap is None:
+        return False, f"unknown backend '{backend}'"
+    if vendor not in cap["vendors"]:
+        return False, f"{backend} runs on {cap['vendors']}, not {vendor} SKU '{sku}'"
+    for field, got in (("modes", mode), ("dtypes", dtype), ("contracts", contract)):
+        if got not in cap[field]:
+            return False, f"{backend} {field}={cap[field]} (got '{got}')"
+    if mode == "ll" and contract == "cached-layout-comm-only-v1":
+        return False, "cached-layout-comm-only-v1 is meaningless for LL (layout is in-kernel)"
+    return True, "ok"
+
+
+def main() -> int:  # CLI entry: --list dumps the tables; else print the verdict, exit 0 or 3
+    parser = argparse.ArgumentParser(description="CollectiveX capability resolver")
+    for flag, default in (("--sku", None), ("--backend", None), ("--mode", "normal"),
+                          ("--dtype", "bf16"), ("--contract", "layout-and-dispatch-v1")):
+        parser.add_argument(flag, default=default)
+    parser.add_argument("--list", action="store_true")
+    opts = parser.parse_args()
+    if opts.list:
+        print(json.dumps({"sku_vendor": SKU_VENDOR, "cap": CAP,
+                          "collective": COLLECTIVE, "vendor_backends": VENDOR_BACKENDS}, indent=2))
+        return 0
+    ok, reason = resolve(opts.sku, opts.backend, opts.mode, opts.dtype, opts.contract)
+    print(f"{'VALID' if ok else 'INVALID'}: sku={opts.sku} backend={opts.backend} mode={opts.mode} "
+          f"dtype={opts.dtype} contract={opts.contract} — {reason}")
+    return 0 if ok else 3
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py
new file mode 100644
index 000000000..51ce43fbb
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP (NVIDIA), normal and low-latency (LL) modes.
+
+The harness owns the deterministic shared routing trace, the comm-only timing, and
+the doc; this file owns only DeepEP's API calls and its correctness reference.
+`make_problem` materializes the harness-provided rank slice (no RNG here), so every
+SKU runs the identical routed workload.
+
+Correctness (per DeepEP's intranode test): a pure dispatch->combine round trip with no
+expert compute reconstructs x only after dividing by the number of ranks each token was
+sent to, so the harness expects combined ≈ x * is_token_in_rank.sum(dim=1).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    from deep_ep import Buffer  # type: ignore
+    import deep_ep  # for version/provenance
+except Exception as exc:  # pragma: no cover - needs the built DeepEP
+    print("ERROR: deep_ep import failed — DeepEP must be present/built at job setup. "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _deepep_version() -> str:  # installed package metadata first, then deep_ep.__version__
+    try:
+        from importlib.metadata import version as _pkg_version
+        return _pkg_version("deep_ep")
+    except Exception:
+        return getattr(deep_ep, "__version__", "unknown")
+
+
+# DeepEP's normal-mode fp8 dispatch takes x as a (fp8, scales) tuple with a per-token
+# block-128 scale (deep_ep 1.2.1 ships NO helper for this — utils is empty — so we
+# implement the exact convention its kernels expect: scales [T, H//128] float32, e4m3,
+# 448 = e4m3 max). Both directions of the cast run OUTSIDE the timed window (cast in
+# make_problem, dequant in stage), so fp8 quantization is NOT included in dispatch time.
+_FP8_MAX = 448.0
+_FP8_BLOCK = 128
+
+
+def _per_token_cast_to_fp8(x):
+    """Cast x [T, H] (H % 128 == 0) to (e4m3fn [T, H], f32 per-block scales [T, H // 128])."""
+    rows, hidden = x.shape
+    blocks = x.float().view(rows, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    block_amax = blocks.abs().amax(dim=2).clamp(min=1e-4)     # floor keeps the division finite
+    quantized = (blocks * (_FP8_MAX / block_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
+    return quantized.view(rows, hidden), (block_amax / _FP8_MAX).contiguous()
+
+
+def _per_block_dequant(x_fp8, scales):
+    """Undo the per-token cast: e4m3 [R, H] * f32 scales [R, H // 128] -> bf16 [R, H]."""
+    rows, hidden = x_fp8.shape
+    blocks = x_fp8.float().view(rows, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    return (blocks * scales.unsqueeze(2)).view(rows, hidden).to(torch.bfloat16)
+
+
+def _per_block_dequant_3d(x_fp8, scales):
+    """LL recv layout: e4m3 [E, S, H] * f32 scales [E, S, H // 128] -> bf16 [E, S, H]."""
+    experts, slots, hidden = x_fp8.shape
+    blocks = x_fp8.float().view(experts, slots, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    return (blocks * scales.unsqueeze(-1)).view(experts, slots, hidden).to(torch.bfloat16)
+
+
+class DeepEPBackend:
+    """DeepEP adapter for the CollectiveX EP harness (normal and low-latency modes)."""
+    name = "deepep"
+    combine_needs_redispatch = False  # DeepEP combine reuses the handle (its own bench does too)
+    # Blackwell (B300) drops GPU clocks during the tiny small-T points, so the harness
+    # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100.
+    wants_warm_burst = True
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
+    #   normal mode: bf16 + fp8 (per-token block-128 cast) — validated intranode NVLink.
+    #   ll mode: low_latency_dispatch/combine — verified RUNNING intranode over NVLink via
+    #   allow_nvlink_for_low_latency_mode (IBGDA not required intranode) on 8xH100.
+    SUPPORTED_PRECISIONS = {"bf16", "fp8"}
+    SUPPORTED_MODES = {"normal", "ll"}
+    # Both contracts (review #3): layout-and-dispatch-v1 times get_dispatch_layout INSIDE
+    # dispatch; cached-layout-comm-only-v1 hoists the layout out (untimed) so dispatch is
+    # pure comm — matching DeepEP's own benchmark. (cached-layout applies to normal mode;
+    # LL has no separable layout — its low_latency_dispatch computes it internally.)
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.ll = (args.mode == "ll")
+        self.contract = args.measurement_contract
+        # hoist layout out of the timed dispatch only for the cached contract in normal mode.
+        self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest
+        # element at ~1/16 (3 mantissa bits); bf16 round-trip is ~5e-3. Tolerance is
+        # recorded in the artifact so the looser fp8 gate is explicit, not hidden.
+        self.fp8 = (args.dispatch_dtype == "fp8")
+        self.tolerance = 1.25e-1 if self.fp8 else 5e-2
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _deepep_version()
+        if self.ll:
+            self._init_ll(args, dev_sms, ver)
+        else:
+            self._init_normal(args, rank, dev_sms, ver)
+
+    def _init_normal(self, args, rank, dev_sms, ver):
+        # fp8 cast is done in make_problem / dequant in stage — both UNTIMED. So fp8
+        # quantization is NOT inside the dispatch timing for DeepEP normal mode.
+        self.fp8_in_timing = False if self.fp8 else None
+        self.combine_needs_redispatch = False  # normal combine reuses the handle
+        # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points
+        # (review: a phase-dependent 2/4 GiB made the shared T=128 point differ between
+        # the decode and prefill sweeps). 4 GiB holds T up to 4096 (validated).
+        num_nvl_bytes = int(os.environ.get("CX_DEEPEP_NVL_BYTES", str(4 * 1024 * 1024 * 1024)))
+        self.buffer = Buffer(self.group, num_nvl_bytes, 0)
+        rm, tuned_src = args.resource_mode, None
+        if rm == "normalized":
+            num_sms = max(2, 2 * round(args.sm_fraction * dev_sms / 2))  # ~MoRI's fraction, kept EVEN
+        elif rm == "tuned":
+            # Best-available for the installed DeepEP: its OWN default SM count
+            # (Buffer.num_sms — the library's analytic choice; it deliberately uses
+            # fewer SMs). get_dispatch_config(num_ranks) returns the recommended Config
+            # but doesn't expose num_sms to Python, and the default already reflects it.
+            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+            tuned_src = "deepep-default-num_sms"
+        else:  # default — the bring-up budget
+            num_sms = args.num_sms
+        try:
+            Buffer.set_num_sms(num_sms)
+        except Exception as exc:  # pragma: no cover - version dependent
+            if rank == 0:
+                print(f"WARN: could not set num_sms={num_sms}: {exc!r}", file=sys.stderr)
+            num_sms = int(getattr(Buffer, "num_sms", num_sms))  # record what is really in effect
+        self.backend_provenance = {
+            "deepep_version": ver,
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
+            "mode": "normal", "resource_mode": rm, "num_sms": num_sms, "device_sms": dev_sms,
+            "sm_fraction": (num_sms / dev_sms), "tuned_source": tuned_src or "n/a",
+            "num_nvl_bytes": num_nvl_bytes,
+        }
+
+    def _init_ll(self, args, dev_sms, ver):
+        # Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink
+        # via allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch
+        # so for fp8 the quantization IS inside the timed window (recorded honestly). The
+        # buffer is sized for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical),
+        # so LL is a decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop).
+        # set_num_sms does NOT apply (the LL kernel picks its own occupancy) — recorded n/a.
+        self.fp8_in_timing = (True if self.fp8 else None)
+        self.combine_needs_redispatch = True   # re-dispatch (untimed) before each timed combine
+        self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128"))
+        self.experts = args.experts
+        rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
+            self.num_max, args.hidden, self.world_size, args.experts)
+        # one QP per local expert is the DeepEP convention for LL
+        self.num_qps = max(1, args.experts // self.world_size)
+        self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True,
+                             num_qps_per_rank=self.num_qps,
+                             allow_nvlink_for_low_latency_mode=True)
+        self.backend_provenance = {
+            "deepep_version": ver,
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
+            "mode": "ll", "resource_mode": args.resource_mode,
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel",
+            "num_max_dispatch_tokens_per_rank": self.num_max,
+            "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps,
+            "low_latency_mode": True, "use_fp8": self.fp8,
+        }
+
+    def buffer_cap(self, args):
+        # LL is sized for a fixed num_max; cap the sweep there (reported, not silent).
+        return self.num_max if self.ll else None
+
+    def make_problem(self, T, idx, weights, x):
+        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared trace slice.
+        p = types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+                                  topk_weights=weights.to(torch.float32), layout=None)
+        if self.fp8 and not self.ll:
+            # normal mode: per-token block-128 cast, UNTIMED (preprocessing, mirrors the
+            # real producer that hands the dispatcher already-quantized activations).
+            # LL mode does NOT pre-cast — its kernel casts internally (timed).
+            p.x_fp8, p.x_scales = _per_token_cast_to_fp8(x)
+        if self.cache_layout:
+            # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed)
+            # so the timed dispatch is pure comm. (layout-and-dispatch-v1 leaves it None
+            # and dispatch computes it inside the timed window.)
+            ntr, _, ntpe, itir, _ = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+            p.layout = (ntr, ntpe, itir)
+        return p
+
+    def dispatch(self, p):
+        if self.ll:
+            return self._dispatch_ll(p)
+        if p.layout is not None:                       # cached-layout-comm-only-v1
+            num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout
+        else:                                          # layout-and-dispatch-v1 (timed layout)
+            (num_tokens_per_rank, _, num_tokens_per_expert,
+             is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        x_in = (p.x_fp8, p.x_scales) if self.fp8 else p.x  # tuple => DeepEP fp8 dispatch
+        recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
+            x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert)
+        return types.SimpleNamespace(
+            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
+            is_token_in_rank=is_token_in_rank)
+
+    def _dispatch_ll(self, p):
+        # x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the
+        # cast IS inside this timed op — fp8_in_timing=True). recv is the expert-major
+        # 3D layout [num_local_experts, num_max*world, hidden] (+scales when fp8).
+        recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch(
+            p.x, p.topk_idx, self.num_max, self.experts,
+            use_fp8=self.fp8, return_recv_hook=False)
+        return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle)
+
+    def stage(self, p, h):
+        # comm-only contract: "expert outputs" already exist as recv_x. Dequantize fp8 recv
+        # to bf16 HERE (untimed) — the expert-compute boundary — so combine moves bf16 in
+        # both precisions. Bf16 recv is staged as-is. (LL recv is 3D; normal recv is 2D.)
+        if self.ll:
+            if self.fp8:
+                recv_fp8, recv_scales = h.recv_x
+                h.combine_input = _per_block_dequant_3d(recv_fp8, recv_scales)
+            else:
+                h.combine_input = h.recv_x
+        elif self.fp8:
+            recv_fp8, recv_scales = h.recv_x
+            h.combine_input = _per_block_dequant(recv_fp8, recv_scales)
+        else:
+            h.combine_input = h.recv_x
+        return None
+
+    def combine(self, p, h):
+        if self.ll:
+            # weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones.
+            combined_x, _event, _hook = self.buffer.low_latency_combine(
+                h.combine_input, p.topk_idx, p.topk_weights, h.handle)
+            return combined_x
+        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle,
+                                               topk_weights=h.recv_topk_weights)
+        return combined_x
+
+    def expected(self, p, h):
+        if self.ll:
+            # LL combine reduces each token's topk expert copies weighted by topk_weights;
+            # with no expert compute each copy is (the kernel's fp8 cast of) x, so
+            # combined ≈ x * sum(topk_weights). fp8 quant error is covered by self.tolerance.
+            wsum = p.topk_weights.sum(dim=1, keepdim=True)
+            return p.x.float() * wsum, p.T
+        # normal: round trip with no expert compute reconstructs x*(#destination ranks);
+        # for fp8 compare against the dequantized cast that was actually sent.
+        ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
+        ref = p.x.float()
+        if self.fp8:
+            ref = _per_block_dequant(p.x_fp8, p.x_scales).float()
+        return ref * ranks_per_token, p.T
+
+    def recv_tokens(self, h):
+        if self.ll:
+            return int(h.recv_count.sum().item())  # token-copies received across local experts
+        rx = h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x
+        return int(rx.shape[0])
+
+    def finalize(self, rc):
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py
new file mode 100644
index 000000000..4b9c746ef
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_harness.py
@@ -0,0 +1,731 @@
+#!/usr/bin/env python3
+"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness.
+
+Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`)
+implement a small duck-typed protocol; this module owns the source-tokens-per-rank
+sweep, the timing, the correctness gate, and the provenance-tagged JSON doc.
+
+Fair-comparison contract (hardened after review — see notes.md / plan.md):
+  * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs +
+    gate weights are generated once from a fixed seed over the *global* batch and are
+    identical on every SKU; each rank materializes its slice. So every platform runs
+    the *same* problem (no per-rank/per-platform RNG in the adapters).
+  * **Explicit measurement contract** (review #3): adapters conform to a NAMED timing
+    boundary, they do not each choose their own. layout-and-dispatch-v1 times the
+    routing-layout step inside dispatch (the only contract MoRI can honor); cached-
+    layout-comm-only-v1 hoists it out (DeepEP). Combine excludes staging in both.
+    `isolated_sum` = SUM of the two isolated percentiles (NOT a measured chained op; see `roundtrip`).
+  * **Correct collective percentile**: each iteration's latency is reduced MAX across
+    ranks first (a collective finishes with its slowest rank), THEN percentiled —
+    `median_i(max_r)`, not `max_r(median_i)`.
+  * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and
+    `global_tokens = T * ep_size` are recorded for the weak/strong-scaling x toggle.
+
+stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported
+lazily inside run_sweep) so this file `py_compile`s without torch.
+
+Backend protocol:
+    name, mode, combine_needs_redispatch, backend_provenance(dict)
+    buffer_cap(args) -> int|None
+    make_problem(T, idx, weights, x) -> problem   # materialize this rank's trace slice
+    dispatch(problem) -> handle                   # pure dispatch comm (timed)
+    stage(problem, handle)                        # untimed expert-output placement
+    combine(problem, handle) -> tensor            # pure combine comm (timed)
+    expected(problem, handle) -> (tensor, n_cmp)  # correctness reference
+    recv_tokens(handle) -> int                    # realized tokens received this rank
+    finalize(rc) -> int|NoReturn
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+
+SCHEMA_VERSION = 3  # v3: explicit contracts, pooled trials p50/p90/p99, routing-identity proof, separated logical bytes
+
+# Default sweeps per phase. "decode"/"prefill" only name the small/large-token regime:
+# both run normal mode, they are NOT distinct kernels. Points are powers of two for a
+# clean log x-axis and are clamped to the backend buffer ceiling (MoRI's registerable heap).
+DECODE_LADDER = [1 << k for k in range(8)]        # 1, 2, 4, ..., 128
+PREFILL_LADDER = [1 << k for k in range(7, 13)]   # 128, 256, ..., 4096
+
+_DTYPE_BYTES = dict(bf16=2, fp16=2, fp8=1)
+
+
+def add_common_args(ap: argparse.ArgumentParser) -> None:
+    """Register on `ap` the CLI flags every backend shares (--backend comes from the entrypoint)."""
+    opt = ap.add_argument
+    opt("--phase", choices=["decode", "prefill"], default="decode",
+        help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder")
+    opt("--tokens-ladder", default="",
+        help="space/comma-separated source-tokens-per-rank sweep; blank = phase default")
+    opt("--hidden", default=7168, type=int)
+    opt("--topk", default=8, type=int)
+    opt("--experts", default=256, type=int, help="TOTAL experts (fixed across EP degrees)")
+    opt("--dispatch-dtype", choices=["bf16", "fp8"], default="bf16")
+    # Routing distributions: uniform = realistic top-k (fan-out ≈5.3 over EP8); balanced =
+    # load-equalized, one-expert-per-rank (fan-out = ep_size); balanced-rank-local = fan-out 1
+    # (min comm) edge case; zipf = skewed. The default is the REALISTIC one.
+    opt("--routing", default="uniform",
+        choices=["uniform", "balanced", "balanced-rank-local", "zipf",
+                 "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single"])
+    # EPLB (Expert-Parallel Load Balancer) replicates hot experts onto redundant physical slots
+    # and balanced-places them so per-rank load equalizes. It is a pure routing-trace transform
+    # (tests/eplb.py); experts becomes num_logical+redundant. The remedy for `zipf` skew.
+    opt("--eplb", action="store_true",
+        help="apply EPLB expert replication/placement to the routing trace")
+    opt("--num-redundant-experts", default=32, type=int,
+        help="EPLB: redundant physical expert slots (rounded up to a multiple of ep_size)")
+    # Canonical serialized workload (goal P1): read pre-generated trace bytes rather than run the
+    # seeded runtime generator, so a checksum match proves a result used the SAME workload as
+    # another machine's. Points at a dir of <workload_id>.npz/.manifest.json (make_workloads.py).
+    opt("--workload-dir", default="",
+        help="dir of canonical workload traces; empty = seeded runtime generation (dev)")
+    opt("--mode", choices=["normal", "ll"], default="normal",
+        help="kernel path: normal or low-latency (LL); LL is backend-dependent")
+    # Measurement contract: the EXPLICIT timing boundary all adapters conform to (review #3:
+    # an adapter must not pick its own boundary). Backends declare SUPPORTED_CONTRACTS and
+    # run_ep.py rejects an unsupported one.
+    #   layout-and-dispatch-v1     — routing-layout generation is INSIDE the dispatch timing
+    #                                (the only contract MoRI can honor: its layout is computed
+    #                                in the kernel and cannot be hoisted).
+    #   cached-layout-comm-only-v1 — layout computed ONCE, untimed; dispatch times pure comm
+    #                                (DeepEP-only; matches DeepEP's own benchmark).
+    # In BOTH contracts combine excludes staging (staging is untimed for every backend).
+    opt("--measurement-contract", default="layout-and-dispatch-v1",
+        choices=["layout-and-dispatch-v1", "cached-layout-comm-only-v1"])
+    opt("--num-sms", default=24, type=int,
+        help="DeepEP comm-SM budget in 'default' resource-mode (MoRI uses block_num/warps)")
+    # Resource regime (review: budgets were neither normalized nor tuned):
+    #   normalized — every backend limited to ~sm_fraction of its device's units (DeepEP
+    #                set_num_sms(frac·SMs); MoRI block_num≈frac·CUs). Fraction-based and
+    #                recorded: an approximate apples-to-apples, not identical work.
+    #   tuned      — the backend's recommended/auto launch config (best achievable).
+    #   default    — DeepEP --num-sms / MoRI 80 blocks (the bring-up budget).
+    opt("--resource-mode", choices=["normalized", "tuned", "default"],
+        default="normalized")
+    opt("--sm-fraction", default=0.18, type=float,
+        help="normalized mode: fraction of device SMs/CUs dedicated to comms (~24/132)")
+    opt("--num-ep-groups", default=1, type=int,
+        help="concurrent EP groups; >1 is REJECTED (real subgroup PGs unimplemented)")
+    opt("--seed", default=67, type=int)
+    # Why 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks and set
+    # up NVLink/NVSHMEM connections — with warmup=8 its dispatch read ~1787us (cold), with
+    # warmup>=30 it settles to ~85us (faster than H100, reproducible within ~2.5%).
+    # H100/MI355X reach steady state much sooner; for them the extra iters are harmless.
+    opt("--warmup", default=32, type=int)
+    opt("--iters", default=200, type=int,
+        help="timed iterations PER TRIAL; pooled across trials for percentiles")
+    # review #3: a p99 from ~50 samples is just the max. So pool iters x trials, randomize the
+    # token-order every trial (warmup/clock drift then can't correlate with T) and report
+    # p50/p90/p99 (p99 is the headline). 3 trials x 200 iters = 600 pooled samples per point.
+    opt("--trials", default=3, type=int,
+        help="independent timed trials, token-order randomized per trial; samples pooled")
+    opt("--allow-unknown-provenance", action="store_true",
+        help="permit a run with unpinned backend commit/version (default: fail)")
+    # provenance / output
+    opt("--runner", required=True)
+    opt("--topology-class", required=True)
+    opt("--transport", default="")
+    opt("--comparison-class", default="standardized")
+    # Structured placement metadata (goal P2 topology): GPUs/node, scale-up domain and placement
+    # kind make routing locality (local/same-node/cross-domain copy fractions) computable and
+    # tell packed/striped/adversarial apart. gpus-per-node=0 -> single node (= ep_size).
+    opt("--gpus-per-node", default=0, type=int)
+    opt("--scale-up-domain", default=0, type=int, help="0 = gpus_per_node*ep (one domain)")
+    opt("--placement", default="packed", choices=["packed", "striped", "runtime-native", "adversarial"])
+    opt("--env-json")
+    opt("--timestamp")
+    opt("--out", required=True)
+
+
+def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
+    """Resolve the sweep as (kept, dropped): an explicit `spec` wins over the phase default;
+    only distinct positive ints survive, sorted ascending; points above `cap` come back in
+    `dropped` instead of vanishing silently."""
+    explicit = bool(spec and spec.strip())
+    asked = (map(int, spec.replace(",", " ").split()) if explicit
+             else PREFILL_LADDER if phase != "decode" else DECODE_LADDER)
+    points = sorted({t for t in asked if t > 0})
+    if cap is None:
+        return points, []
+    return [t for t in points if t <= cap], [t for t in points if t > cap]
+
+
+def percentile(xs: list[float], q: float) -> float:
+    """q-th percentile (q in 0..100) of `xs`: the sample at the nearest sorted index; NaN if empty.
+    Exact .5 ties go UP — round() is half-to-even, so the tie pick used to flip with len(xs)."""
+    if not xs:
+        return float("nan")
+    return sorted(xs)[max(0, min(len(xs) - 1, int(q / 100.0 * (len(xs) - 1) + 0.5)))]
+
+
+def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
+    """CUDA-event latency of every timed iteration on THIS rank, in microseconds.
+
+    `pre` is None -> each sample times `fn()`. Otherwise `pre()` runs UNTIMED ahead of every
+    sample (followed by a device sync so its GPU work cannot bleed into the timed region) and
+    `fn(pre_result)` is timed; combine is isolated this way when it consumes the dispatch state
+    and needs a fresh untimed dispatch+stage per sample. The raw per-iteration series is
+    returned; the caller reduces across ranks per iteration before percentiling.
+    """
+    def staged_args():
+        if pre is None:
+            return ()
+        state = pre(); torch.cuda.synchronize()  # drain so none of pre's GPU work is timed
+        return (state,)
+
+    # Warmup syncs after EVERY iteration, not once at the end: the measured-roundtrip fn
+    # interleaves dispatch+combine on a backend's persistent comm buffer, so un-synced
+    # back-to-back iterations let iter N+1's dispatch race iter N's combine (CUDA abort on
+    # a rank -> NCCL-watchdog SIGABRT). Cheap (warmup is small); timed samples already sync.
+    for _ in range(max(0, warmup)):
+        fn(*staged_args())
+        torch.cuda.synchronize()
+    samples = []
+    for _ in range(iters):
+        call_args = staged_args()
+        begin, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+        begin.record()
+        fn(*call_args)
+        end.record()
+        torch.cuda.synchronize()
+        samples.append(begin.elapsed_time(end) * 1000.0)  # ms -> us
+    return samples
+
+
+def comparison_key(meta: dict) -> str:
+    """Machine key deciding which rows may share a curve, derived from the FIXED config ONLY
+    (tokens_per_rank is the x-axis, so it stays out). op/backend/mode/phase/ep_size/topology
+    all feed the key: EP4 vs EP8, normal vs LL, decode vs prefill and different SKUs are
+    labelled distinct, never silently overlaid. Returns 16 hex chars of a SHA-256."""
+    fields = [meta[k] for k in ("op", "backend", "mode", "phase")]
+    fields += [str(meta["ep_size"]), str(meta["nodes"]), meta.get("resource_mode", "default")]
+    fields += [meta[k] for k in ("topology_class", "comparison_class", "measurement_contract")]
+    fields.append(json.dumps(meta["shape"], sort_keys=True))
+    joined = "|".join(fields)
+    digest = hashlib.sha256(joined.encode()).hexdigest()
+    return digest[:16]
+
+
+def _reduce_vec(torch, dist, device, vals, op):
+    """All-reduce the float vector `vals` across ranks with `op`; returns the reduced list."""
+    buf = torch.tensor(vals, dtype=torch.float64, device=device); dist.all_reduce(buf, op=op)
+    return list(map(float, buf.tolist()))
+
+
+def _reduce_int(torch, dist, device, v: int, op) -> int:
+    """All-reduce the single integer `v` across ranks with `op`; returns the reduced int."""
+    cell = torch.tensor([int(v)], dtype=torch.int64, device=device); dist.all_reduce(cell, op=op)
+    return int(cell.item())
+
+
+def _allgather_floats(torch, dist, device, v: float) -> list[float]:
+    """Collect one scalar per rank into a rank-indexed list (per-rank diagnostics: straggler,
+    rank spread) — something all_reduce cannot provide because it collapses the values."""
+    mine = torch.tensor([float(v)], device=device, dtype=torch.float64)
+    slots = [torch.zeros(1, device=device, dtype=torch.float64) for _ in range(dist.get_world_size())]
+    dist.all_gather(slots, mine)
+    return [float(s.item()) for s in slots]
+
+
+def _histogram(xs: list[float], nbins: int = 40) -> dict:
+    """Compact equal-width (min..max) histogram of pooled cross-rank-max samples, for p99-spike
+    debugging without storing every sample. `bins` always equals len(counts), min/max are rounded."""
+    if not xs:
+        return {"n": 0}
+    lo, hi, nbins = min(xs), max(xs), max(1, nbins)
+    if hi <= lo:
+        counts = [len(xs)]  # all samples equal: ONE bin (was reported as bins=nbins, 1 count)
+    else:
+        counts = [0] * nbins
+        for x in xs:
+            counts[min(nbins - 1, int((x - lo) / (hi - lo) * nbins))] += 1
+    return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3),
+            "bins": len(counts), "counts": counts}
+
+
+def _provenance_unknown(prov: dict) -> list[str]:  # keys whose value is the string "unknown" (any case/padding)
+    return [key for key in prov if isinstance(prov[key], str) and prov[key].strip().lower() == "unknown"]
+
+
+def _resource_profile(prov: dict, args) -> dict:
+    """Translate backend-specific provenance into the backend-INDEPENDENT resource
+    vocabulary (goal P3): requested vs achieved comm-unit fraction, configured units/warps
+    and a conformance class. DeepEP units = SMs (num_sms); MoRI units = CU blocks (block_num)."""
+    sms = prov.get("num_sms")
+    units = sms if sms is not None else prov.get("block_num")
+    device_units = prov.get("device_sms") or prov.get("device_cus")
+    normalized = args.resource_mode == "normalized"
+    requested = args.sm_fraction if normalized else None
+    achieved = units / device_units if units and device_units else None
+    floored = bool(prov.get("block_num_floored"))
+    if floored:
+        klass = "minimum-functional"   # the backend needed MORE than requested to run
+    elif normalized:
+        klass = "resource-conforming"
+    elif args.resource_mode == "tuned" and "default" not in str(prov.get("tuned_source", "")):
+        klass = "best-known"
+    else:
+        klass = "backend-default"
+    tolerance = 0.10  # absolute slack on the fraction; only checked when one was requested
+    on_target = (achieved is not None and abs(achieved - requested) <= tolerance) if requested else None
+    return {
+        "comm_units_kind": "sm" if sms is not None else "cu_block",
+        "requested_fraction": requested, "configured_units": units, "device_units": device_units,
+        "achieved_fraction": round(achieved, 4) if achieved else None,
+        "warps_dispatch": prov.get("dispatch_warps"), "warps_combine": prov.get("combine_warps"),
+        "qps_per_rank": prov.get("num_qps_per_rank"),
+        "persistent_bytes": prov.get("num_nvl_bytes") or prov.get("num_rdma_bytes") or prov.get("heap_size"),
+        "tuned_source": prov.get("tuned_source"),
+        "conformance_class": klass, "tolerance": tolerance, "target_achieved_within_tol": on_target,
+        "nonconforming": floored,
+    }
+
+
+def _derive_publication_status(v: dict) -> str:
+    """Derive the publication state purely from the validity dimensions (goal P1): a
+    result can never be hand-labelled 'official', it has to clear every gate below."""
+    if v["execution_status"] != "complete":
+        return "failed"
+    gates_hold = (v["semantic_correctness"] == "pass"
+                  and v["measurement_conformance"] == "conformant"
+                  and v["workload_identity"] != "inconsistent")
+    if not gates_hold:
+        return "invalid"
+    # Past this point correctness and measurement conformance hold, so "sound" hinges
+    # only on a proven-consistent workload identity.
+    identity_proven = v["workload_identity"].startswith("consistent")
+    # resource-nonconforming but otherwise sound -> diagnostic (not a fair cross-platform point)
+    if v["resource_conformance"].endswith("nonconforming") or not identity_proven:
+        return "diagnostic"
+    if v["provenance_complete"] and v["workload_source"] == "canonical-serialized":
+        return "official"
+    return "comparable-experimental"   # measurement sound, missing a publication requirement
+
+
+def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
+    """Drive the source-tokens-per-rank sweep for one fully-specified line."""
+    import routing  # torch-based; imported lazily so the module byte-compiles without torch
+    import eplb     # stdlib planner + torch remap (the EPLB transform)
+
+    ep_size = world_size  # num_ep_groups removed (was metadata-only; no real subgroups)
+    # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the
+    # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL
+    # experts then remapped to physical (build_trace), so the whole sweep runs over the
+    # balanced physical placement with no adapter change.
+    eplb_on = getattr(args, "eplb", False)
+    num_logical = getattr(args, "num_logical_experts", args.experts)
+    if args.experts % ep_size != 0:
+        if rank == 0:
+            print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})")
+        return 2
+    experts_per_rank = args.experts // ep_size
+    elem_bytes = _DTYPE_BYTES.get(args.dispatch_dtype, 2)
+
+    # Provenance gate (review #1): refuse a comparison run with unpinned backend info.
+    unknown = _provenance_unknown(backend.backend_provenance)
+    if unknown and not args.allow_unknown_provenance:
+        if rank == 0:
+            print(f"ERROR: unpinned provenance {unknown} in {backend.backend_provenance}; "
+                  f"set the commit/version env or pass --allow-unknown-provenance.")
+        return 4
+
+    cap = backend.buffer_cap(args)
+    ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
+    if rank == 0 and dropped:
+        print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
+              f"(hidden={args.hidden}); not silently truncated.")
+    if not ladder:
+        if rank == 0:
+            print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
+        return 2
+    # MoRI wedges on a COLD dispatch that jumps straight to a large T; it sets
+    # needs_gradual_ramp so the sweep approaches its max T via a geometric ramp from 1
+    # (validated on MI355X). A naturally-gradual ladder (decode) is unchanged.
+    if getattr(backend, "needs_gradual_ramp", False):
+        top, ramp, t = ladder[-1], [], 1
+        while t < top:
+            ramp.append(t); t *= 2
+        ramp.append(top)
+        if rank == 0 and ramp != ladder:
+            print(f"NOTE: {backend.name} sweep ramped gradually 1..{top} (cold-jump-safe): {ramp}")
+        ladder = ramp
+
+    MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM
+
+    # EPLB plan (once): estimate logical load from the global logical trace at the largest
+    # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB
+    # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps
+    # to physical when the plan is present; otherwise it's the identity (logical == physical).
+    eplb_plan = None
+    if eplb_on:
+        ref_idx, _ = routing.build_global_routing(max(ladder) * ep_size, num_logical, args.topk,
+                                                  args.routing, args.seed, num_logical // ep_size)
+        load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist()
+        eplb_plan = eplb.build_plan(load, args.experts, ep_size)
+        if rank == 0:
+            print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); "
+                  f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> "
+                  f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts "
+                  f"replicated (hottest {eplb_plan['max_replicas']}x)")
+
+    canonical = bool(getattr(args, "workload_dir", ""))
+    loaded_workload_ids, loaded_checksums = [], {}
+    if canonical:
+        import workload as _wl
+
+    def build_trace(gt):
+        # canonical: load pre-serialized trace bytes (verified by checksum) so this run is
+        # provably the SAME workload as any other consuming the same files. else: seeded gen.
+        if canonical:
+            wid = _wl.compute_workload_id(args.routing, args.hidden, args.topk, num_logical, gt, args.seed)
+            idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True)
+            idx_l = torch.from_numpy(idx_np).to(torch.int64)
+            w = torch.from_numpy(w_np).to(torch.float32)
+            if wid not in loaded_workload_ids:
+                loaded_workload_ids.append(wid)
+                loaded_checksums[wid] = man.get("checksums")
+        else:
+            idx_l, w = routing.build_global_routing(gt, num_logical, args.topk, args.routing,
+                                                    args.seed, num_logical // ep_size)
+        return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w
+
+    # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold
+    # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually
+    # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone
+    # and is also cold-jump-safe for MoRI.
+    warm_T = min(ladder[-1], 128)
+    warm_shapes = [t for t in ladder if t <= warm_T] or [ladder[0]]
+    for wt in warm_shapes:
+        wi, ww = build_trace(wt * ep_size)
+        wsi, wsw = routing.rank_slice(wi, ww, rank, wt)
+        wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16)
+        wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx)
+        for _ in range(8):
+            wh = backend.dispatch(wp); backend.stage(wp, wh); backend.combine(wp, wh)
+    torch.cuda.synchronize()
+    try:
+        dist.barrier()
+    except Exception:
+        pass
+    # Per-point clock-ramp burst (set up below, applied inside the loop): a ONE-TIME burst
+    # warms clocks, but on Blackwell (B300) the tiny small-T points let clocks drop again,
+    # so a mid-sweep T=64 reads ~20x cold. Re-ramping at EACH shape keeps every timed point
+    # steady-state. Gated by backend.wants_warm_burst — MoRI WEDGES on a sustained burst
+    # (and is already steady at warmup=8), so it opts out. CX_FABRIC_WARM_BURST overrides.
+    warm_burst = int(os.environ.get("CX_FABRIC_WARM_BURST", "40"))
+    do_burst = warm_burst > 0 and getattr(backend, "wants_warm_burst", False)
+
+    import random as _random
+    elem_dispatch = elem_bytes          # fp8=1 / bf16=2 (dispatch payload element size)
+    tol = getattr(backend, "tolerance", 5e-2)
+
+    # ---- Pass 1: build the per-T problem ONCE (deterministic trace + cached layout per
+    # contract), run the correctness gate ONCE. Timing is Pass 2 (pooled over trials). ----
+    problems, gate = {}, {}
+    routing_hashes = set()
+    for T in ladder:
+        gt = T * ep_size
+        idx_g, w_g = build_trace(gt)
+        rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g)
+        gpn = args.gpus_per_node or ep_size
+        rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, T, gpn,
+                                                      args.scale_up_domain or None)
+        routing_hashes.add(rstats["routing_hash"])
+        idx_s, w_s = routing.rank_slice(idx_g, w_g, rank, T)
+        x = routing.rank_activations(T, args.hidden, args.seed, rank, device, torch.bfloat16)
+        problem = backend.make_problem(T, idx_s.to(device), w_s.to(device), x)
+        h = backend.dispatch(problem); backend.stage(problem, h)
+        combined = backend.combine(problem, h)
+        torch.cuda.synchronize()
+        recv_local = backend.recv_tokens(h)
+        exp, n_cmp = backend.expected(problem, h)
+        max_abs = (combined[:n_cmp].float() - exp[:n_cmp].float()).abs().max().item()
+        max_rel = max_abs / (exp[:n_cmp].float().abs().max().item() + 1e-6)
+        problems[T] = problem
+        gate[T] = {"rstats": rstats, "recv_local": recv_local,
+                   "max_rel": max_rel, "local_ok": 1 if max_rel < tol else 0}
+
+    # ---- Pass 2: N timed trials. Token order is randomized PER TRIAL (seeded ⇒ identical
+    # on every rank, so collectives stay lock-step) so warmup/clock drift can't correlate
+    # with T. Per-iteration cross-rank MAX samples are POOLED across trials, then
+    # percentiled (review #3: p99 from one 50-iter run is just the max). MoRI keeps
+    # ascending order — it wedges on a cold jump to a large T. ----
+    disp_pool = {T: [] for T in ladder}     # pooled per-iteration cross-rank MAX (dispatch)
+    comb_pool = {T: [] for T in ladder}     # ... combine
+    rt_pool = {T: [] for T in ladder}       # ... INDEPENDENTLY-MEASURED round trip (goal P1)
+    disp_local = {T: [] for T in ladder}    # THIS rank's own dispatch samples (per-rank diag)
+    order = list(ladder)
+    rng = _random.Random(args.seed)
+    shuffle_ok = not getattr(backend, "needs_gradual_ramp", False)
+    for trial in range(max(1, args.trials)):
+        if shuffle_ok:
+            rng.shuffle(order)
+        for T in order:
+            problem = problems[T]
+            if do_burst:   # re-ramp clocks at THIS shape before timing (Blackwell)
+                for _ in range(warm_burst):
+                    bh = backend.dispatch(problem); backend.stage(problem, bh); backend.combine(problem, bh)
+                torch.cuda.synchronize()
+            disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p), args.warmup, args.iters)
+
+            def prep(p=problem):
+                hh = backend.dispatch(p); backend.stage(p, hh); return hh
+            if backend.combine_needs_redispatch:
+                comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
+                                     args.warmup, args.iters, pre=prep)
+            else:
+                hh = prep()
+                comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
+                                     args.warmup, args.iters)
+            # MEASURED round trip (goal P1: not a sum of percentiles): one timed region over
+            # dispatch -> stage (no-op "expert" transform) -> combine -> output ready. Captures
+            # shared sync / launch amortization / overlap that the isolated_sum cannot.
+            def rt_once(p=problem):
+                hh = backend.dispatch(p); backend.stage(p, hh); return backend.combine(p, hh)
+            rt_iters = time_us(torch, lambda p=problem: rt_once(p), args.warmup, args.iters)
+            # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled.
+            disp_pool[T] += _reduce_vec(torch, dist, device, disp_iters, MAX)
+            comb_pool[T] += _reduce_vec(torch, dist, device, comb_iters, MAX)
+            rt_pool[T] += _reduce_vec(torch, dist, device, rt_iters, MAX)
+            disp_local[T] += disp_iters
+
+    # ---- Pass 3: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ----
+    def pcts(xs):
+        return {"p50": percentile(xs, 50), "p90": percentile(xs, 90),
+                "p95": percentile(xs, 95), "p99": percentile(xs, 99)}
+    rows = []
+    for T in ladder:
+        gt = T * ep_size
+        g = gate[T]; rstats = g["rstats"]
+        d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T]
+        dp, cp, rtp = pcts(d), pcts(c), pcts(rt)
+        # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op
+        # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput
+        # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency.
+        isum = {k: dp[k] + cp[k] for k in dp}
+        recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM)
+        recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX)
+        recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN)
+        global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN)
+        max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0]
+        point_ok = bool(global_ok) and recv_total > 0
+        # Per-rank diagnostics: gather each rank's own dispatch median -> spread + straggler.
+        per_rank_med = _allgather_floats(torch, dist, device, percentile(disp_local[T], 50))
+        slowest_rank = max(range(len(per_rank_med)), key=lambda i: per_rank_med[i])
+        rmean = sum(per_rank_med) / len(per_rank_med)
+        # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv
+        # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy
+        # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert.
+        token_rank_copies = rstats["routed_copies"]
+        token_expert_copies = gt * args.topk
+        H = args.hidden
+        rows.append({
+            "tokens_per_rank": T, "global_tokens": gt,
+            "dispatch": dp, "combine": cp, "roundtrip": rtp, "isolated_sum": isum,
+            # flat aliases kept for back-compat with v3 readers
+            "dispatch_us_p50": dp["p50"], "dispatch_us_p90": dp["p90"], "dispatch_us_p99": dp["p99"],
+            "combine_us_p50": cp["p50"], "combine_us_p90": cp["p90"], "combine_us_p99": cp["p99"],
+            "roundtrip_us_p50": rtp["p50"], "roundtrip_us_p90": rtp["p90"],
+            "roundtrip_us_p95": rtp["p95"], "roundtrip_us_p99": rtp["p99"],
+            "isolated_sum_us_p50": isum["p50"], "isolated_sum_us_p99": isum["p99"],
+            "samples_pooled": len(d), "trials": max(1, args.trials),
+            "percentile_interpolation": "nearest-rank",
+            "recv_tokens_max": recv_max, "recv_tokens_min": recv_min,
+            "recv_tokens_mean": recv_total / world_size, "recv_tokens_total": recv_total,
+            "per_rank_dispatch_us": {"min": min(per_rank_med), "mean": rmean,
+                                     "max": max(per_rank_med), "spread": max(per_rank_med) - min(per_rank_med),
+                                     "slowest_rank": slowest_rank},
+            # dispatch carries its dtype's element size; combine input is bf16 (2B).
+            "dispatch_logical_bytes": token_rank_copies * H * elem_dispatch,
+            "combine_logical_bytes": token_rank_copies * H * 2,
+            "byte_contracts": {
+                "token_rank_payload_copies": token_rank_copies,
+                "token_expert_payload_copies": token_expert_copies,
+                "dispatch_bytes": token_rank_copies * H * elem_dispatch,
+                "combine_bytes": token_rank_copies * H * 2,
+                "fp8_scale_bytes": (token_rank_copies * (H // 128) * 4) if elem_dispatch == 1 else 0,
+                "routing_index_bytes": token_expert_copies * 4,   # int32 topk_idx
+                "gate_weight_bytes": token_expert_copies * 4,     # f32 topk_weights
+            },
+            "byte_contract": "logical-routed-payload-v1",
+            # throughput from the MEASURED round trip ONLY (not isolated_sum).
+            "roundtrip_tokens_per_second": (gt / (rtp["p50"] * 1e-6)) if rtp["p50"] > 0 else None,
+            "raw_samples": {"dispatch": _histogram(d), "combine": _histogram(c), "roundtrip": _histogram(rt)},
+            "fanout_mean": rstats["fanout_mean"], "fanout_max": rstats["fanout_max"],
+            "routed_copies": rstats["routed_copies"], "expert_load_max": rstats["expert_load_max"],
+            "routing_hash": rstats["routing_hash"], "locality": rstats.get("locality"),
+            "correct": point_ok, "max_rel_error": max_rel,
+        })
+        if rank == 0:
+            print(f"  T={T:<5} disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} comb {cp['p50']:6.1f}/{cp['p99']:6.1f} "
+                  f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(d)} fanout={rstats['fanout_mean']:.2f} "
+                  f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} "
+                  f"straggler=r{slowest_rank} correct={point_ok}")
+
+    # Cross-rank workload-identity proof: every rank must have built the SAME global routing
+    # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and
+    # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing.
+    trace_sig = int(hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest()[:15], 16)
+    sig_min = _reduce_int(torch, dist, device, trace_sig, MIN)
+    sig_max = _reduce_int(torch, dist, device, trace_sig, MAX)
+    routing_consistent = (sig_min == sig_max == trace_sig)
+
+    if rank != 0:
+        return 0
+
+    # status=valid requires correctness AND a proven-identical routing trace across ranks.
+    all_ok = bool(rows) and all(r["correct"] for r in rows) and routing_consistent
+
+    # ---- Multi-dimensional validity (goal P1) -> MACHINE-DERIVED publication_status. Adapters
+    # never self-label "official"; status is a pure function of these gates. ----
+    prov = backend.backend_provenance
+    prov_unknown = _provenance_unknown(prov)
+    repro = getattr(args, "reproduction_full", {})
+    git_run = getattr(args, "git_run", None)
+    provenance_complete = (not prov_unknown
+                           and bool(getattr(args, "image_digest", ""))
+                           and bool(git_run) and all((git_run or {}).get(k) for k in ("run_id", "source_sha")))
+    floored = bool(prov.get("block_num_floored"))
+    resource_conformance = ("minimum-functional-nonconforming" if floored
+                            else ("resource-conforming" if args.resource_mode == "normalized"
+                                  else "backend-default" if args.resource_mode in ("tuned", "default")
+                                  else "unspecified"))
+    # record the canonical workload identity consumed (one trace per T -> set of ids/checksums).
+    if canonical and loaded_workload_ids:
+        args.workload_id = (loaded_workload_ids[0] if len(loaded_workload_ids) == 1
+                            else f"set:{len(loaded_workload_ids)}:{loaded_workload_ids[0]}")
+        args.workload_checksums = loaded_checksums
+    canonical_workload = bool(getattr(args, "workload_id", None))
+    validity = {
+        "execution_status": "complete" if rows else "failed",
+        "semantic_correctness": "pass" if (rows and all(r["correct"] for r in rows)) else "fail",
+        "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent",
+        "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime",
+        "measurement_conformance": "conformant",   # run_ep gate rejects nonconformant pre-run
+        "resource_conformance": resource_conformance,
+        "provenance_complete": provenance_complete,
+    }
+    publication_status = _derive_publication_status(validity)
+
+    shape = {  # FIXED line identity (no T, no per-backend resource knobs)
+        "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
+        "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype,
+        "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical,
+    }
+    meta = {
+        "op": "ep-dispatch-combine", "backend": backend.name, "mode": args.mode,
+        "phase": args.phase, "world_size": world_size, "ep_size": ep_size,
+        "resource_mode": args.resource_mode,
+        "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+        "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+        # honest contract name (was the misleading "comm-only-v1": dispatch INCLUDES layout
+        # under layout-and-dispatch-v1). Adapters declare which they conform to.
+        "measurement_contract": args.measurement_contract, "shape": shape,
+        # structured placement metadata (goal P2 topology) — replaces the bare topology string.
+        "placement": {
+            "kind": args.placement, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+            "gpus_per_node": args.gpus_per_node or ep_size,
+            "scale_up_domain": args.scale_up_domain or ((args.gpus_per_node or ep_size) * 1),
+            "ranks": ep_size, "transport": args.transport,
+        },
+    }
+    headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "tests/run_ep.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner, "transport": args.transport,
+        # Multi-dimensional validity + machine-derived publication status (goal P1). `status`
+        # is a back-compat alias (legacy v3 readers) — publication_status is authoritative.
+        "validity": validity,
+        "publication_status": publication_status,
+        "status": "valid" if all_ok else "invalid",
+        "workload": {
+            "source": validity["workload_source"],
+            "workload_id": getattr(args, "workload_id", None),
+            "manifest_checksums": getattr(args, "workload_checksums", None),
+            "trace_signature": f"{trace_sig:015x}",
+            "distinct_per_T_hashes": sorted(routing_hashes),
+            # within-run (cross-rank) identity is PROVEN here; cross-hardware identity holds
+            # only if another run records the SAME trace_signature / workload_id.
+            "cross_rank_consistent": routing_consistent,
+        },
+        "comparison_key": comparison_key(meta),
+        "x_axis": {"primary": "tokens_per_rank",
+                   "global_relation": "global_tokens = tokens_per_rank * ep_size"},
+        "backend_provenance": backend.backend_provenance,
+        # backend-independent resource vocabulary + conformance class (goal P3).
+        "resource_profile": _resource_profile(backend.backend_provenance, args),
+        "reproduction": {
+            "command": getattr(args, "reproduction_command", ""),
+            "image": getattr(args, "image", "") or None,
+            "image_digest": getattr(args, "image_digest", "") or None,
+            "image_arch": getattr(args, "image_arch", None),
+            "squash_sha256": getattr(args, "squash_sha256", None),
+            "git_run": getattr(args, "git_run", None),   # repo/run/attempt/ref/sha/job/artifact
+            # redaction (goal P1): command + provenance carry NO hostnames/IPs/UUIDs/private paths;
+            # per-node env (hostnames, GPU UUIDs, NIC GUIDs) lives in the separate gitignored
+            # env_json (CI uploads it as a workflow artifact), never inlined into this record.
+            "redaction": "no hostnames/IPs/UUIDs/private-paths in command or provenance",
+            "seed": args.seed, "warmup": args.warmup, "iters": args.iters,
+            "trials": max(1, args.trials), "samples_per_point": (max(1, args.trials) * args.iters),
+            "measurement_contract": args.measurement_contract,
+            "dispatch_dtype": args.dispatch_dtype, "mode": args.mode,
+            "fp8_quant_in_timing": getattr(backend, "fp8_in_timing", None),
+        },
+        **meta,
+        "correctness": {"passed": all_ok,
+                        "max_rel_error": max((r["max_rel_error"] for r in rows), default=None),
+                        "tolerance": getattr(backend, "tolerance", 5e-2), "points": len(rows),
+                        # honest scope: round-trip reconstruction + non-silent recv, NOT a full
+                        # per-token routing/ordering/weight/padding proof (review #3).
+                        "scope": "roundtrip-reconstruction-smoke-v1"},
+        "routing_identity": {   # cryptographic workload-identity proof (review #3)
+            "consistent_across_ranks": routing_consistent,
+            "trace_signature": f"{trace_sig:015x}",
+            "distinct_per_T_hashes": sorted(routing_hashes),
+        },
+        # EPLB plan + the per-rank load imbalance it removes (the headline of the zipf+EPLB
+        # comparison). enabled=False when the run did not apply EPLB.
+        "eplb": ({"enabled": True, "num_logical_experts": num_logical,
+                  "num_physical_experts": args.experts,
+                  "num_redundant": args.experts - num_logical,
+                  "imbalance_before": eplb_plan["imbalance_before"],
+                  "imbalance_after": eplb_plan["imbalance_after"],
+                  "replicated_experts": eplb_plan["replicated_experts"],
+                  "max_replicas": eplb_plan["max_replicas"]}
+                 if eplb_plan else {"enabled": False}),
+        "routing_profile": {
+            "routing": args.routing,
+            "fanout_mean": sum(r["fanout_mean"] for r in rows) / len(rows),
+            "fanout_max": max(r["fanout_max"] for r in rows),
+            "headline_hash": headline["routing_hash"],
+        },
+        "metrics": {   # p99 is the headline percentile (review #3); p50/p90/p95 also kept per row
+            "headline_tokens_per_rank": headline["tokens_per_rank"],
+            "headline_percentile": "p99",
+            "dispatch_us_p50": headline["dispatch_us_p50"], "dispatch_us_p99": headline["dispatch_us_p99"],
+            "combine_us_p50": headline["combine_us_p50"], "combine_us_p99": headline["combine_us_p99"],
+            "roundtrip_us_p50": headline["roundtrip_us_p50"], "roundtrip_us_p99": headline["roundtrip_us_p99"],
+            "isolated_sum_us_p50": headline["isolated_sum_us_p50"], "isolated_sum_us_p99": headline["isolated_sum_us_p99"],
+            "isolated_sum_label": "sum of isolated dispatch+combine percentiles — NOT a measured chained op",
+            "roundtrip_tokens_per_second": headline["roundtrip_tokens_per_second"],
+        },
+        "rows": rows, "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(f"{backend.name} ep-dispatch-combine [{args.phase}/{args.mode}/{args.measurement_contract}]: "
+          f"status={doc['status']} {len(rows)} pts, routing_consistent={routing_consistent}, "
+          f"headline T={headline['tokens_per_rank']} disp_p99={headline['dispatch_us_p99']:.1f}us "
+          f"-> {args.out}")
+    return 0 if all_ok else 1
diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py
new file mode 100644
index 000000000..363736485
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_mori.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — MoRI (AMD ROCm), normal mode.
+
+The harness owns the deterministic shared routing trace and the comm-only timing;
+this file owns MoRI's API and the ionic_rdma-fabric constraints found on MI355X
+(validated on-node, see CONTAINERS.md): the whole symmetric heap is one RDMA MR
+capped at ~4 GiB (hold at 2 GiB; bound buffers via max_num_inp_token_per_rank ⇒
+buffer_cap); combine() resets recv_num (read it before combine; compare only the
+first T rows); and the post-shmem_finalize teardown asserts (finalize hard-exits).
+
+`make_problem` now materializes the harness-provided rank slice, so MoRI honors the
+requested routing (it is no longer always-uniform) and runs the identical workload to
+the NVIDIA SKUs. combine_needs_redispatch=True: combine consumes recv_num, so the
+harness re-dispatches (untimed) before each timed combine sample.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+# MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init — set BEFORE
+# `import mori`. 2 GiB registers on the MI355X ionic_rdma NICs; larger fails.
+# Precedence: an already-exported MORI_SHMEM_HEAP_SIZE wins (setdefault never overwrites
+# it); otherwise CX_MORI_HEAP_SIZE is used, falling back to "2G".
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
+                      os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
+
+import torch
+import torch.distributed as dist
+
+try:
+    import mori  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the AMD MoRI image
+    print("ERROR: mori import failed — needs the AMD MoRI image "
+          f"(rocm/sgl-dev:...-mori-...). {exc!r}", file=sys.stderr)
+    raise
+
+
+class MoRIBackend:
+    """CollectiveX EP backend adapter that drives MoRI's ``EpDispatchCombineOp``.
+
+    Declares bf16 / normal mode / layout-and-dispatch-v1 as its only supported combination
+    and exposes the methods defined below (buffer_cap, make_problem, dispatch, stage,
+    combine, expected, recv_tokens, finalize) plus class-level flags for the harness.
+    """
+    name = "mori"
+    combine_needs_redispatch = True
+    # MoRI wedges on a COLD dispatch jumping straight to a large T (validated on
+    # MI355X); the harness ramps this backend's ladder geometrically from 1.
+    needs_gradual_ramp = True
+    # MoRI WEDGES under a sustained warm-up burst (the harness's Blackwell clock-ramp)
+    # and is already steady at a short warm-up (~44us, reproducible) — so it opts out.
+    wants_warm_burst = False
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
+    # MoRI exposes quant_type (fp8) in EpDispatchCombineConfig; added once validated.
+    SUPPORTED_PRECISIONS = {"bf16"}        # + "fp8" once the fp8 quant_type path is wired
+    SUPPORTED_MODES = {"normal"}           # MoRI has no separate low-latency entrypoint
+    # MoRI computes its routing layout INSIDE the dispatch kernel (block_num/warps launch);
+    # it cannot be hoisted, so MoRI honors only the layout-and-dispatch contract. Cross-
+    # vendor comparisons must therefore use layout-and-dispatch-v1 (the common contract).
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Pick the launch geometry, bring up MoRI shmem, and build the dispatch/combine op.
+
+        Args:
+            args: parsed harness arguments; this constructor reads dispatch_dtype, mode,
+                experts, resource_mode, sm_fraction (normalized mode only), hidden and topk.
+            rank: this process's rank, passed through to the MoRI config.
+            world_size: number of ranks; also used as the EP size.
+            local_rank: accepted for signature parity; not used in this constructor.
+            device: device whose multi_processor_count sizes the normalized block budget
+                and on which make_problem allocates the scales tensor.
+
+        Environment read here: CX_MORI_MIN_BLOCKS, CX_MORI_BLOCK_NUM,
+        CX_MORI_DISPATCH_WARPS, CX_MORI_COMBINE_WARPS, COLLECTIVEX_IMAGE, MORI_COMMIT,
+        MORI_SHMEM_HEAP_SIZE, and (via buffer_cap) CX_MORI_MAX_TOKENS.
+
+        Raises:
+            AssertionError: if args.dispatch_dtype / args.mode are outside the supported sets.
+        """
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        self.fp8_in_timing = None  # set when fp8 dispatch is used (whether the cast is timed)
+        self.ep_size = world_size
+        self.experts_per_rank = args.experts // self.ep_size
+        dev_cus = torch.cuda.get_device_properties(device).multi_processor_count
+        # Resource regime — map the comm budget onto CUs to mirror DeepEP's SM fraction.
+        #   normalized: block_num ≈ sm_fraction · CUs (≈ the same device fraction);
+        #   tuned: MoRI launch auto-tuning (API not present in this build — uses default,
+        #          labeled tuned_source); default: the 80-block bring-up budget.
+        # MoRI DEADLOCKS at T>=32 when block_num is reduced toward the normalized target
+        # (validated on MI355X g15: block_num=46 wedges, 80 completes T=32/64 with the
+        # realistic fan-out≈5.3 trace). So MoRI cannot be normalized down to DeepEP's
+        # device fraction; floor it at a known-functional minimum and record that the
+        # target fraction was NOT reached.
+        rm = args.resource_mode
+        floor = int(os.environ.get("CX_MORI_MIN_BLOCKS", "80"))  # functional minimum (deadlocks lower)
+        env_blocks = os.environ.get("CX_MORI_BLOCK_NUM")
+        self._block_floored = False
+        if env_blocks:
+            # Explicit override: used as-is, the floor is NOT applied on this path.
+            self.block_num = int(env_blocks)
+            self._block_target = self.block_num
+        elif rm == "normalized":
+            self._block_target = max(1, round(args.sm_fraction * dev_cus))
+            self.block_num = max(floor, self._block_target)
+            self._block_floored = self.block_num > self._block_target
+        else:  # tuned (no launch auto-tune API in mori-0227-2) / default
+            self.block_num = 80
+            self._block_target = 80
+        # NOTE(review): this label depends only on resource_mode and the floored flag, so
+        # with CX_MORI_BLOCK_NUM set it still reads "default-80" (tuned) or "n/a" — confirm
+        # whether an override should be labeled distinctly in provenance.
+        self._tuned_source = ("default-80" if rm == "tuned" else
+                              ("normalized-floored" if self._block_floored else "n/a"))
+        self.dispatch_warps = int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))
+        self.combine_warps = int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))
+
+        # Register the WORLD group under the name "default", then initialize MoRI shmem
+        # from that named group.
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)
+        mori.shmem.shmem_torch_process_group_init("default")
+
+        self._cap = self.buffer_cap(args)
+        self.config = mori.ops.EpDispatchCombineConfig(
+            data_type=torch.bfloat16, rank=rank, world_size=world_size,
+            hidden_dim=args.hidden, scale_dim=0,
+            scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
+            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+            max_num_inp_token_per_rank=max(512, self._cap),
+            num_experts_per_rank=self.experts_per_rank,
+            num_experts_per_token=args.topk,
+            use_external_inp_buf=False, quant_type="none",
+        )
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+        # Provenance: MoRI has no pip version; pin via MORI_COMMIT, else the image tag
+        # the launcher exported (COLLECTIVEX_IMAGE carries the mori build tag), so the
+        # provenance gate has something real rather than "unknown".
+        img = os.environ.get("COLLECTIVEX_IMAGE", "")
+        mori_commit = os.environ.get("MORI_COMMIT") or (f"image:{img}" if img else "unknown")
+        self.backend_provenance = {
+            "mori_commit": mori_commit,
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "resource_mode": args.resource_mode, "block_num": self.block_num,
+            "block_num_target": self._block_target, "block_num_floored": self._block_floored,
+            "dispatch_warps": self.dispatch_warps, "combine_warps": self.combine_warps,
+            "device_cus": dev_cus, "sm_fraction": (self.block_num / dev_cus),
+            "tuned_source": self._tuned_source,
+        }
+
+    def buffer_cap(self, args):
+        """Return the tokens-per-rank cap: CX_MORI_MAX_TOKENS as an int, default 512.
+
+        ``args`` is accepted but not consulted here.
+        """
+        # Largest tokens/rank the 2 GiB registerable heap holds at hidden=7168 (512,
+        # validated on-node). Override via CX_MORI_MAX_TOKENS.
+        return int(os.environ.get("CX_MORI_MAX_TOKENS", "512"))
+
+    def make_problem(self, T, idx, weights, x):
+        """Package this rank's slice of the shared trace in the dtypes MoRI is given.
+
+        Returns a SimpleNamespace with T, x (passed through unchanged), indices (idx cast
+        to int32), weights (cast to float32) and scales (an empty (T, 0) float8_e4m3fnuz
+        tensor on self.device).
+        """
+        # Shared-trace slice: idx[T,topk] -> int32 (MoRI expects int32 expert ids);
+        # weights[T,topk] f32; x[T,hidden] bf16; scales is a real (T,0) fp8 tensor
+        # (not None) since scale_dim==0.
+        indices = idx.to(torch.int32)
+        scales = torch.empty((T, 0), dtype=torch.float8_e4m3fnuz, device=self.device)
+        return types.SimpleNamespace(T=T, x=x, indices=indices,
+                                     weights=weights.to(torch.float32), scales=scales)
+
+    def dispatch(self, p):
+        """Run one MoRI dispatch for problem ``p`` and capture what combine will need.
+
+        Returns a SimpleNamespace holding dispatch_output, dispatch_weights,
+        dispatch_indices, total_recv (recv_num[0] converted to a Python int) and
+        combine_input (dispatch_output converted to bfloat16). The scales output of
+        op.dispatch is discarded.
+        """
+        (dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num) = self.op.dispatch(
+            p.x, p.weights, p.scales, p.indices,
+            block_num=self.block_num, warp_per_block=self.dispatch_warps)
+        total_recv = int(recv_num[0].item())  # read BEFORE combine (combine resets recv_num)
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output, dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices, total_recv=total_recv,
+            combine_input=dispatch_output.to(torch.bfloat16))
+
+    def stage(self, p, h):
+        """Copy the first h.total_recv rows of h.combine_input into MoRI's registered
+        combine-input buffer. ``p`` is not used."""
+        # comm-only contract: stage the "expert outputs" into MoRI's registered
+        # combine-input buffer UNTIMED (in a real MoE the expert FFN writes here).
+        buf = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1))
+        buf[:h.total_recv, :].copy_(h.combine_input[:h.total_recv, :])
+
+    def combine(self, p, h):
+        """Run one MoRI combine on the dispatch handle ``h`` and return the combined tensor.
+
+        The second value returned by op.combine is discarded; ``p`` is not used.
+        """
+        combined, _w = self.op.combine(
+            h.combine_input, h.dispatch_weights, h.dispatch_indices,
+            block_num=self.block_num, warp_per_block=self.combine_warps)
+        return combined
+
+    def expected(self, p, h):
+        """Build the reference the combined output is compared against.
+
+        Returns (reference, p.T): reference is p.x as float32, scaled per token by the
+        number of distinct destination ranks (expert id // experts_per_rank) among that
+        token's top-k experts. ``h`` is not used.
+        NOTE(review): the second element is presumably the number of leading rows the
+        harness compares — confirm against run_ep.py.
+        """
+        # MoRI combine sums one copy per destination RANK ⇒ combined[i] ≈
+        # x[i] * (#unique destination ranks among the token's topk experts).
+        pes = p.indices.long() // self.experts_per_rank
+        unique_pes = torch.tensor(
+            [len(set(row.tolist())) for row in pes], device=self.device, dtype=torch.float32
+        ).unsqueeze(1)
+        return p.x.float() * unique_pes, p.T
+
+    def recv_tokens(self, h):
+        """Return the received-token count captured by dispatch(), as an int."""
+        return int(h.total_recv)
+
+    def finalize(self, rc):
+        """Barrier (best effort), flush stdio, then hard-exit the process; never returns.
+
+        The process exit status is 0 when ``rc == 0`` and 1 otherwise. Any exception from
+        the barrier is deliberately swallowed so the exit still happens.
+        """
+        # MoRI's shmem teardown asserts after shmem_finalize(); results are already
+        # written, so sync and hard-exit past it.
+        try:
+            dist.barrier()
+        except Exception:
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(0 if rc == 0 else 1)
diff --git a/experimental/CollectiveX/tests/eplb.py b/experimental/CollectiveX/tests/eplb.py
new file mode 100644
index 000000000..2234fea96
--- /dev/null
+++ b/experimental/CollectiveX/tests/eplb.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+"""CollectiveX — EPLB (Expert-Parallel Load Balancer), the DeepSeek-style remedy for
+skewed (zipf) expert load.
+
+Under skewed routing, the ranks hosting hot logical experts receive far more token-copies
+than the rest; dispatch/combine latency is gated by that busiest rank (the cross-rank MAX
+the harness measures), so the whole collective stalls on it. EPLB REPLICATES hot experts
+onto extra physical slots and PLACES the slots so every rank carries ~equal load.
+
+This module is backend-agnostic: it is purely a transform of the deterministic routing
+trace. The trick that keeps every adapter unchanged — DeepEP/MoRI both route expert i to
+rank `i // experts_per_rank` (contiguous block placement) — is to number the physical slots
+RANK-MAJOR (rank r owns physical ids [r*spp, (r+1)*spp)), so the standard contiguous mapping
+reproduces EPLB's balanced placement. The harness then runs with `experts = num_physical`
+and the remapped (physical) trace; nothing else changes.
+
+  num_physical = num_logical + redundant   (redundant rounded up to a multiple of ep_size)
+  build_plan(): greedy replicate-by-load + equal-cardinality balanced packing onto ep_size ranks
+  remap_idx():  each token's logical targets -> physical replicas, spread by global token id
+
+Pure-Python planner (no torch) so it unit-tests on a login node; remap_idx needs torch.
+"""
+from __future__ import annotations
+
+
+def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
+    """Return the physical expert count: ``num_logical`` plus the redundant slots.
+
+    A negative ``num_redundant`` counts as 0, and the redundant count is rounded UP to the
+    next multiple of ``ep_size`` so the physical experts split evenly across ranks
+    (symmetric dispatch).
+    """
+    extra = max(0, num_redundant)
+    groups = (extra + ep_size - 1) // ep_size   # ceil(extra / ep_size) for ep_size > 0
+    return num_logical + groups * ep_size
+
+
+def _contiguous_rank_load(logical_load, ep_size):
+    """Per-rank received load WITHOUT EPLB.
+
+    Logical experts are placed contiguously (``len(logical_load) // ep_size`` per rank), so
+    rank r carries the summed load of its own block. Returns a list of length ``ep_size``.
+    """
+    per = len(logical_load) // ep_size
+    return [sum(logical_load[r * per:(r + 1) * per]) for r in range(ep_size)]
+
+
+def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
+    """Build the EPLB replication + placement plan for one load profile.
+
+    Args:
+        logical_load: list[float] of length num_logical (token-copies per logical expert).
+        num_physical: total physical expert slots; must be >= num_logical and a multiple
+            of ep_size.
+        ep_size: number of ranks; must be positive and divide num_logical.
+
+    Returns:
+        dict of pure-Python values: num_logical, num_physical, ep_size, slots_per_rank,
+        replicas (per logical expert), max_replicas, phys2log, rank_of_phys, log2phys,
+        rank_load_after, rank_load_before, imbalance_before, imbalance_after
+        (busiest rank / mean; 1.0 = perfect) and replicated_experts.
+
+    Raises:
+        AssertionError: if any of the size constraints above is violated.
+    """
+    num_logical = len(logical_load)
+    assert ep_size > 0, "ep_size must be positive"
+    assert num_logical > 0, "logical_load must not be empty"
+    assert num_physical >= num_logical, "num_physical must be >= num_logical"
+    assert num_physical % ep_size == 0, "num_physical must be a multiple of ep_size"
+    assert num_logical % ep_size == 0, "num_logical must be a multiple of ep_size"
+    spp = num_physical // ep_size                      # physical slots per rank (fixed)
+
+    # 1) Replica allocation — start one slot per logical expert, then hand each redundant
+    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
+    #    max() keeps the first maximum, so ties go to the lowest expert id.
+    replicas = [1] * num_logical
+    for _ in range(num_physical - num_logical):
+        hottest = max(range(num_logical), key=lambda e: logical_load[e] / replicas[e])
+        replicas[hottest] += 1
+
+    # 2) Slots = (per-replica load, logical expert), one per replica.
+    slots = [(logical_load[e] / replicas[e], e)
+             for e in range(num_logical) for _ in range(replicas[e])]
+
+    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
+    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
+    #    There are exactly ep_size * spp slots, so an open rank always exists.
+    slots.sort(reverse=True)
+    rank_slots = [[] for _ in range(ep_size)]
+    rank_load = [0.0] * ep_size
+    for lps, e in slots:
+        open_ranks = [r for r in range(ep_size) if len(rank_slots[r]) < spp]
+        dest = min(open_ranks, key=rank_load.__getitem__)
+        rank_slots[dest].append(e)
+        rank_load[dest] += lps
+
+    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
+    phys2log = [e for r in range(ep_size) for e in rank_slots[r]]
+    rank_of_phys = [r for r in range(ep_size) for _ in rank_slots[r]]
+    log2phys = [[] for _ in range(num_logical)]
+    for pid, e in enumerate(phys2log):
+        log2phys[e].append(pid)
+
+    before = _contiguous_rank_load(logical_load, ep_size)
+    total = sum(logical_load) or 1.0                   # avoid 0/0 on an all-zero profile
+    mean = total / ep_size
+    return {
+        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
+        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
+        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
+        "rank_load_after": rank_load, "rank_load_before": before,
+        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
+        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
+        "replicated_experts": sum(1 for n in replicas if n > 1),
+    }
+
+
+def remap_idx(idx_logical, plan):
+    """Map a logical-expert routing trace onto the plan's physical replicas.
+
+    Args:
+        idx_logical: torch [gt, topk] integer tensor of logical-expert ids (global trace).
+        plan: dict from build_plan(); reads "replicas", "max_replicas" and "log2phys".
+
+    Returns:
+        idx_physical [gt, topk] int64, on the same device as ``idx_logical``: each token's
+        logical target -> one of that expert's physical replicas, SPREAD by global token
+        id (row) so a hot expert's tokens fan out across its replicas (= across ranks).
+        Replicas of distinct logical experts are disjoint, so a token's top-k physical ids
+        stay distinct (dispatch invariant preserved).
+    """
+    import torch
+    replicas = plan["replicas"]
+    num_logical = len(replicas)
+    max_rc = plan["max_replicas"]
+    # Build the lookup tables on the trace's own device: indexing a CPU table with index
+    # tensors that live on an accelerator is rejected by torch.
+    dev = idx_logical.device
+    rc = torch.tensor(replicas, dtype=torch.int64, device=dev)
+    # padded [num_logical, max_rc] table of physical ids (pad with replica 0; never indexed
+    # past rc[e] because the replica index is taken mod rc[e]).
+    table = [[phys[k] if k < len(phys) else phys[0] for k in range(max_rc)]
+             for phys in plan["log2phys"]]
+    padded = torch.tensor(table, dtype=torch.int64, device=dev).reshape(num_logical, max_rc)
+    gt = idx_logical.shape[0]
+    rows = torch.arange(gt, dtype=torch.int64, device=dev).unsqueeze(1)   # [gt,1] global token id
+    e = idx_logical.to(torch.int64)                                       # [gt,topk]
+    ridx = rows % rc[e]                                                   # [gt,topk] replica index
+    return padded[e, ridx]                                                # [gt,topk] physical ids
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    # Self-test entry point: builds a plan for a synthetic profile, prints a summary, and
+    # asserts the planner gates; the remap check runs only when torch imports.
+    # Synthetic zipf load (popularity ∝ 1/(e+1)) — the case EPLB targets. No torch needed.
+    import sys
+    NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
+    load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
+    nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
+    plan = build_plan(load, nphys, EP)
+    print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
+    print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
+          f"(hottest expert 0 replicas={plan['replicas'][0]})")
+    print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
+    print(f"per-rank load AFTER  (EPLB):       {[round(x,3) for x in plan['rank_load_after']]}")
+    print(f"imbalance (max/mean)  BEFORE={plan['imbalance_before']:.2f}x  AFTER={plan['imbalance_after']:.2f}x")
+    # Gates: equal slot cardinality, every logical expert placed, big imbalance cut.
+    assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
+    assert sum(plan["replicas"]) == nphys
+    assert len(plan["phys2log"]) == nphys
+    assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
+    # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
+    assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
+    assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
+    assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
+    # remap (if torch present): distinctness + balanced receive on a sampled zipf trace.
+    try:
+        import torch
+        g = torch.Generator().manual_seed(0)
+        # Normalize the load into probabilities, share that row across 4096 tokens, and
+        # draw 8 distinct logical experts per token (sampling without replacement).
+        p = torch.tensor(load); p = (p / p.sum()).expand(4096, NUM_LOGICAL)
+        idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)
+        idx_p = remap_idx(idx_l, plan)
+        assert idx_p.shape == idx_l.shape
+        # top-k physical ids distinct per token
+        assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
+        spp = plan["slots_per_rank"]
+        recv_before = [0] * EP
+        recv_after = [0] * EP
+        per_log = NUM_LOGICAL // EP
+        # Tally token-copies per rank: a logical id lands on rank e // per_log (contiguous
+        # placement), a physical id on rank pid // spp (rank-major numbering).
+        for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
+            for e in row_l:
+                recv_before[e // per_log] += 1
+            for pid in row_p:
+                recv_after[pid // spp] += 1
+        # Same max/mean imbalance metric as the plan, measured on the sampled trace.
+        ib = max(recv_before) / (sum(recv_before) / EP)
+        ia = max(recv_after) / (sum(recv_after) / EP)
+        print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x  AFTER={ia:.2f}x")
+        assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
+        print("remap self-test: OK")
+    except ImportError:
+        # Only a missing torch is tolerated; a failed assert above still propagates.
+        print("(torch absent — skipped remap self-test; planner gates passed)")
+    print("EPLB self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/failure_taxonomy.py b/experimental/CollectiveX/tests/failure_taxonomy.py
new file mode 100644
index 000000000..45782ee07
--- /dev/null
+++ b/experimental/CollectiveX/tests/failure_taxonomy.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""CollectiveX failure taxonomy (goal Part 3: failure & reliability characterization).
+
+A wedged or crashing EP run should become a CLASSIFIED, bounded record — not a silent hang or a
+bare rc=1. classify() maps an exception (or a process return code from the timeout-wrapped driver)
+onto a stable failure mode, so coverage/reliability views can keep failed cases instead of dropping
+them. Pure stdlib.
+"""
+from __future__ import annotations
+
+# Stable failure modes (goal Part 3). Match precedence is NOT this order: see classify()'s rc rules + _SIGNATURES.
+MODES = [
+    "unsupported",            # capability rejected the combo (run_ep exit 5)
+    "initialization-failure",  # process group / buffer / NVSHMEM bring-up failed
+    "out-of-memory",           # allocation failure (matched on "out of memory" / OOM text)
+    "registration-failure",    # MR / symmetric-heap registration (e.g. MoRI errno 22)
+    "correctness-failure",     # ran but reconstruction gate failed
+    "timeout",                 # killed by the timeout wrapper (rc 124) — bounded hang
+    "deadlock",                # collective watchdog abort (NCCL SIGABRT / rc -6 after a stall)
+    "teardown-failure",        # post-finalize / shmem_finalize assertion
+    "infrastructure",          # slurm / container / FS / node failure
+    "unknown",                 # classify()'s fallback when nothing else applies
+]
+
+# (mode, patterns) in match-precedence order; each pattern is re.search'ed against LOWERCASED text.
+_SIGNATURES = [
+    ("unsupported", ("unsupported", "rejects", "not supported", "no fallback")),
+    ("out-of-memory", ("out of memory", "outofmemory", "cuda oom", "cudaerrormemoryallocation")),
+    ("registration-failure", ("errno 22", "registration", "register", "ibv_reg", "mr ")),
+    ("initialization-failure", ("nvshmem", "init_process_group", "ncclcomminit", "bootstrap", r"buffer\(")),
+    # (?!x): the project's own name ("CollectiveX" in any path/traceback) is not deadlock evidence
+    ("deadlock", ("watchdog", "sigabrt", "signal 6", "collective(?!x)", "timed out waiting", "nccl timeout")),
+    ("teardown-failure", ("shmem_finalize", "destroy_process_group", "teardown", "finalize")),
+    ("correctness-failure", ("correct=false", "reconstruction", "max_rel", "assertion.*tol")),
+    ("infrastructure", ("srun: error", "slurm", "node fail", "container", "no such file")),
+]
+
+
+def classify(text: str = "", rc: int | None = None) -> str:
+    """Best-effort failure mode from captured stderr/stdout text and/or a process return code.
+    rc 5 -> unsupported and rc 124/137/-9 -> timeout are decisive. Otherwise the first _SIGNATURES
+    match on the text wins; a SIGABRT rc (134/-6) with no matching text -> deadlock, else unknown."""
+    import re  # local import: the module's only top-level import is __future__
+    if rc == 5:
+        return "unsupported"         # run_ep capability reject
+    if rc in (124, 137, -9):
+        return "timeout"             # GNU timeout: SIGTERM (124) or SIGKILL via -k (137 / -9)
+    t = (text or "").lower()
+    for mode, sigs in _SIGNATURES:
+        if any(re.search(s, t) for s in sigs):
+            return mode
+    if rc in (134, -6):
+        return "deadlock"            # bare SIGABRT with no telling text: assume the NCCL watchdog
+    return "unknown"
+
+
+def record(text="", rc=None, case=None) -> dict:
+    """Wrap one failure as a dict: classified mode, raw return code, originating case, output tail."""
+    mode, tail = classify(text, rc), (text or "")[-400:]   # evidence = last 400 chars of the output
+    return dict(failure_mode=mode, return_code=rc, case=case or {}, evidence=tail)
+
+
+if __name__ == "__main__":
+    import sys
+    # Each entry: (captured text, return code, expected failure mode).
+    cases = [
+        ("RuntimeError: Unsupported number of EP ranks", None, "unsupported"),
+        ("", 124, "timeout"),
+        ("Signal 6 (SIGABRT) received ... NCCL watchdog", None, "deadlock"),
+        ("", -6, "deadlock"),
+        ("cuda out of memory", None, "out-of-memory"),
+        ("ibv_reg_mr failed errno 22", None, "registration-failure"),
+        ("shmem_finalize teardown assertion", None, "teardown-failure"),
+        ("srun: error: node failed", None, "infrastructure"),
+    ]
+    failures = 0
+    for text, rc, want in cases:
+        got = classify(text, rc)
+        matched = got == want
+        failures += not matched
+        print(f"  [{'OK' if matched else 'FAIL'}] rc={rc} text={text[:40]!r} -> {got} (want {want})")
+    print("failure_taxonomy self-test:", "FAIL" if failures else "PASS")
+    sys.exit(1 if failures else 0)
diff --git a/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt
new file mode 100644
index 000000000..c8825164e
--- /dev/null
+++ b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt
@@ -0,0 +1,50 @@
+# nThread 1 nGpus 8 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
+#
+# Using devices
+#  Rank  0 Group  0 Pid  12345 on    b200-node device  0 [0x1b] NVIDIA B200
+#  Rank  1 Group  0 Pid  12345 on    b200-node device  1 [0x43] NVIDIA B200
+#  Rank  2 Group  0 Pid  12345 on    b200-node device  2 [0x52] NVIDIA B200
+#  Rank  3 Group  0 Pid  12345 on    b200-node device  3 [0x61] NVIDIA B200
+#  Rank  4 Group  0 Pid  12345 on    b200-node device  4 [0x9d] NVIDIA B200
+#  Rank  5 Group  0 Pid  12345 on    b200-node device  5 [0xc3] NVIDIA B200
+#  Rank  6 Group  0 Pid  12345 on    b200-node device  6 [0xd1] NVIDIA B200
+#  Rank  7 Group  0 Pid  12345 on    b200-node device  7 [0xdf] NVIDIA B200
+#
+#                                                              out-of-place                       in-place
+#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
+#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
+           8             2     float     sum      -1     9.62    0.00    0.00      0     9.60    0.00    0.00      0
+          16             4     float     sum      -1     9.61    0.00    0.00      0     9.59    0.00    0.00      0
+          32             8     float     sum      -1     9.63    0.00    0.00      0     9.62    0.00    0.00      0
+          64            16     float     sum      -1     9.60    0.00    0.00      0     9.58    0.00    0.00      0
+         128            32     float     sum      -1     9.64    0.01    0.02      0     9.63    0.01    0.02      0
+         256            64     float     sum      -1     9.66    0.03    0.05      0     9.64    0.03    0.05      0
+         512           128     float     sum      -1     9.69    0.05    0.09      0     9.67    0.05    0.09      0
+        1024           256     float     sum      -1     9.74    0.11    0.18      0     9.72    0.11    0.18      0
+        2048           512     float     sum      -1     9.82    0.21    0.37      0     9.80    0.21    0.37      0
+        4096          1024     float     sum      -1     9.97    0.41    0.72      0     9.95    0.41    0.72      0
+        8192          2048     float     sum      -1    10.22    0.80    1.40      0    10.20    0.80    1.40      0
+       16384          4096     float     sum      -1    10.81    1.52    2.65      0    10.79    1.52    2.65      0
+       32768          8192     float     sum      -1    11.93    2.75    4.81      0    11.90    2.75    4.81      0
+       65536         16384     float     sum      -1    13.62    4.81    8.42      0    13.59    4.82    8.43      0
+      131072         32768     float     sum      -1    16.94    7.74   13.54      0    16.90    7.76   13.57      0
+      262144         65536     float     sum      -1    23.14   11.33   19.83      0    23.10   11.35   19.86      0
+      524288        131072     float     sum      -1    35.62   14.72   25.76      0    35.55   14.75   25.81      0
+     1048576        262144     float     sum      -1    60.40   17.36   30.38      0    60.30   17.39   30.43      0
+     2097152        524288     float     sum      -1    76.50   27.41   47.97      0    76.40   27.45   48.04      0
+     4194304       1048576     float     sum      -1   110.20   38.06   66.61      0   110.05   38.11   66.70      0
+     8388608       2097152     float     sum      -1   165.80   50.60   88.55      0   165.60   50.66   88.65      0
+    16777216       4194304     float     sum      -1   250.10   67.08  117.40      0   249.80   67.16  117.54      0
+    33554432       8388608     float     sum      -1   360.50   93.08  162.90      0   360.10   93.18  163.07      0
+    67108864      16777216     float     sum      -1   520.80  128.85  225.50      0   520.20  129.00  225.75      0
+   134217728      33554432     float     sum      -1   720.30  186.34  326.10      0   719.50  186.55  326.46      0
+   268435456      67108864     float     sum      -1  1080.50  248.43  434.80      0  1079.20  248.73  435.27      0
+   536870912     134217728     float     sum      -1  1990.20  269.76  472.10      0  1988.50  269.99  472.49      0
+  1073741824     268435456     float     sum      -1  3940.60  272.48  476.84      0  3938.10  272.65  477.14      0
+  2147483648     536870912     float     sum      -1  7850.10  273.56  478.73      0  7846.20  273.69  478.96      0
+  4294967296    1073741824     float     sum      -1 15680.50  273.91  479.34      0 15673.80  274.03  479.55      0
+  8589934592    2147483648     float     sum      -1 31250.80  274.87  481.02      0 31238.10  274.98  481.22      0
+#
+# Out of bounds values : 0 OK
+# Avg bus bandwidth    : 168.42
+#
diff --git a/experimental/CollectiveX/tests/make_workloads.py b/experimental/CollectiveX/tests/make_workloads.py
new file mode 100644
index 000000000..cc77b1303
--- /dev/null
+++ b/experimental/CollectiveX/tests/make_workloads.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Generate canonical serialized workloads (goal Part 1). Runs build_workload (needs torch) for
+each (routing, global_tokens) in a ladder and writes <workload_id>.npz + .manifest.json into a
+dir that runs then consume via `run_ep.py --workload-dir`. One trace per global-token count
+because the generator is not prefix-consistent across sizes.
+
+  python3 tests/make_workloads.py --out-dir /data/sa-shared/cx_workloads \\
+      --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\
+      --tokens-ladder "1 2 4 8 16 32 64 128 256 512"
+
+Generate every routing the suites need by running once per --routing. Idempotent (same id => same
+file). The dir is the cross-hardware artifact: copy it to each cluster so all consume identical bytes.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import workload as wl   # noqa: E402
+
+
+def main() -> int:
+    """CLI entry: build and save one workload per tokens-per-rank rung T (global_tokens = T * ep); returns 0."""
+    ap = argparse.ArgumentParser(description="Generate canonical CollectiveX workloads")
+    ap.add_argument("--out-dir", required=True)
+    ap.add_argument("--routing", required=True)
+    ap.add_argument("--ep", type=int, required=True, help="ep_size (global_tokens = T * ep)")
+    ap.add_argument("--hidden", type=int, default=7168)
+    ap.add_argument("--topk", type=int, default=8)
+    ap.add_argument("--experts", type=int, default=256)
+    ap.add_argument("--seed", type=int, default=67)
+    ap.add_argument("--tokens-ladder", default="1 2 4 8 16 32 64 128 256 512")
+    a = ap.parse_args()
+    if a.ep < 1:
+        ap.error(f"--ep must be >= 1 (got {a.ep})")   # 0 was a bare ZeroDivisionError; < 0 gave negative sizes
+    epr = a.experts // a.ep                            # experts per rank, handed to build_workload
+    ladder = sorted({n for n in map(int, a.tokens_ladder.replace(",", " ").split()) if n > 0})
+    os.makedirs(a.out_dir, exist_ok=True)
+    for T in ladder:
+        gt = T * a.ep
+        idx, w, man = wl.build_workload(a.hidden, a.topk, a.experts, a.routing, gt, a.seed, epr)
+        wid = wl.save_workload(a.out_dir, idx, w, man)
+        print(f"  T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}  (trace sha {man['checksums']['trace'][:12]})")
+    print(f"wrote {len(ladder)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/probe_deepep_caps.py b/experimental/CollectiveX/tests/probe_deepep_caps.py
new file mode 100644
index 000000000..0f08ed6a5
--- /dev/null
+++ b/experimental/CollectiveX/tests/probe_deepep_caps.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""Read-only DeepEP capability probe (single process, no dist init needed for sigs).
+
+Dumps the exact API surface CollectiveX needs to wire fp8 dispatch + low-latency:
+constructor + dispatch/combine/low_latency_* signatures, the LL rdma size hint,
+the fp8 per-token cast helpers, and the device. Drives the reject matrix + impl.
+Run inside the SGLang container on one GPU; prints to stdout only.
+"""
+import inspect
+import sys
+
+
+def sig(obj, name):
+    """Render `obj.<name>`'s call signature as one indented line, or an ABSENT / no-signature marker."""
+    member = getattr(obj, name, None)
+    try:
+        suffix = ": <ABSENT>" if member is None else str(inspect.signature(member))
+    except (ValueError, TypeError):
+        suffix = ": <builtin/no-signature>"
+    return f"  {name}{suffix}"
+
+
+def main():  # Print torch/device info, the deep_ep module surface, Buffer signatures, fp8 helper names, LL source heads.
+    import torch
+    print("=== torch / device ===")
+    print("torch", torch.__version__, "cuda", torch.version.cuda)
+    if torch.cuda.is_available():
+        p = torch.cuda.get_device_properties(0)  # only device 0 is described
+        print(f"device={p.name} sms={p.multi_processor_count} "
+              f"mem={p.total_memory/1e9:.0f}GB cc={p.major}.{p.minor}")
+    print("fp8 dtypes:", [d for d in ("float8_e4m3fn", "float8_e4m3fnuz", "float8_e5m2")
+                          if hasattr(torch, d)])
+
+    print("\n=== deep_ep ===")
+    import deep_ep  # not guarded: a missing deep_ep aborts the probe with a traceback
+    from deep_ep import Buffer
+    print("deep_ep file:", getattr(deep_ep, "__file__", "?"))
+    try:
+        import importlib.metadata as md
+        print("deep_ep version:", md.version("deep_ep"))
+    except Exception as e:  # version lookup is best-effort; the probe keeps going
+        print("deep_ep version: <none>", repr(e))
+    print("deep_ep dir:", [n for n in dir(deep_ep) if not n.startswith("_")])
+    print("Buffer.num_sms (default):", getattr(Buffer, "num_sms", "<absent>"))
+
+    print("\n=== Buffer signatures ===")
+    print(sig(Buffer, "__init__"))
+    for m in ("dispatch", "combine", "get_dispatch_layout",
+              "low_latency_dispatch", "low_latency_combine",
+              "clean_low_latency_buffer", "get_low_latency_rdma_size_hint",
+              "get_dispatch_config", "get_combine_config", "set_num_sms",
+              "get_buffer_size_hint", "internode_dispatch", "internode_combine"):
+        print(sig(Buffer, m))  # absent methods print as "<ABSENT>" rather than raising
+
+    print("\n=== fp8 cast helpers ===")
+    # The canonical per-token fp8 cast in DeepEP's own tests/utils. Here: a name scan for fp8/cast/quant only.
+    for modname in ("deep_ep.utils", "deep_ep"):
+        try:
+            mod = __import__(modname, fromlist=["*"])
+            cands = [n for n in dir(mod) if "fp8" in n.lower() or "cast" in n.lower()
+                     or "quant" in n.lower()]
+            print(f"{modname}: {cands}")
+        except Exception as e:  # a module that fails to import is reported, not fatal
+            print(f"{modname}: <import failed> {e!r}")
+
+    print("\n=== LL dispatch source (return shape / fp8 default) ===")
+    for m in ("low_latency_dispatch", "low_latency_combine", "dispatch"):
+        fn = getattr(Buffer, m, None)
+        if fn is None:
+            continue
+        try:
+            src = inspect.getsource(fn)
+            head = "\n".join(src.splitlines()[:45])  # only the first 45 source lines are dumped
+            print(f"--- {m} (first 45 lines) ---\n{head}\n")
+        except (OSError, TypeError) as e:
+            print(f"--- {m}: no source ({e!r}) ---")
+
+    print("\nPROBE_OK")  # end marker; main() then returns None
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/experimental/CollectiveX/tests/probe_deepep_ll.py b/experimental/CollectiveX/tests/probe_deepep_ll.py
new file mode 100644
index 000000000..88792407b
--- /dev/null
+++ b/experimental/CollectiveX/tests/probe_deepep_ll.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""Go/No-Go: does DeepEP low-latency (LL) mode actually run on THIS fabric?
+
+LL dispatch/combine require IBGDA ("all ranks visible via RDMA, IBGDA enabled" —
+even intranode), with allow_nvlink_for_low_latency_mode as a possible NVLink escape
+hatch. On a single-node NVLink-only box this may or may not initialize. Run under
+torchrun (8 ranks). Prints LL_OK with shapes + reconstruction error, or LL_FAIL with
+the exception — that verdict decides whether 'll' enters DeepEPBackend.SUPPORTED_MODES.
+"""
+import os
+import sys
+import traceback
+
+import torch
+import torch.distributed as dist
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import routing  # noqa: E402
+
+
+def main() -> int:  # One LL dispatch+combine round trip per rank; returns 0 if no rank raised, else 7.
+    rank = int(os.environ.get("RANK", "0"))
+    world = int(os.environ.get("WORLD_SIZE", "1"))
+    local = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local)
+    device = torch.device(f"cuda:{local}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")  # only applied when the launcher did not set them
+    os.environ.setdefault("MASTER_PORT", "12377")
+    dist.init_process_group("nccl")
+
+    from deep_ep import Buffer
+    hidden, topk, experts = 7168, 8, 256
+    T = 8                                   # decode-shaped
+    num_max = 128                           # fixed LL cap (>= max T in a decode sweep)
+    nle = experts // world                  # num local experts
+
+    ok = True       # flipped to False only by the except branch below
+    detail = ""     # this rank's summary (success metrics or exception text)
+    try:
+        rdma = Buffer.get_low_latency_rdma_size_hint(num_max, hidden, world, experts)
+        if rank == 0:
+            print(f"[ll] rdma_size_hint={rdma} bytes; nle={nle} num_max={num_max}")
+        # LL buffer: nvl=0, rdma=hint, low_latency_mode=True. allow_nvlink default True.
+        buf = Buffer(dist.group.WORLD, 0, rdma, low_latency_mode=True,
+                     num_qps_per_rank=max(1, experts // world))
+        # shared trace slice (same builder the harness uses)
+        gi, gw = routing.build_global_routing(T * world, experts, topk, "uniform", 67, nle)
+        si, sw = routing.rank_slice(gi, gw, rank, T)
+        x = routing.rank_activations(T, hidden, 67, rank, device, torch.bfloat16)
+        topk_idx = si.to(device).to(torch.int64)
+        topk_w = sw.to(device).to(torch.float32)
+
+        recv_x, recv_count, handle, event, hook = buf.low_latency_dispatch(
+            x, topk_idx, num_max, experts, use_fp8=True, return_recv_hook=False)
+        rfp8, rscale = recv_x if isinstance(recv_x, tuple) else (recv_x, None)  # tuple => (data, scales)
+        if rank == 0:
+            print(f"[ll] dispatch OK: recv_fp8={tuple(rfp8.shape)} dtype={rfp8.dtype} "
+                  f"scale={None if rscale is None else tuple(rscale.shape)} "
+                  f"recv_count={tuple(recv_count.shape)}")
+        # dequant fp8 recv -> bf16 in the [nle, num_max*world, hidden] layout for combine
+        R = rfp8.float()  # used as-is when dispatch returned no scale tensor
+        if rscale is not None:
+            E, S, H = rfp8.shape
+            R = (rfp8.float().view(E, S, H // 128, 128) * rscale.unsqueeze(-1)).view(E, S, H)  # presumably rscale is [E, S, H//128] — TODO confirm
+        comb_in = R.to(torch.bfloat16)
+        combined, event2, hook2 = buf.low_latency_combine(comb_in, topk_idx, topk_w, handle)
+        torch.cuda.synchronize()
+        # reconstruction: combined[i] ~= dequant(x[i]) * sum_j w[i,j]  (weighted reduce)
+        wsum = topk_w.sum(dim=1, keepdim=True)
+        ref = x.float() * wsum
+        err = (combined[:T].float() - ref[:T]).abs().max().item() / (ref[:T].abs().max().item() + 1e-6)  # max abs error / max |ref|
+        buf.clean_low_latency_buffer(num_max, hidden, experts)
+        detail = (f"combined={tuple(combined.shape)} max_rel_err={err:.4f} "
+                  f"wsum[0]={wsum[0].item():.3f}")
+        if rank == 0:
+            print(f"[ll] combine OK: {detail}")
+    except Exception as exc:
+        ok = False
+        detail = f"{type(exc).__name__}: {exc}"
+        if rank == 0:
+            print(f"[ll] EXCEPTION: {detail}")
+            traceback.print_exc()
+
+    # reduce verdict across ranks: MIN => LL_OK only if no rank raised (max_rel_err is reported, never thresholded)
+    v = torch.tensor([1 if ok else 0], device=device)
+    dist.all_reduce(v, op=dist.ReduceOp.MIN)
+    if rank == 0:
+        print("LL_OK" if int(v.item()) == 1 else "LL_FAIL", detail)  # NOTE(review): detail is rank 0's; other ranks' exception text is never printed
+    dist.destroy_process_group()
+    return 0 if int(v.item()) == 1 else 7
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/probe_mori_caps.py b/experimental/CollectiveX/tests/probe_mori_caps.py
new file mode 100644
index 000000000..19ae6e9ed
--- /dev/null
+++ b/experimental/CollectiveX/tests/probe_mori_caps.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Read-only MoRI capability probe (run under torchrun on MI355X, 8 ranks).
+
+Decides whether 'fp8' enters MoRIBackend.SUPPORTED_PRECISIONS: inspects
+EpDispatchCombineConfig for quant_type options + the scale plumbing, then attempts a
+small fp8 dispatch (no combine is run). Prints MORI_FP8_OK (with the working quant_type + the
+dispatch recv value) or MORI_FP8_FAIL (with the last exception) — that verdict gates the reject matrix.
+LL is not probed: MoRI exposes no separate low-latency entrypoint (caps exclude it).
+"""
+import inspect
+import os
+import sys
+import traceback
+
+import torch
+import torch.distributed as dist
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import routing  # noqa: E402
+
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE", os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
+
+
+def main() -> int:  # Probe MoRI fp8 dispatch; never returns: ends in os._exit(0) if a candidate worked on all ranks, else os._exit(7).
+    rank = int(os.environ.get("RANK", "0"))
+    world = int(os.environ.get("WORLD_SIZE", "1"))
+    local = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local)
+    device = torch.device(f"cuda:{local}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")  # only applied when the launcher did not set them
+    os.environ.setdefault("MASTER_PORT", "12399")
+    dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world,
+                            device_id=device)
+    import mori
+
+    if rank == 0:
+        p = torch.cuda.get_device_properties(0)
+        print(f"[mori] device={p.name} cus={p.multi_processor_count}")
+        print("[mori] EpDispatchCombineConfig sig:")
+        try:
+            print("   ", inspect.signature(mori.ops.EpDispatchCombineConfig))
+        except Exception as e:
+            print("    <no sig>", repr(e))
+        # surface any quant enum the module exposes
+        for name in dir(mori.ops):
+            if "quant" in name.lower() or "Quant" in name:  # the second test is subsumed by the first
+                obj = getattr(mori.ops, name)
+                print(f"[mori] ops.{name} = {obj}")
+                if hasattr(obj, "__members__"):
+                    print("     members:", list(obj.__members__))
+
+    hidden, topk, experts = 7168, 8, 256
+    T = 8
+    epr = experts // world  # experts per rank
+    world_group = torch.distributed.group.WORLD
+    torch._C._distributed_c10d._register_process_group("default", world_group)  # private torch API; MoRI shmem init looks the group up by this name
+    mori.shmem.shmem_torch_process_group_init("default")
+
+    # candidate fp8 quant_type values to try (string and enum forms); enum members first, then strings
+    candidates = []
+    QT = getattr(mori.ops, "EpDispatchCombineQuantType", None) or getattr(mori.ops, "QuantType", None)
+    if QT is not None and hasattr(QT, "__members__"):
+        for mname in QT.__members__:
+            if "8" in mname or "fp8" in mname.lower() or "FP8" in mname:  # any member name containing "8" qualifies
+                candidates.append((f"enum:{mname}", QT.__members__[mname]))
+    for s in ("fp8", "fp8_e4m3", "e4m3"):
+        candidates.append((f"str:{s}", s))
+
+    if rank == 0:
+        print(f"[mori] fp8 quant_type candidates: {[c[0] for c in candidates]}")
+
+    gi, gw = routing.build_global_routing(T * world, experts, topk, "uniform", 67, epr)
+    si, sw = routing.rank_slice(gi, gw, rank, T)
+    x = routing.rank_activations(T, hidden, 67, rank, device, torch.bfloat16)
+    indices = si.to(device).to(torch.int32)
+    weights = sw.to(device).to(torch.float32)
+
+    working = None  # label of the first candidate that dispatched on this rank
+    detail = ""     # success summary, or the most recent candidate's error
+    for label, qt in candidates:
+        try:
+            cfg = mori.ops.EpDispatchCombineConfig(
+                data_type=torch.bfloat16, rank=rank, world_size=world,
+                hidden_dim=hidden, scale_dim=hidden // 128,
+                scale_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+                max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+                max_num_inp_token_per_rank=512, num_experts_per_rank=epr,
+                num_experts_per_token=topk, use_external_inp_buf=False, quant_type=qt)
+            op = mori.ops.EpDispatchCombineOp(cfg)
+            scales = torch.ones((T, hidden // 128), dtype=torch.float32, device=device)  # all-ones placeholder scales
+            out = op.dispatch(x, weights, scales, indices, block_num=80, warp_per_block=16)
+            recv = int(out[-1][0].item())  # presumably the received-token count — TODO confirm against MoRI's dispatch return
+            dist.barrier()  # NOTE(review): ranks that diverge on a candidate may block or mis-pair here — confirm
+            working = label
+            detail = f"quant_type={label} dispatched recv={recv}"
+            if rank == 0:
+                print(f"[mori] FP8 DISPATCH OK with {label}: recv={recv}")
+            break
+        except Exception as exc:  # fall through to the next candidate
+            if rank == 0:
+                print(f"[mori] {label} failed: {type(exc).__name__}: {str(exc)[:160]}")
+            detail = f"{type(exc).__name__}: {str(exc)[:160]}"
+
+    v = torch.tensor([1 if working else 0], device=device)
+    dist.all_reduce(v, op=dist.ReduceOp.MIN)  # MIN: OK only if every rank found a working candidate
+    if rank == 0:
+        print(("MORI_FP8_OK " + detail) if int(v.item()) == 1 else ("MORI_FP8_FAIL " + detail))
+    sys.stdout.flush(); sys.stderr.flush()  # flush by hand: os._exit does not flush stdio buffers
+    os._exit(0 if int(v.item()) == 1 else 7)  # NOTE(review): skips interpreter teardown — presumably to dodge a MoRI/shmem exit hang; confirm
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/reference_ep.py b/experimental/CollectiveX/tests/reference_ep.py
new file mode 100644
index 000000000..c19f854e0
--- /dev/null
+++ b/experimental/CollectiveX/tests/reference_ep.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""CollectiveX independent EP reference semantics (goal Part 3).
+
+A from-scratch model of MoE dispatch + combine, written WITHOUT DeepEP or MoRI, used ONLY for
+UNTIMED correctness validation. The point (goal: "avoid validating backend against itself"):
+expected outputs come from the canonical routing trace + this independent logic, never from the
+backend's own round trip. Pure numpy — runs anywhere, no torch.
+
+Model (ep_size ranks, experts_per_rank experts each; expert e lives on rank e // experts_per_rank):
+  dispatch:  token t selected for expert e contributes a copy of x[t] to (rank e//epr, expert e).
+  expert:    a deterministic per-expert transform f_e (default: scale x by (1 + e/E) — distinct
+             per expert so a mis-routed copy is detectable; identity is the degenerate case).
+  combine:   y[t] = sum over t's selected experts e of  topk_weight[t,e] * f_e(x[t]).
+             Reduction is over the token's experts; output is in SOURCE token order.
+
+validate_dispatch() checks every (token, selected-expert) maps to the right rank+expert, exactly
+once (routing only — payloads and gate weights are not inspected). validate_combine() checks the
+reduction, gate-weighting, source ordering, and multiple-experts-on-one-rank. reference_combine()
+returns y for comparing a backend's combined output against an independent oracle.
+"""
+from __future__ import annotations
+
+import numpy as np
+
+
+def expert_scale(e: int, experts: int) -> float:
+    """Per-expert multiplier 1 + e/experts: distinct for every expert id, so a mis-routed copy changes the value."""
+    fraction = e / float(experts)
+    return 1.0 + fraction
+
+
+def dispatch_plan(idx: np.ndarray, experts: int, experts_per_rank: int):
+    """List every routed copy implied by idx[T,topk] as (token, slot, expert, dest_rank), ordered
+    token-major then slot, with dest_rank = expert // experts_per_rank. Asserts that each token's
+    experts are distinct; `experts` itself is not used here."""
+    num_tokens, _width = idx.shape
+    copies = []
+    for token in range(num_tokens):
+        used = set()
+        for slot, expert in enumerate(int(v) for v in idx[token]):
+            assert expert not in used, f"token {token} selects expert {expert} twice (must be distinct)"
+            used.add(expert)
+            copies.append((token, slot, expert, expert // experts_per_rank))
+    return copies
+
+
+def reference_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale):
+    """Independent oracle: y[t] = sum_k weights[t,k] * transform(idx[t,k], experts) * x[t], float64, source-token order."""
+    rows, width = idx.shape
+    out = np.zeros_like(x, dtype=np.float64)
+    for t in range(rows):
+        x_t = x[t].astype(np.float64)   # promote the token's activations once per token
+        for k in range(width):
+            out[t] += float(weights[t, k]) * transform(int(idx[t, k]), experts) * x_t
+    return out
+
+
+def validate_dispatch(idx, experts, experts_per_rank):
+    """Check routing of idx[T,topk]: unique (token,expert) copies, expert ids in [0, experts), dest
+    ranks in [0, ep). Returns a list of error strings (empty = OK); values are never inspected."""
+    plan = dispatch_plan(idx, experts, experts_per_rank)
+    errs = []
+    pairs = [(t, e) for (t, _k, e, _r) in plan]
+    if len(pairs) != len(set(pairs)):
+        errs.append("duplicate (token,expert) routed copy")
+    ep = (experts + experts_per_rank - 1) // experts_per_rank   # number of ranks, rounded up
+    for (t, _k, e, r) in plan:
+        if r != e // experts_per_rank:
+            errs.append(f"token {t} expert {e} -> rank {r}, expected {e // experts_per_rank}")
+        if not (0 <= e < experts):   # the rank check alone misses e >= experts when ep is rounded up
+            errs.append(f"token {t} expert {e} out of range [0,{experts})")
+        if not (0 <= r < ep):
+            errs.append(f"dest rank {r} out of range [0,{ep})")
+    return errs
+
+
+def validate_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale, tol=1e-9):
+    """Cross-check reference_combine against an explicit accumulation over dispatch_plan (gate
+    weighting, reduction over a token's experts, source-token order). Returns (errs, info) where
+    info["has_multi_expert_per_rank"] says whether any token has two selected experts on one rank.
+    An empty token batch (T == 0, e.g. an idle rank) validates trivially instead of raising."""
+    errs = []
+    y_ref = reference_combine(idx, weights, x, experts, experts_per_rank, transform)
+    T = idx.shape[0]
+    y_acc = np.zeros((T, x.shape[1]), dtype=np.float64)
+    for (t, k, e, r) in dispatch_plan(idx, experts, experts_per_rank):
+        y_acc[t] += float(weights[t, k]) * transform(e, experts) * x[t].astype(np.float64)
+    gap = float(np.abs(y_ref - y_acc).max()) if y_ref.size else 0.0   # .max() raises on a size-0 array
+    if gap > tol:
+        errs.append(f"combine reduction mismatch ({gap:.2e})")
+    multi = any(len({int(e) // experts_per_rank for e in idx[t]}) < idx.shape[1] for t in range(T))
+    return errs, {"has_multi_expert_per_rank": bool(multi)}
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    import sys
+    rng = np.random.default_rng(0)
+    E, EPR, T, topk, H = 256, 32, 64, 8, 16
+    idx = np.stack([rng.permutation(E)[:topk] for _ in range(T)]).astype(np.int64)  # topk distinct experts per token
+    w = rng.random((T, topk)).astype(np.float32)
+    x = rng.standard_normal((T, H)).astype(np.float32)
+    dispatch_errs = validate_dispatch(idx, E, EPR)
+    assert not dispatch_errs, dispatch_errs
+    combine_errs, info = validate_combine(idx, w, x, E, EPR)
+    assert not combine_errs, combine_errs
+    print(f"dispatch+combine semantics OK (multi_expert_per_rank={info['has_multi_expert_per_rank']})")
+    bumped = idx.copy()                          # corrupt one expert id of token 0 ...
+    bumped[0, 0] = (bumped[0, 0] + 1) % E
+    shift = reference_combine(idx, w, x, E, EPR)[0] - reference_combine(bumped, w, x, E, EPR)[0]
+    assert np.abs(shift).max() > 1e-6, "per-expert transform must make mis-routing detectable"  # ... oracle must move
+    print("mis-routing detectable via distinct per-expert transform OK")
+    hot = np.zeros((4, topk), dtype=np.int64) + np.arange(topk)   # every token -> experts 0..topk-1, all on rank 0
+    assert not validate_dispatch(hot, E, EPR), "single-rank hotspot must validate"
+    print("edge case: single-rank hotspot (all topk on rank 0) OK")
+    print("reference_ep self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/routing.py b/experimental/CollectiveX/tests/routing.py
new file mode 100644
index 000000000..66db5a350
--- /dev/null
+++ b/experimental/CollectiveX/tests/routing.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""CollectiveX — deterministic, platform-independent MoE routing trace.
+
+Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated
+ONCE from a fixed seed over the *global* token batch, indexed by global token id, and
+is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k,
+experts_per_rank). Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations
+are per-rank (same rank ⇒ same x on any platform), so a given global token id has
+identical activation everywhere without materializing a global activation tensor.
+
+Trace classes (the rank fan-out — #destination ranks a token's top-k experts touch —
+is the property that makes an EP workload representative; review caught the old
+default having fan-out 1):
+
+  * uniform   — top-k distinct experts drawn uniformly per token. The DEFAULT.
+                Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈
+                8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson.
+  * balanced  — load-equalized AND maximally spread: token i, slot j →
+                (i + j·experts_per_rank) mod E, so the 8 experts sit one-per-rank
+                (fan-out = ep_size) and every expert is hit equally. The high-fan-out,
+                perfectly-balanced reference.
+  * balanced-rank-local — the OLD degenerate "balanced": (i·top_k + j) mod E, i.e.
+                top_k consecutive experts, which (top_k ≤ experts/rank, aligned) all
+                land on ONE rank ⇒ fan-out 1, minimum communication. Kept as an
+                explicit edge case, honestly named.
+  * zipf      — expert popularity ∝ 1/rank (skewed load), uniform-ish fan-out.
+
+Always publish the realized fan-out so the workload is never misread again
+(`routing_stats`).
+"""
+from __future__ import annotations
+
+import hashlib
+
+import torch
+
+_RANK_SUBSEED = 7919
+
+
+def _cpu_gen(seed: int) -> "torch.Generator":
+    g = torch.Generator(device="cpu")
+    g.manual_seed(int(seed))
+    return g
+
+
+def build_global_routing(global_tokens: int, experts: int, topk: int,
+                         routing: str, seed: int, experts_per_rank: int):
+    """(idx[gt, topk] int64, weights[gt, topk] float32) on CPU — deterministic for a given argument
+    tuple, experts distinct within a token. ValueError: topk > experts, colliding balanced slots, unknown routing."""
+    if topk > experts:
+        raise ValueError(f"topk ({topk}) > experts ({experts})")
+    gt = int(global_tokens)
+    g = _cpu_gen(seed)
+    if routing == "uniform":
+        keys = torch.rand(gt, experts, generator=g)
+        idx = keys.argsort(dim=1)[:, :topk].contiguous().to(torch.int64)
+    elif routing == "balanced":  # one expert per rank ⇒ fan-out = ep_size, perfectly balanced load.
+        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
+        off = (torch.arange(topk, dtype=torch.int64) * int(experts_per_rank)) % experts  # slot j's stride
+        if off.unique().numel() < topk:  # stride wrapped mod E (e.g. topk > ep_size) ⇒ repeated experts
+            raise ValueError(f"balanced: topk={topk} slots collide (experts_per_rank={experts_per_rank}, experts={experts})")
+        idx = (i + off.unsqueeze(0)) % experts
+    elif routing == "balanced-rank-local":
+        # top_k consecutive (mod E) ⇒ all on ONE rank ⇒ fan-out 1 (min comm). Edge case.
+        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
+        idx = (i * topk + torch.arange(topk, dtype=torch.int64).unsqueeze(0)) % experts
+    elif routing == "zipf" or routing.startswith("zipf-"):
+        # popularity ∝ 1/rank^s — s sets the skew. zipf == zipf-moderate (s=1).
+        s = {"zipf": 1.0, "zipf-mild": 0.5, "zipf-moderate": 1.0, "zipf-heavy": 2.0}.get(routing)
+        if s is None:
+            raise ValueError(f"unknown zipf level '{routing}'")
+        p = 1.0 / torch.arange(1, experts + 1, dtype=torch.float32).pow(s)
+        p = (p / p.sum()).expand(gt, experts)
+        idx = torch.multinomial(p, topk, replacement=False, generator=g).to(torch.int64)
+    elif routing == "hotspot-single":
+        # adversarial: expert 0 is in EVERY token's top-k (single hot expert/rank), the other
+        # topk-1 drawn uniformly from the rest — maximal single-rank load.
+        rest = torch.stack([torch.randperm(experts - 1, generator=g)[:topk - 1] + 1
+                            for _ in range(gt)]).to(torch.int64)
+        idx = torch.cat([torch.zeros(gt, 1, dtype=torch.int64), rest], dim=1)
+    else:
+        raise ValueError(f"unknown routing '{routing}' "
+                         f"(uniform|balanced|balanced-rank-local|zipf[-mild|-moderate|-heavy]|hotspot-single)")
+    weights = torch.softmax(torch.randn(gt, topk, generator=g), dim=1).to(torch.float32)
+    return idx, weights
+
+
+def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
+    rows = slice(rank * tokens_per_rank, (rank + 1) * tokens_per_rank)   # this rank's global-token rows
+    return idx[rows].contiguous(), weights[rows].contiguous()
+
+
+def rank_activations(tokens: int, hidden: int, seed: int, rank: int, device, dtype=torch.bfloat16):
+    g = _cpu_gen(int(seed) * _RANK_SUBSEED + int(rank) + 1)
+    return torch.randn(tokens, hidden, generator=g, dtype=torch.float32).to(device=device, dtype=dtype)
+
+
+def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
+                     gpus_per_node: int, scale_up_domain: int | None = None) -> dict:
+    """Locality of the routed (token, dest-rank) copies (goal Part 2 topology section).
+    A token's SOURCE rank is global_id // tokens_per_rank; its DEST ranks are idx // epr,
+    clamped to ep_size - 1. Returns the fraction of the idx.numel() entries that stay on the
+    local rank / same node / same scale-up domain vs cross-node / cross-domain — the property a
+    placement (packed/striped) changes. scale_up_domain None or 0 ⇒ every rank in one domain."""
+    gt = idx.shape[0]
+    dest = (idx // experts_per_rank).clamp(max=ep_size - 1)             # [gt, topk]
+    # build the token ids on idx's device: a CPU arange cannot be compared with a CUDA dest.
+    src = (torch.arange(gt, device=idx.device) // max(1, tokens_per_rank)).unsqueeze(1)
+    src = src.expand_as(dest)                                           # [gt, topk] source rank
+    sud = scale_up_domain or (gpus_per_node * ep_size)                  # default: all one domain
+    local = (dest == src)
+    same_node = (dest // gpus_per_node) == (src // gpus_per_node)
+    same_dom = (dest // sud) == (src // sud)
+    return {
+        "local_rank_fraction": float(local.float().mean()),
+        "same_node_fraction": float(same_node.float().mean()),
+        "same_scaleup_domain_fraction": float(same_dom.float().mean()),
+        "cross_node_fraction": float((~same_node).float().mean()),
+        "cross_domain_fraction": float((~same_dom).float().mean()),
+        "gpus_per_node": gpus_per_node, "scale_up_domain": sud, "copies": int(dest.numel()),
+    }
+
+
+def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
+    """Realized routing properties for the GLOBAL trace — published per point so the
+    fan-out / load can never be silently misread. idx is the global [gt, topk] tensor (any
+    device); weights the matching [gt, topk] gate weights (hashed too for workload identity).
+    """
+    ep = max(1, experts // max(1, experts_per_rank))
+    ranks = (idx // experts_per_rank)                       # [gt, topk] destination rank per assignment
+    # unique destination ranks per token (fan-out); the mask sits on idx's device so scatter_ accepts it
+    onehot = torch.zeros(idx.shape[0], ep, dtype=torch.bool, device=idx.device)
+    onehot.scatter_(1, ranks.clamp(max=ep - 1), True)
+    fanout = onehot.sum(dim=1)                              # [gt]
+    hist = torch.bincount(fanout, minlength=ep + 1)[1:ep + 1].tolist()  # counts for fan-out 1..ep
+    load = torch.bincount(idx.reshape(-1), minlength=experts).float()
+    # (token, expert) assignments per destination rank (the "send histogram", review #3) — NOT deduped per rank.
+    rank_load = torch.bincount(ranks.reshape(-1).clamp(max=ep - 1), minlength=ep).tolist()
+    # SHA-256 workload identity over BOTH topk_idx and gate weights (review #3): a chart
+    # point's routing is provably identical across SKUs only if both hashes match.
+    idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes()
+    idx_hash = hashlib.sha256(idx_bytes).hexdigest()[:16]
+    if weights is not None:
+        w_bytes = weights.to(torch.float32).cpu().numpy().tobytes()
+        w_hash = hashlib.sha256(w_bytes).hexdigest()[:16]
+        routing_hash = hashlib.sha256(idx_bytes + w_bytes).hexdigest()[:16]  # combined identity
+    else:
+        w_hash, routing_hash = None, idx_hash
+    return {
+        "fanout_mean": float(fanout.float().mean()),
+        "fanout_min": int(fanout.min()), "fanout_max": int(fanout.max()),
+        "fanout_hist": hist,                               # index k-1 = #tokens with fan-out k
+        "rank_load_hist": rank_load,                       # assignments per dest rank (sums to idx.numel())
+        "routed_copies": int(fanout.sum()),                # unique (token, dest-rank) pairs
+        "expert_load_min": int(load.min()), "expert_load_max": int(load.max()),
+        "expert_load_mean": float(load.mean()),
+        "routing_hash": routing_hash, "idx_hash": idx_hash, "weights_hash": w_hash,
+    }
diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
new file mode 100644
index 000000000..e9a74f6ab
--- /dev/null
+++ b/experimental/CollectiveX/tests/run_ep.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun).
+
+Picks a backend adapter (DeepEP or MoRI), runs the source-tokens-per-rank sweep
+via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine
+are timed SEPARATELY (see ep_harness); only T varies along the resulting line.
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \\
+      --phase decode --runner mi355x-amds --topology-class mi355x-xgmi \\
+      --transport xgmi --env-json results/env.json --out results/mi355x_mori_decode.json
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend deepep \\
+      --phase prefill --runner b200-dgxc --topology-class b200-nvlink-island \\
+      --transport nvlink --env-json results/env.json --out results/b200_deepep_prefill.json
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
+# torchrun (it executes the file as __main__, not as a package).
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import ep_harness  # noqa: E402  (stdlib-only; safe before torch)
+
+
+def main() -> int:
+    """Parse args, reject unsupported combos (5), init the process group, run the sweep; returns the exit code."""
+    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
+    ap.add_argument("--backend", required=True, choices=["deepep", "mori"])
+    ep_harness.add_common_args(ap)
+    args = ap.parse_args()
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12355")
+
+    # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction
+    # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL
+    # routing trace and remaps it to the balanced physical placement (a pure routing transform,
+    # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count.
+    if getattr(args, "eplb", False):
+        import eplb
+        args.num_logical_experts = args.experts
+        args.experts = eplb.physical_count(args.experts, args.num_redundant_experts, world_size)
+
+    # Reproduction provenance (recorded in the artifact); --nproc_per_node is PER NODE ⇒ LOCAL_WORLD_SIZE.
+    _nproc = os.environ.get("LOCAL_WORLD_SIZE") or world_size   # equal to world_size on a single node
+    args.reproduction_command = f"torchrun --nproc_per_node={_nproc} tests/run_ep.py " + " ".join(sys.argv[1:])
+    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
+    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
+    # Container provenance (goal P1): arch (amd64/arm64) + local squash hash for Enroot/Pyxis.
+    import platform as _plat
+    args.image_arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(_plat.machine(), _plat.machine())
+    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
+    # Complete GitHub provenance (goal P1): repo, run id, attempt, ref/branch, source SHA, job,
+    # artifact. A result is only publication-'official' when these are present (validity gate).
+    _run = {"run_id": os.environ.get("GITHUB_RUN_ID"),
+            "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+            "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
+            "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
+            "repo": os.environ.get("GITHUB_REPOSITORY"),
+            "job": os.environ.get("GITHUB_JOB"),
+            "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME")}
+    args.git_run = _run if any(_run.values()) else None
+
+    # Import the backend CLASS (module-top imports torch + the backend lib; no process
+    # group needed) and REJECT unsupported combos BEFORE init — never fall back or
+    # mislabel (review/goal). All ranks reject identically.
+    if args.backend == "mori":
+        from ep_mori import MoRIBackend as Backend
+    else:
+        from ep_deepep import DeepEPBackend as Backend
+    if args.num_ep_groups != 1:
+        if rank == 0:
+            print(f"ERROR: num_ep_groups={args.num_ep_groups} REJECTED — real subgroup process "
+                  f"groups are unimplemented; not faking it.", file=sys.stderr)
+        return 5
+    sp = getattr(Backend, "SUPPORTED_PRECISIONS", {"bf16"})
+    sm = getattr(Backend, "SUPPORTED_MODES", {"normal"})
+    if args.dispatch_dtype not in sp or args.mode not in sm:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS dispatch-dtype={args.dispatch_dtype} / "
+                  f"mode={args.mode} — not supported on this build (no fallback). "
+                  f"supported precisions={sorted(sp)} modes={sorted(sm)}.", file=sys.stderr)
+        return 5
+    # Measurement-contract capability (review #3): each adapter conforms to a declared
+    # contract; reject anything else rather than letting it pick its own timing boundary.
+    sc = getattr(Backend, "SUPPORTED_CONTRACTS", {"layout-and-dispatch-v1"})
+    if args.measurement_contract not in sc:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS measurement-contract="
+                  f"{args.measurement_contract} — supported={sorted(sc)}.", file=sys.stderr)
+        return 5
+    if args.measurement_contract == "cached-layout-comm-only-v1" and args.mode == "ll":
+        if rank == 0:
+            print("ERROR: cached-layout-comm-only-v1 is meaningless for LL (low_latency_dispatch "
+                  "computes its layout internally; nothing to hoist).", file=sys.stderr)
+        return 5
+
+    # MoRI inits its shmem on a process group it registers as "default" and wants
+    # the gloo+nccl combo with an explicit device_id (per its reference test);
+    # DeepEP uses a plain nccl group.
+    if not dist.is_initialized():
+        if args.backend == "mori":
+            dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank,
+                                    world_size=world_size, device_id=device)
+        else:
+            dist.init_process_group("nccl")
+
+    backend = Backend(args, rank, world_size, local_rank, device)
+    if rank == 0:
+        print(f"[run_ep] backend={args.backend} phase={args.phase} mode={args.mode} "
+              f"world={world_size} ep_size={world_size} hidden={args.hidden} "
+              f"topk={args.topk} experts={args.experts} dtype={args.dispatch_dtype} "
+              f"routing={args.routing} seed={args.seed}")
+
+    rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
+    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
+    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
+    return backend.finalize(rc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py
new file mode 100644
index 000000000..54465eb16
--- /dev/null
+++ b/experimental/CollectiveX/tests/workload.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+"""CollectiveX — canonical, serialized MoE routing workloads (goal Part 1: workload identity).
+
+A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent
+file, and referenced by an immutable `workload_id`. Every official benchmark point consumes the
+SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a
+checksum match, not by trusting that two machines re-ran the same seeded generator.
+
+Layout on disk (one workload = two files, basename = workload_id):
+  <dir>/<workload_id>.npz            topk_idx [gt,topk] int32, topk_weights [gt,topk] float32
+  <dir>/<workload_id>.manifest.json  dims, routing profile, generator version, seed, SHA-256s
+
+Split by dependency so it runs where each step lives:
+  * build_workload()  needs torch (via routing.py) — run on a node/container.
+  * load/verify/manifest  need only numpy + stdlib — run on a login node or in CI.
+
+Seeded runtime generation (routing.build_global_routing) stays for local dev; canonical files
+are how cross-hardware comparisons are gated.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+
+WORKLOAD_SCHEMA_VERSION = 1
+# Bump when routing.build_global_routing's numerics change so a stale file can't masquerade as
+# current. The workload_id folds this in: same id <=> same generator + params.
+GENERATOR_VERSION = "collectivex-routing-v1"
+GATE_WEIGHT_FORMAT = "softmax-of-randn-f32"   # how topk_weights are produced (see routing.py)
+
+
+def _sha256(b: bytes) -> str:
+    return hashlib.sha256(b).hexdigest()
+
+
+def compute_workload_id(routing: str, hidden: int, topk: int, experts: int,
+                        global_tokens: int, seed: int, generator: str = GENERATOR_VERSION) -> str:
+    """Deterministic id over the identity-defining params. Same params+generator => same id."""
+    key = (f"{generator}|routing={routing}|hidden={hidden}|topk={topk}|experts={experts}"
+           f"|gt={global_tokens}|seed={seed}")
+    return _sha256(key.encode())[:16]
+
+
+def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank,
+                   idx_np, weights_np, routing_stats=None):
+    """Assemble the manifest dict from the (numpy) trace arrays. Pure numpy/stdlib."""
+    idx_bytes = idx_np.astype("int32").tobytes()
+    w_bytes = weights_np.astype("float32").tobytes()
+    wid = compute_workload_id(routing, hidden, topk, experts, global_tokens, seed)
+    return {
+        "schema_version": WORKLOAD_SCHEMA_VERSION,
+        "workload_id": wid,
+        "generator_version": GENERATOR_VERSION,
+        "gate_weight_format": GATE_WEIGHT_FORMAT,
+        "dims": {"hidden": hidden, "topk": topk, "experts": experts,
+                 "global_tokens": int(global_tokens), "experts_per_rank": experts_per_rank},
+        "routing_profile": routing,
+        "seed": seed,
+        "checksums": {  # SHA-256 over the raw little-endian array bytes (int32 / float32)
+            "topk_idx": _sha256(idx_bytes),
+            "topk_weights": _sha256(w_bytes),
+            "trace": _sha256(idx_bytes + w_bytes),   # full-workload identity
+        },
+        "routing_stats": routing_stats or {},
+    }
+
+
+def build_workload(hidden, topk, experts, routing, global_tokens, seed, experts_per_rank):
+    """Generate a canonical trace. Needs torch (routing.py). Returns (idx_np, weights_np, manifest)."""
+    import numpy as np
+    import routing as _routing
+    idx_t, w_t = _routing.build_global_routing(global_tokens, experts, topk, routing, seed,
+                                               experts_per_rank)
+    rstats = _routing.routing_stats(idx_t, experts, experts_per_rank, weights=w_t)
+    idx_np = idx_t.detach().cpu().numpy().astype(np.int32)
+    w_np = w_t.detach().cpu().numpy().astype(np.float32)
+    manifest = build_manifest(routing, hidden, topk, experts, global_tokens, seed,
+                              experts_per_rank, idx_np, w_np, rstats)
+    return idx_np, w_np, manifest
+
+
+def save_workload(out_dir, idx_np, weights_np, manifest) -> str:
+    import numpy as np
+    os.makedirs(out_dir, exist_ok=True)
+    wid = manifest["workload_id"]
+    np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"),
+                        topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32))
+    with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w") as fh:
+        json.dump(manifest, fh, indent=2, sort_keys=True)
+    return wid
+
+
+def load_workload(npz_path, verify=True):
+    """Load a canonical trace (numpy + stdlib only); npz_path may omit `.npz`. Returns (idx_np, weights_np,
+    manifest). Raises ValueError if verify=True and the on-disk bytes don't match the manifest checksums."""
+    import numpy as np
+    base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path
+    with open(base + ".manifest.json") as fh:
+        manifest = json.load(fh)
+    with np.load(base + ".npz") as z:   # NpzFile keeps the archive open — release it once both arrays are read
+        idx_np, w_np = z["topk_idx"], z["topk_weights"]
+    if verify:
+        ok, reason = verify_workload(manifest, idx_np, w_np)
+        if not ok:
+            raise ValueError(f"workload checksum mismatch for {base}: {reason}")
+    return idx_np, w_np, manifest
+
+
+def verify_workload(manifest, idx_np, weights_np):
+    """Recompute checksums and the workload_id and compare to the manifest. Returns (ok, reason);
+    a manifest with missing fields is reported as a mismatch instead of raising KeyError."""
+    ib = idx_np.astype("int32").tobytes()
+    wb = weights_np.astype("float32").tobytes()
+    cs = manifest.get("checksums", {})
+    if _sha256(ib) != cs.get("topk_idx"):
+        return False, "topk_idx hash differs"
+    if _sha256(wb) != cs.get("topk_weights"):
+        return False, "topk_weights hash differs"
+    if _sha256(ib + wb) != cs.get("trace"):
+        return False, "trace hash differs"
+    dims = manifest.get("dims") or {}
+    wid = compute_workload_id(manifest.get("routing_profile"), dims.get("hidden"), dims.get("topk"),
+                              dims.get("experts"), dims.get("global_tokens"), manifest.get("seed"),
+                              manifest.get("generator_version", GENERATOR_VERSION))
+    if wid != manifest.get("workload_id"):
+        return False, f"workload_id mismatch (recomputed {wid} != {manifest.get('workload_id')})"
+    return True, "ok"
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    import sys
+    import tempfile
+    # (1) workload_id determinism + sensitivity — pure stdlib, always runs.
+    a = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
+    b = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
+    c = compute_workload_id("uniform", 7168, 8, 256, 4096, 67)
+    assert a == b, "workload_id must be deterministic"
+    assert a != c, "workload_id must depend on routing"
+    print(f"workload_id determinism OK (zipf={a} uniform={c})")
+    # (2) build/save/load/verify roundtrip + cross-build identity — needs torch+numpy.
+    try:
+        import numpy as np  # noqa: F401
+        try:
+            idx, w, man = build_workload(7168, 8, 256, "zipf", 512, 67, 32)
+            built = True
+        except Exception as exc:   # torch missing on a login node
+            print(f"(torch unavailable — synthesizing arrays to test load/verify: {exc!r})")
+            idx = np.random.default_rng(0).integers(0, 256, size=(512, 8)).astype(np.int32)
+            w = np.random.default_rng(1).random((512, 8)).astype(np.float32)
+            man = build_manifest("zipf", 7168, 8, 256, 512, 67, 32, idx, w)
+            built = False
+        with tempfile.TemporaryDirectory() as d:
+            wid = save_workload(d, idx, w, man)
+            idx2, w2, man2 = load_workload(os.path.join(d, f"{wid}.npz"), verify=True)
+            assert (idx2 == idx).all() and (w2 == w).all(), "roundtrip array mismatch"
+            ok, reason = verify_workload(man2, idx2, w2)
+            assert ok, reason
+            # tamper -> must fail
+            idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256
+            bad, _ = verify_workload(man2, idx2, w2)
+            assert not bad, "verify must catch tampering"
+        print(f"save/load/verify roundtrip OK (workload_id={wid}, built_via_torch={built})")
+    except ImportError:
+        print("(numpy unavailable — skipped serialization roundtrip; id logic passed)")
+    print("workload self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/validate_results.py b/experimental/CollectiveX/validate_results.py
new file mode 100644
index 000000000..584674ab1
--- /dev/null
+++ b/experimental/CollectiveX/validate_results.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""CollectiveX result validator (goal Part 1: schema + validation tooling).
+
+Validates EP result JSON docs against ep-result-v4 and the project's semantic gates:
+schema shape, provenance completeness, workload identity (incl. cross-run trace-signature
+agreement within a comparison_key), measurement-contract membership, byte-contract presence,
+sample counts, and — crucially — that `publication_status` is the MACHINE-DERIVED function of
+`validity` (no doc may hand-label itself official). Exits non-zero when any doc claims
+`official` but fails a gate (or, with --require-official, when any doc isn't official).
+
+Pure stdlib; uses `jsonschema` if importable, else a built-in required-key/type/enum check.
+v3 docs (no publication_status) load as legacy/experimental and are reported, not failed.
+
+  python3 validate_results.py results/*.json
+  python3 validate_results.py --require-official --schema schemas/ep-result-v4.schema.json results/
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+import sys
+
+MIN_SAMPLES_OFFICIAL = 100
+KNOWN_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}
+PUB_STATES = {"official", "comparable-experimental", "diagnostic", "invalid", "failed"}
+
+
+def derive_publication_status(v: dict) -> str:
+    """MUST mirror ep_harness._derive_publication_status — the validator's job is to confirm the
+    recorded status equals this derivation."""
+    if v.get("execution_status") != "complete":
+        return "failed"
+    if (v.get("semantic_correctness") != "pass" or v.get("measurement_conformance") != "conformant"
+            or v.get("workload_identity") == "inconsistent"):
+        return "invalid"
+    sound = (v.get("semantic_correctness") == "pass"
+             and str(v.get("workload_identity", "")).startswith("consistent")
+             and v.get("measurement_conformance") == "conformant")
+    if str(v.get("resource_conformance", "")).endswith("nonconforming"):
+        return "diagnostic"
+    if sound and v.get("provenance_complete") and v.get("workload_source") == "canonical-serialized":
+        return "official"
+    if sound:
+        return "comparable-experimental"
+    return "diagnostic"
+
+
+def _schema_check(doc, schema):
+    """jsonschema if available; else a pragmatic required-keys/enum check of the top level + rows."""
+    try:
+        import jsonschema
+        jsonschema.validate(doc, schema)
+        return []
+    except ImportError:
+        errs = []
+        for k in schema.get("required", []):
+            if k not in doc:
+                errs.append(f"missing required field '{k}'")
+        # enum spot-checks the built-in path can do cheaply
+        ms = doc.get("measurement_contract")
+        if ms is not None and ms not in KNOWN_CONTRACTS:
+            errs.append(f"unknown measurement_contract '{ms}'")
+        ps = doc.get("publication_status")
+        if ps is not None and ps not in PUB_STATES:
+            errs.append(f"unknown publication_status '{ps}'")
+        if not doc.get("rows"):
+            errs.append("no rows")
+        return errs
+    except Exception as exc:   # jsonschema.ValidationError
+        return [f"schema: {exc.message if hasattr(exc, 'message') else exc}"]
+
+
+def validate_doc(doc, schema, path):
+    """Gate one result doc; returns (errs, warns, status). Null validity/rows/per-op stats are reported, not raised."""
+    errs, warns = [], []
+    if "publication_status" not in doc:
+        warns.append("legacy (v3, no publication_status) — loads as experimental, not comparable as official")
+        return errs, warns, "legacy-experimental"
+    errs += _schema_check(doc, schema) if schema else []
+    v = doc.get("validity") or {}
+    recorded, derived = doc.get("publication_status"), derive_publication_status(v)
+    if recorded != derived:
+        errs.append(f"publication_status '{recorded}' != machine-derived '{derived}' (validity tampered or stale)")
+    # byte + contract + sample gates
+    if doc.get("measurement_contract") not in KNOWN_CONTRACTS:
+        errs.append(f"unknown measurement_contract {doc.get('measurement_contract')}")
+    rows = doc.get("rows") or []
+    for r in rows:
+        if "byte_contracts" not in r:
+            errs.append(f"T={r.get('tokens_per_rank')}: missing byte_contracts"); break
+        for op in ("dispatch", "combine", "roundtrip"):
+            if "p99" not in (r.get(op) or {}):
+                errs.append(f"T={r.get('tokens_per_rank')}: missing {op} percentiles"); break
+    if recorded == "official":  # official-grade gates
+        if not v.get("provenance_complete"):
+            errs.append("official but provenance_complete=false")
+        if v.get("workload_source") != "canonical-serialized":
+            errs.append("official but workload not canonical-serialized")
+        if not rows:   # an empty sweep would otherwise pass both the sample and correctness gates
+            errs.append("official but no rows")
+        elif min((r.get("samples_pooled") or 0) for r in rows) < MIN_SAMPLES_OFFICIAL:
+            errs.append(f"official but a point has <{MIN_SAMPLES_OFFICIAL} pooled samples")
+        if not all(r.get("correct") for r in rows):
+            errs.append("official but a point failed correctness")
+    return errs, warns, recorded
+
+
+def main() -> int:
+    """CLI entry: validate every MoE result doc under the given paths; returns 1 if any problem was found, else 0."""
+    ap = argparse.ArgumentParser(description="CollectiveX EP result validator")
+    ap.add_argument("paths", nargs="+", help="result JSON files or dirs")
+    ap.add_argument("--schema", default=os.path.join(os.path.dirname(__file__), "schemas", "ep-result-v4.schema.json"))
+    ap.add_argument("--require-official", action="store_true",
+                    help="fail if any non-legacy doc is not 'official'")
+    a = ap.parse_args()
+    schema = None
+    if a.schema and os.path.exists(a.schema):
+        with open(a.schema) as fh:
+            schema = json.load(fh)
+    files = []
+    for p in a.paths:
+        files += glob.glob(os.path.join(p, "**", "*.json"), recursive=True) if os.path.isdir(p) else [p]
+    files = sorted(f for f in files if not os.path.basename(f).startswith("env_"))
+
+    # cross-run workload identity: trace_signature must agree within a comparison_key.
+    by_ck, bad = {}, 0
+    for f in files:
+        try:
+            with open(f) as fh:
+                doc = json.load(fh)
+        except (ValueError, OSError) as exc:   # bad JSON / bad encoding / unreadable: say so, keep going
+            print(f"[SKIP] {os.path.basename(f):70s} unreadable ({exc})")
+            continue
+        if not isinstance(doc, dict) or doc.get("family") != "moe":   # non-object JSON is not a result doc
+            continue
+        errs, warns, status = validate_doc(doc, schema, f)
+        ck = doc.get("comparison_key")
+        sig = (doc.get("workload") or {}).get("trace_signature")
+        if ck and sig:
+            by_ck.setdefault(ck, {}).setdefault(sig, []).append(os.path.basename(f))
+        # legacy (v3) docs are reported, never failed — as --require-official's help text promises.
+        if a.require_official and status not in ("official", "legacy-experimental") and not errs:
+            errs = [f"not official (status={status})"]
+        tag = "FAIL" if errs else "OK"
+        bad += 1 if errs else 0
+        print(f"[{tag}] {os.path.basename(f):70s} status={status}")
+        for e in errs:
+            print(f"        ERROR: {e}")
+        for w in warns:
+            print(f"        note: {w}")
+    # report cross-run identity disagreements (different hardware, same config, different trace)
+    for ck, sigs in by_ck.items():
+        if len(sigs) > 1:
+            bad += 1
+            print(f"[FAIL] comparison_key {str(ck)[:12]}: {len(sigs)} DIFFERENT trace signatures — not the same workload:")
+            for sig, fs in sigs.items():
+                print(f"        {sig}: {', '.join(fs)}")
+    print(f"\n{'FAILED' if bad else 'PASS'}: {len(files)} files, {bad} problem(s)")
+    return 1 if bad else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())