SemiAnalysisAI · Oseltamivir · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
@@ -0,0 +1,254 @@
+name: CollectiveX Experimental
+
+# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
+# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no
+# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane
+# for GB200/B200 NCCL, DeepEP, and larger sweeps). Each benchmark job lands on the
+# SKU's self-hosted runner and invokes that SKU's launch script — the same
+# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use. A final
+# GitHub-hosted job notifies the InferenceX-app repo once a benchmark job succeeds.
+
+on:
+  # Automatic lane: a push to the feature branch that touches the benchmark tree
+  # or this workflow file.
+  push:
+    branches:
+      - collectivex
+    paths:
+      - "experimental/CollectiveX/**"
+      - ".github/workflows/collectivex-experimental.yml"
+  # Manual lane: the operator picks the runner pool, the benchmark and its knobs.
+  workflow_dispatch:
+    inputs:
+      sku:
+        # The launcher is chosen from the prefix of runner.name, so the list is
+        # limited to SKUs that have a launchers/launch_<prefix>.sh — any SKU
+        # without one would fail.
+        description: Self-hosted runner pool (must have a CollectiveX launcher)
+        type: choice
+        options:
+          - gb200
+          - b200-dgxc
+          - b200-multinode
+          - mi355x
+          - h100-dgxc
+          - h200
+          - b300
+          - gb300
+        default: gb200
+      benchmark:
+        # mori is MI355X-only; nccl, deepep and all target the NVIDIA SKUs.
+        description: Which benchmark to run
+        type: choice
+        options:
+          - nccl
+          - deepep
+          - mori
+          - all
+        default: nccl
+      ops:
+        description: NCCL ops (space-separated); blank = default set
+        type: string
+        default: ""
+      min_bytes:
+        description: nccl-tests min message size
+        type: string
+        default: "8"
+      max_bytes:
+        description: nccl-tests max message size
+        type: string
+        default: "8G"
+      ngpus:
+        description: GPUs per node (blank = SKU default)
+        type: string
+        default: ""
+      nodes:
+        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
+        type: string
+        default: ""
+      phase:
+        # Applies to the EP benchmarks only. 'both' expands into one job per phase
+        # (a decode job and a prefill job).
+        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
+        type: choice
+        options:
+          - both
+          - decode
+          - prefill
+        default: both
+      tokens_ladder:
+        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
+        type: string
+        default: ""
+      dispatch_dtype:
+        description: EP dispatch payload precision
+        type: choice
+        options:
+          - bf16
+          - fp8
+        default: bf16
+      mode:
+        # normal: the high-throughput kernels (decode + prefill).
+        # ll:     DeepEP low-latency (decode-shaped, fp8 cast in-kernel).
+        # LL is rejected on backends that lack it (MoRI) and aborts on fabrics that
+        # lack it (B300), so select it only where it is supported.
+        description: EP kernel path — normal or low-latency (LL)
+        type: choice
+        options:
+          - normal
+          - ll
+        default: normal
+      resource_mode:
+        # normalized: ~sm_fraction of device units (cross-vendor apples-to-apples).
+        # tuned:      each backend's own recommended/default launch config.
+        # NOTE(review): 'default' is offered but not described here — document it
+        # or confirm it is intentionally distinct from 'tuned'.
+        description: Comm resource regime
+        type: choice
+        options:
+          - normalized
+          - tuned
+          - default
+        default: normalized
+      contract:
+        # layout-and-dispatch-v1:     dispatch timing includes routing-layout
+        #                             generation; the only contract MoRI honors, so
+        #                             use it for cross-vendor comparisons.
+        # cached-layout-comm-only-v1: layout hoisted out of the timed region, i.e.
+        #                             pure-comm dispatch (DeepEP normal only).
+        description: Measurement contract (timing boundary)
+        type: choice
+        options:
+          - layout-and-dispatch-v1
+          - cached-layout-comm-only-v1
+        default: layout-and-dispatch-v1
+      routing:
+        # Routing distribution of the shared trace: uniform = realistic,
+        # balanced = load-equalized, zipf* = skewed, hotspot-single = one hot
+        # expert. This is where the skew + EPLB sweep is driven from.
+        description: EP routing distribution
+        type: choice
+        options:
+          - uniform
+          - balanced
+          - zipf
+          - zipf-mild
+          - zipf-moderate
+          - zipf-heavy
+          - hotspot-single
+        default: uniform
+      eplb:
+        # EPLB replicates hot experts and places them balanced — the remedy for
+        # skewed routing. It is a pure routing-trace transform (experts become
+        # num_logical + redundant) and is meaningful with the zipf* routings.
+        description: Apply EPLB expert replication/placement
+        type: boolean
+        default: false
+
+concurrency:
+  # Group per (SKU + config): GitHub keeps only one running + one pending run per group and
+  # cancels the rest, so a coarse per-SKU group made a fan-out of many configs on one SKU
+  # self-cancel down to ~2. Every input that distinguishes one sweep point from another must
+  # therefore be part of the key: benchmark/dtype/mode/resource_mode/contract/routing/eplb/
+  # phase/nodes/ngpus each get their OWN group -> all configs survive; they queue only on the
+  # runner's own capacity, not on GitHub concurrency. The free-form inputs (ops, min/max
+  # bytes, tokens_ladder) are deliberately left out of the key, so dispatches differing only
+  # in those still serialize. On push every `inputs.*` is empty, so all pushes to a ref share
+  # a single group. cancel-in-progress FALSE so a re-dispatch of the SAME config queues.
+  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}-${{ inputs.benchmark }}-${{ inputs.dispatch_dtype }}-${{ inputs.mode }}-${{ inputs.resource_mode }}-${{ inputs.contract }}-${{ inputs.routing }}-${{ inputs.eplb }}-${{ inputs.phase }}-${{ inputs.nodes }}-${{ inputs.ngpus }}
+  cancel-in-progress: false
+
+# Least privilege for the workflow's GITHUB_TOKEN: read-only repository contents.
+# The cross-repo dispatch in update-frontend-snapshot authenticates with a
+# separate secret (INFX_FRONTEND_PAT), not with this token.
+permissions:
+  contents: read
+
+jobs:
+  # Push lane: MI355X MoRI dispatch/combine. The job is picked up by a free
+  # mi355x-amds runner and executes launch_mi355x-amds.sh with CX_BENCH=mori.
+  # No CX_STAGE_DIR is set because the AMD workspace is compute-visible, and the
+  # launcher falls back to 8 GPUs on its own.
+  experimental:
+    name: CollectiveX Experimental (${{ matrix.phase }})
+    if: github.event_name == 'push'
+    runs-on: mi355x
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        # A push only runs a quick MoRI SMOKE (decode); the full sweep is reserved
+        # for workflow_dispatch.
+        phase:
+          - decode
+    env:
+      CX_BENCH: mori
+      CX_PHASE: ${{ matrix.phase }}
+      # The smoke ladder stops at T<=16. MoRI with realistic (fan-out≈5.3) routing
+      # currently WEDGES at T>=32 (under investigation; DeepEP is unaffected), and
+      # an unguarded run hung for ~1 h until the job timeout fired. The push smoke
+      # stays inside the known-good range; the full sweep goes through
+      # workflow_dispatch (timeout-guarded). Drop the cap once the hang is fixed.
+      CX_TOKENS_LADDER: '1 2 4 8 16'
+      CX_RUN_TIMEOUT: '600'
+      # Restrict scheduling to the MI355X nodes that already hold the node-local
+      # squash and have a writable /var/lib/squash. Any other node needs a slow
+      # cold import that can fail on lock/cache permissions. Widen the list once
+      # the squash is staged cluster-wide.
+      CX_NODELIST: mia1-p01-g10,mia1-p01-g15
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with:
+          clean: true
+      # The launcher name is derived from the runner name up to its first '_'.
+      - name: Launch MI355X MoRI (${{ matrix.phase }})
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      # Summary and upload run even when the launch step failed.
+      - if: always()
+        name: Results summary
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+      - if: always()
+        name: Upload results
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
+
+  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner.
+  dispatch:
+    if: github.event_name == 'workflow_dispatch'
+    # The bare `h200` label spans TWO clusters: 14 h200-dgxc runners (login-0; the EP
+    # path is validated there) and 2 h200-cw (CoreWeave) runners that have no
+    # launch_h200-cw.sh and die exit 127. Pin h200 to the h200-dgxc pool so every
+    # dispatch lands where the launcher + FS + partition are known-good. Other SKUs are
+    # single-pool, so pass the sku through unchanged.
+    runs-on: ${{ inputs.sku == 'h200' && 'h200-dgxc' || inputs.sku }}
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        # nccl/rccl are collective primitives — phase is meaningless, so run ONE job (not
+        # the same work twice). EP backends: 'both' -> decode + prefill; else a single job.
+        # NOTE(review): 'rccl' is not among the offered benchmark options, so only the
+        # 'nccl' arm of the first condition can match today.
+        phase: ${{ fromJSON((inputs.benchmark == 'nccl' || inputs.benchmark == 'rccl') && '["na"]' || (inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase))) }}
+    # Every dispatch input except sku is exported to the job as a CX_* variable.
+    env:
+      CX_BENCH: ${{ inputs.benchmark }}
+      CX_OPS: ${{ inputs.ops }}
+      CX_MIN_BYTES: ${{ inputs.min_bytes }}
+      CX_MAX_BYTES: ${{ inputs.max_bytes }}
+      CX_NGPUS: ${{ inputs.ngpus }}
+      CX_NODES: ${{ inputs.nodes }}
+      CX_PHASE: ${{ matrix.phase }}
+      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
+      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
+      CX_MODE: ${{ inputs.mode }}
+      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
+      CX_MEASUREMENT_CONTRACT: ${{ inputs.contract }}
+      CX_ROUTING: ${{ inputs.routing }}
+      # Boolean input flattened to '1' (enabled) or the empty string (disabled).
+      CX_EPLB: ${{ inputs.eplb && '1' || '' }}
+      # GHA run provenance: run_ep records git_run (repo/run/attempt/ref/sha/job) -> a GHA result
+      # is provenance_complete (publication_status >= comparable-experimental, official w/ canonical).
+      COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
+      COLLECTIVEX_ARTIFACT_NAME: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+      # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
+      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+      # MI355X: pin to the warm-squash, writable nodes (see the push job).
+      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      # Reject an unsupported backend/SKU/mode/dtype/contract BEFORE consuming the runner
+      # (review #3): fail fast on the login node, not after a salloc. 'all' fans out per
+      # vendor in-container, so skip the single-combo check for it.
+      #
+      # The values reach the script through environment variables rather than through
+      # `${{ }}` text substitution, so an input can never be parsed as shell syntax.
+      # backend/mode/dtype/contract reuse the job-level CX_* variables (identical to the
+      # corresponding inputs); sku has no job-level variable, so it gets a step-level one.
+      - name: Validate capability
+        if: inputs.benchmark != 'all'
+        env:
+          DISPATCH_SKU: ${{ inputs.sku }}
+        run: |
+          python3 experimental/CollectiveX/tests/capability.py \
+            --sku "$DISPATCH_SKU" --backend "$CX_BENCH" \
+            --mode "$CX_MODE" --dtype "$CX_DISPATCH_DTYPE" \
+            --contract "$CX_MEASUREMENT_CONTRACT"
+      # The launcher name is derived from the runner name up to its first '_'.
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      # Summary and upload run even when validation or the launch failed.
+      - name: Results summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
+
+  # Notifies the InferenceX-app repository (repository_dispatch) after the
+  # benchmark job that matches the triggering event has succeeded.
+  update-frontend-snapshot:
+    name: Update InferenceX-app snapshot
+    runs-on: ubuntu-latest
+    needs:
+      - experimental
+      - dispatch
+    # Exactly one of the two needed jobs runs per event and the other is skipped,
+    # so always() keeps this job eligible; the explicit result checks then require
+    # success of the job belonging to the current event.
+    if: >-
+      always() && (
+      (github.event_name == 'push' && needs.experimental.result == 'success') ||
+      (github.event_name == 'workflow_dispatch' && needs.dispatch.result == 'success')
+      )
+    steps:
+      - name: Trigger CollectiveX snapshot update
+        env:
+          # The token is handed to the script via the environment, not inlined.
+          FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }}
+        run: |
+          set -euo pipefail
+          curl --silent --show-error --fail --request POST \
+            --header "Authorization: Bearer $FRONTEND_PAT" \
+            --header "Accept: application/vnd.github+json" \
+            --header "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
+            --data '{
+              "event_type": "update-collectivex-data",
+              "client_payload": {
+                "source_run_id": "${{ github.run_id }}"
+              }
+            }'
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,18 @@
+# Build cache for nccl-tests compiled inside the container.
+.nccl-tests/
+# Python bytecode caches.
+__pycache__/
+*.pyc
+# Generated run artifacts: the captured environment embeds hostnames, GPU UUIDs
+# and NIC GUIDs, so results stay out of git (CI uploads them as workflow
+# artifacts instead). Sanitized headline numbers live in CONTAINERS.md.
+# The raw_* files are presumably raw benchmark output captures — confirm
+# against the launchers.
+results/*.json
+results/plots/
+results/raw_*.txt
+results/raw_*.txt.stderr
+# Superseded SSH-provenance result JSONs, moved aside so plot_ep's recursive glob
+# won't double-load them; same hostname/UUID sensitivity as results/.
+_ssh_v4_archive/
+# Local-only working files (running reflection log); never committed artifacts.
+notes.md
+goal.md