SemiAnalysisAI · Oseltamivir · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 26, 2026
@@ -11955,6 +11955,221 @@ minimaxm3-fp8-b300-dynamo-vllm:
           ep: 8
           dp-attn: false
 
+# MiniMax-M3 NVFP4 disaggregated sweep, same B300 topology matrix as the MXFP8
+# baseline above. The runner setup script overlays vLLM PR #46380 (NVFP4 support).
+minimaxm3-fp4-b300-dynamo-vllm:
+  image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223  # same tag as model.container in the 1p1d-dep2-tep8 recipe
+  model: nvidia/MiniMax-M3-NVFP4  # same value as model.path in the 1p1d-dep2-tep8 recipe
+  model-prefix: minimaxm3
+  runner: b300
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true  # every search-space entry below configures prefill and decode workers separately
+  scenarios:
+    fixed-seq-len:  # one list item per (isl, osl) pair, each with its own search-space
+    - isl: 1024  # 1k input / 1k output; every entry here points at a recipe under b300-fp4/1k1k/
+      osl: 1024
+      search-space:  # xPyD = prefill/decode num-worker; DEP/TEP/TP shorthand follows the recipe filenames
+      - conc-list: [4, 16, 64, 128, 4096]  # 1P1D: DEP2 prefill -> TEP8 decode; same points as `concurrencies` in the recipe
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true  # "dep2": the referenced recipe runs this as TP1 x DP2 with expert parallel
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false  # "tep8": the referenced recipe runs this as TP8 with expert parallel
+      - conc-list: [1, 4, 8, 16]  # 1P1D: DEP2 prefill -> TP4 decode (low-concurrency points)
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1  # "tp4": the only decode layout in this scenario with ep set to 1
+          dp-attn: false
+      - conc-list: [2048]  # 1P2D: DEP2 prefill -> DEP4 decode
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 4
+          ep: 4
+          dp-attn: true
+      - conc-list: [512, 4096]  # 2P1D: DEP2 prefill -> DEP8 decode (4096 is also swept with 1P1D TEP8 above)
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - conc-list: [32]  # 2P1D: DEP2 prefill -> TEP8 decode
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [16]  # 2P2D: DEP2 prefill -> TEP8 decode
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [4]  # 3P2D: DEP2 prefill -> TEP8 decode
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+    - isl: 8192  # 8k input / 1k output; every entry here points at a recipe under b300-fp4/8k1k/
+      osl: 1024
+      search-space:  # xPyD = prefill/decode num-worker; DEP/TEP/TP shorthand follows the recipe filenames
+      - conc-list: [256, 512]  # 2P2D: DEP2 prefill -> DEP8 decode
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true  # tp 2 + ep 2 + dp-attn is what the recipe names call "dep2"
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true  # dp-attn on decode is what separates "dep8" from "tep8"
+      - conc-list: [16]  # 2P2D: DEP2 prefill -> TEP8 decode
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [4096]  # 4P2D: DEP2 prefill -> DEP8 decode
+        prefill:
+          num-worker: 4
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - conc-list: [1, 4, 8, 16]  # 1P1D: DEP2 prefill -> TP4 decode (low-concurrency points)
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1  # "tp4": the only decode layout in this scenario with ep set to 1
+          dp-attn: false
+      - conc-list: [4096]  # 4P2D: DEP2 prefill -> TEP4 decode (4096 is also swept with DEP8 decode above)
+        prefill:
+          num-worker: 4
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 4
+          ep: 4
+          dp-attn: false
+      - conc-list: [16, 32, 64, 128]  # 1P4D: DEP2 prefill -> TEP4 decode
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml"
+        decode:
+          num-worker: 4
+          tp: 4
+          ep: 4
+          dp-attn: false
+      - conc-list: [16]  # 1P2D: DEP2 prefill -> TEP4 decode (16 is swept under several topologies in this scenario)
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 4
+          ep: 4
+          dp-attn: false
+      - conc-list: [4]  # 1P4D: DEP2 prefill -> TEP8 decode
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml"
+        decode:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: false
+
 # MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863.
 # All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
 # DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped.

diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Overlay MiniMax-M3 ModelOpt NVFP4 support onto the vLLM package installed in
+# the current Python environment:
+#   1. replace three vLLM modules with their versions from a pinned commit of
+#      vllm-project/vllm#46380;
+#   2. patch sparse_attention_msa.py so the prefill top-k slice is contiguous;
+#   3. import the overlaid MoE module as a smoke test.
+# Re-running is safe: step 1 rewrites the same pinned content and step 2 is
+# skipped once the patched line is already present.
+set -euo pipefail
+
+VLLM_NVFP4_COMMIT="6c08558112acd2fd8b4bfc270104d556eb77f9bf"
+VLLM_RAW_BASE="https://raw.githubusercontent.com/vllm-project/vllm/${VLLM_NVFP4_COMMIT}/vllm"
+
+# Locate the package through its import spec instead of importing it, so that
+# nothing vllm prints while importing can end up in VLLM_ROOT. This is the same
+# lookup the patch step below uses.
+VLLM_ROOT="$(python3 -c '
+import os
+from importlib.util import find_spec
+
+spec = find_spec("vllm")
+if not spec or not spec.origin:
+    raise SystemExit("vllm is not installed")
+print(os.path.dirname(spec.origin))
+')"
+
+OVERLAY_FILES=(
+    model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
+    model_executor/layers/quantization/modelopt.py
+    model_executor/layers/quantization/utils/flashinfer_utils.py
+)
+
+# Download everything into a scratch directory first and only then copy into
+# the package, so a failed or interrupted transfer cannot leave vllm with a
+# truncated module or a mix of old and new files.
+STAGING_DIR="$(mktemp -d)"
+trap 'rm -rf "${STAGING_DIR}"' EXIT
+
+for file in "${OVERLAY_FILES[@]}"; do
+    # --retry covers transient network errors; -f keeps HTTP errors fatal.
+    curl -fsSL --retry 3 --retry-delay 2 --create-dirs \
+        "${VLLM_RAW_BASE}/${file}" \
+        -o "${STAGING_DIR}/${file}"
+done
+
+# The target directories must already exist inside the package (as with the
+# direct download this replaces); a missing one aborts the script here.
+for file in "${OVERLAY_FILES[@]}"; do
+    cp "${STAGING_DIR}/${file}" "${VLLM_ROOT}/${file}"
+done
+
+# In-place, idempotent source patch: append .contiguous() to the prefill top-k
+# slice. NOTE(review): presumably a downstream consumer needs a contiguous
+# tensor -- confirm against the upstream PR.
+python3 - <<'PYEOF'
+from importlib.util import find_spec
+from pathlib import Path
+
+spec = find_spec("vllm")
+if not spec or not spec.origin:
+    raise RuntimeError("vllm is not installed")
+path = Path(spec.origin).parent / "models/minimax_m3/nvidia/sparse_attention_msa.py"
+source = path.read_text()
+old = "            prefill_topk = topk[:, nd:num_tokens, :]\n"
+new = "            prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n"
+# Already patched -> nothing to do. Otherwise the anchor must match exactly
+# once, so an unexpected upstream layout fails loudly instead of mis-patching.
+if new not in source:
+    if source.count(old) != 1:
+        raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
+    path.write_text(source.replace(old, new, 1))
+PYEOF
+
+# Smoke test: the overlaid MoE module must import and expose the expected class.
+python3 -c \
+    "from vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe import TrtLlmNvFp4ExpertsModular; print('[nvfp4-patch] OK')"
diff --git a/...marks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml b/...marks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tep8-1k1k"  # 1 prefill + 1 decode worker, NVFP4 on B300, isl/osl 1024
+
+model:
+  path: "nvidia/MiniMax-M3-NVFP4"
+  container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223"  # same tag as `image` in the sweep entry
+  precision: "fp4"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1  # matches prefill num-worker in the sweep entry that references this file
+  decode_workers: 1  # matches decode num-worker in the sweep entry that references this file
+  gpus_per_prefill: 2  # = prefill tensor-parallel-size 1 x data-parallel-size 2 below
+  gpus_per_decode: 8  # = decode tensor-parallel-size 8 below
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614  # NOTE(review): unquoted; loads as a string only because it is not a valid number
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NOTE(review): presumably unset because kv-transfer-config is given per role below -- confirm
+
+  prefill_environment:  # identical to decode_environment
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:  # the "dep2" half of the recipe name
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'  # same connector config as decode
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304  # = isl 1024 + osl 1024 + 256
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:  # the "tep8" half of the recipe name
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304  # = isl 1024 + osl 1024 + 256
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 4096  # equals the largest concurrency listed under benchmark
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s; presumably the startup polling budget -- confirm
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x16x64x128x4096"  # presumably "x"-separated; same values as conc-list [4, 16, 64, 128, 4096] in the sweep entry
+  req_rate: "inf"