Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11955,6 +11955,221 @@ minimaxm3-fp8-b300-dynamo-vllm:
ep: 8
dp-attn: false

# MiniMax-M3 NVFP4 disagg sweep on the same B300 topology matrix as the MXFP8
# baseline above. vLLM PR #46380 is overlaid by the runner setup script.
minimaxm3-fp4-b300-dynamo-vllm:
  # Container, checkpoint and runner selection for the NVFP4 sweep.
  image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223
  model: nvidia/MiniMax-M3-NVFP4
  model-prefix: minimaxm3
  runner: b300
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
    # ---------------------------------------------------------------
    # 1k input / 1k output
    # ---------------------------------------------------------------
    - isl: 1024
      osl: 1024
      search-space:
      # 1 prefill worker, 1 decode worker; recipe 1p1d-dep2-tep8.
      - conc-list: [4, 16, 64, 128, 4096]
        prefill:
          num-worker: 1
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: false
      # 1 prefill worker, 1 decode worker; recipe 1p1d-dep2-tp4.
      - conc-list: [1, 4, 8, 16]
        prefill:
          num-worker: 1
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 1
          dp-attn: false
      # 1 prefill worker, 2 decode workers; recipe 1p2d-dep2-dep4.
      - conc-list: [2048]
        prefill:
          num-worker: 1
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml"
        decode:
          num-worker: 2
          tp: 4
          ep: 4
          dp-attn: true
      # 2 prefill workers, 1 decode worker; recipe 2p1d-dep2-dep8.
      - conc-list: [512, 4096]
        prefill:
          num-worker: 2
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true
      # 2 prefill workers, 1 decode worker; recipe 2p1d-dep2-tep8.
      - conc-list: [32]
        prefill:
          num-worker: 2
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: false
      # 2 prefill workers, 2 decode workers; recipe 2p2d-dep2-tep8.
      - conc-list: [16]
        prefill:
          num-worker: 2
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml"
        decode:
          num-worker: 2
          tp: 8
          ep: 8
          dp-attn: false
      # 3 prefill workers, 2 decode workers; recipe 3p2d-dep2-tep8.
      - conc-list: [4]
        prefill:
          num-worker: 3
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml"
        decode:
          num-worker: 2
          tp: 8
          ep: 8
          dp-attn: false
    # ---------------------------------------------------------------
    # 8k input / 1k output
    # ---------------------------------------------------------------
    - isl: 8192
      osl: 1024
      search-space:
      # 2 prefill workers, 2 decode workers; recipe 2p2d-dep2-dep8.
      - conc-list: [256, 512]
        prefill:
          num-worker: 2
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml"
        decode:
          num-worker: 2
          tp: 8
          ep: 8
          dp-attn: true
      # 2 prefill workers, 2 decode workers; recipe 2p2d-dep2-tep8.
      - conc-list: [16]
        prefill:
          num-worker: 2
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml"
        decode:
          num-worker: 2
          tp: 8
          ep: 8
          dp-attn: false
      # 4 prefill workers, 2 decode workers; recipe 4p2d-dep2-dep8.
      - conc-list: [4096]
        prefill:
          num-worker: 4
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml"
        decode:
          num-worker: 2
          tp: 8
          ep: 8
          dp-attn: true
      # 1 prefill worker, 1 decode worker; recipe 1p1d-dep2-tp4.
      - conc-list: [1, 4, 8, 16]
        prefill:
          num-worker: 1
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 1
          dp-attn: false
      # 4 prefill workers, 2 decode workers; recipe 4p2d-dep2-tep4.
      - conc-list: [4096]
        prefill:
          num-worker: 4
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml"
        decode:
          num-worker: 2
          tp: 4
          ep: 4
          dp-attn: false
      # 1 prefill worker, 4 decode workers; recipe 1p4d-dep2-tep4.
      - conc-list: [16, 32, 64, 128]
        prefill:
          num-worker: 1
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml"
        decode:
          num-worker: 4
          tp: 4
          ep: 4
          dp-attn: false
      # 1 prefill worker, 2 decode workers; recipe 1p2d-dep2-tep4.
      - conc-list: [16]
        prefill:
          num-worker: 1
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml"
        decode:
          num-worker: 2
          tp: 4
          ep: 4
          dp-attn: false
      # 1 prefill worker, 4 decode workers; recipe 1p4d-dep2-tep8.
      - conc-list: [4]
        prefill:
          num-worker: 1
          tp: 2
          ep: 2
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml"
        decode:
          num-worker: 4
          tp: 8
          ep: 8
          dp-attn: false

# MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863.
# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
# DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Overlay MiniMax-M3 ModelOpt NVFP4 support onto the vLLM package installed in
# the image, patch the sparse-attention prefill top-k slice to be contiguous,
# and finally verify that the overlaid MoE module imports.
set -euo pipefail

# MiniMax-M3 ModelOpt NVFP4 support from vllm-project/vllm#46380.
VLLM_NVFP4_COMMIT="6c08558112acd2fd8b4bfc270104d556eb77f9bf"

# Locate the package without importing it: find_spec does not execute
# vllm/__init__.py, so import-time output cannot leak into the captured path.
VLLM_ROOT="$(python3 -c '
import os
from importlib.util import find_spec

spec = find_spec("vllm")
if not spec or not spec.origin:
    raise SystemExit("vllm is not installed")
print(os.path.dirname(spec.origin))
')"

# Files (relative to the vllm package root) replaced with the pinned commit.
NVFP4_FILES=(
    model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
    model_executor/layers/quantization/modelopt.py
    model_executor/layers/quantization/utils/flashinfer_utils.py
)

# Pass 1: download every file to a staging name beside its destination.
# Nothing in the install is modified until all downloads have succeeded, so a
# network failure cannot leave a truncated or half-overlaid package behind.
for file in "${NVFP4_FILES[@]}"; do
    if ! curl -fsSL --retry 3 --retry-delay 2 \
        "https://raw.githubusercontent.com/vllm-project/vllm/${VLLM_NVFP4_COMMIT}/vllm/${file}" \
        -o "${VLLM_ROOT}/${file}.nvfp4-new"; then
        echo "[nvfp4-patch] failed to download ${file}" >&2
        for staged in "${NVFP4_FILES[@]}"; do
            rm -f "${VLLM_ROOT}/${staged}.nvfp4-new"
        done
        exit 1
    fi
done

# Pass 2: swap the staged files into place.
for file in "${NVFP4_FILES[@]}"; do
    mv -f "${VLLM_ROOT}/${file}.nvfp4-new" "${VLLM_ROOT}/${file}"
done

# Make the prefill top-k slice contiguous. The edit is idempotent: it is
# skipped when the patched line is already present, and it refuses to run
# unless the anchor line occurs exactly once.
python3 - <<'PYEOF'
from importlib.util import find_spec
from pathlib import Path

spec = find_spec("vllm")
if not spec or not spec.origin:
    raise RuntimeError("vllm is not installed")
path = Path(spec.origin).parent / "models/minimax_m3/nvidia/sparse_attention_msa.py"
source = path.read_text()
old = "        prefill_topk = topk[:, nd:num_tokens, :]\n"
new = "        prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n"
if new not in source:
    if source.count(old) != 1:
        raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
    path.write_text(source.replace(old, new, 1))
PYEOF

# Smoke test: the overlaid module must import and expose the expected class.
python3 -c \
    "from vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe import TrtLlmNvFp4ExpertsModular; print('[nvfp4-patch] OK')"
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Disaggregated vLLM recipe: 1 prefill worker (2 GPUs) feeding 1 decode worker
# (8 GPUs) on B300, benchmarked at ISL 1024 / OSL 1024.
# NOTE(review): nesting was reconstructed from a flattened copy; placing the
# *_environment and vllm_config sections under `backend` should be confirmed
# against the recipe schema.
name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tep8-1k1k"

# Checkpoint and container image.
model:
  path: "nvidia/MiniMax-M3-NVFP4"
  container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223"
  precision: "fp4"

# Node and GPU allocation for each role.
resources:
  gpu_type: "b300"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 2
  gpus_per_decode: 8

dynamo:
  install: true
  # Quoted so the version can only ever be read as a string.
  version: "1.3.0.dev20260614"

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

backend:
  type: "vllm"
  connector: null

  # Prefill and decode workers use the same environment.
  prefill_environment:
    UCX_TLS: "cuda_copy,rc"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  decode_environment:
    UCX_TLS: "cuda_copy,rc"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  vllm_config:
    # Prefill: tensor-parallel 1, data-parallel 2, expert parallel enabled.
    prefill:
      tensor-parallel-size: 1
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      trust-remote-code: true
      no-enable-prefix-caching: true
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      block-size: 128
      gpu-memory-utilization: 0.90
      max-model-len: 2304
      language-model-only: true
      stream-interval: 32
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048

    # Decode: tensor-parallel 8, expert parallel enabled.
    decode:
      tensor-parallel-size: 8
      enable-expert-parallel: true
      trust-remote-code: true
      no-enable-prefix-caching: true
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      block-size: 128
      gpu-memory-utilization: 0.90
      max-model-len: 2304
      language-model-only: true
      stream-interval: 32
      max-num-seqs: 4096
      max-num-batched-tokens: 16384
      max-cudagraph-capture-size: 8192

# 360 attempts at 10-second intervals.
health_check:
  max_attempts: 360
  interval_seconds: 10

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # "x"-separated concurrency values.
  concurrencies: "4x16x64x128x4096"
  req_rate: "inf"
Loading
Loading