From 9bacb24e44960c6647897c431d7104c05529a484 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 25 Jun 2026 18:52:06 +0800 Subject: [PATCH 1/4] feat: add MiniMax-M3 NVFP4 B300 Dynamo vLLM --- .github/configs/nvidia-master.yaml | 215 ++++++++++++++++++ .../configs/minimax-m3-nvfp4-vllm-fixes.sh | 35 +++ .../b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml | 79 +++++++ .../b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml | 80 +++++++ .../b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml | 81 +++++++ .../b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml | 81 +++++++ .../b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml | 79 +++++++ .../b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml | 79 +++++++ .../b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml | 79 +++++++ .../b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml | 84 +++++++ .../b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml | 83 +++++++ .../b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml | 83 +++++++ .../b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml | 83 +++++++ .../b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml | 85 +++++++ .../b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml | 83 +++++++ .../b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml | 85 +++++++ .../b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml | 83 +++++++ perf-changelog.yaml | 9 + runners/launch_b300-nv.sh | 13 +- 19 files changed, 1496 insertions(+), 3 deletions(-) create mode 100755 benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f4d70f977..5978874be 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11955,6 +11955,221 @@ minimaxm3-fp8-b300-dynamo-vllm: ep: 8 dp-attn: false +# MiniMax-M3 NVFP4 disagg sweep on the same B300 topology matrix as the MXFP8 +# baseline above. vLLM PR #46380 is overlaid by the runner setup script. 
+minimaxm3-fp4-b300-dynamo-vllm: + image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223 + model: nvidia/MiniMax-M3-NVFP4 + model-prefix: minimaxm3 + runner: b300 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 16, 64, 128, 4096] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [2048] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [512, 4096] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml" + decode: 
+ num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [256, 512] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml" + 
decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + # MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863. # All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8, # DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped. diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh new file mode 100755 index 000000000..27a954008 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +# MiniMax-M3 ModelOpt NVFP4 support from vllm-project/vllm#46380. +VLLM_NVFP4_COMMIT="6c08558112acd2fd8b4bfc270104d556eb77f9bf" +VLLM_ROOT="$(python3 -c 'import os, vllm; print(os.path.dirname(vllm.__file__))')" +for file in \ + model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py \ + model_executor/layers/quantization/modelopt.py \ + model_executor/layers/quantization/utils/flashinfer_utils.py +do + curl -fsSL \ + "https://raw.githubusercontent.com/vllm-project/vllm/${VLLM_NVFP4_COMMIT}/vllm/${file}" \ + -o "${VLLM_ROOT}/${file}" +done + +python3 - <<'PYEOF' +from importlib.util import find_spec +from pathlib import Path + +spec = find_spec("vllm") +if not spec or not spec.origin: + raise RuntimeError("vllm is not installed") +path = Path(spec.origin).parent / "models/minimax_m3/nvidia/sparse_attention_msa.py" +source = path.read_text() +old = " prefill_topk = topk[:, nd:num_tokens, :]\n" +new = " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n" +if new not in source: + if source.count(old) != 1: + raise RuntimeError(f"missing or ambiguous patch anchor in {path}") + path.write_text(source.replace(old, new, 1)) +PYEOF + +python3 -c \ + "from vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe import TrtLlmNvFp4ExpertsModular; print('[nvfp4-patch] OK')" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..0c030cecc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + 
max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x16x64x128x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml new file mode 100644 index 000000000..20dc0494f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tp4-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + UCX_TLS: "cuda_ipc,cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_ipc,cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: false + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 
+ gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml new file mode 100644 index 000000000..5850e8910 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp4-dep2-dep4-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 4 + 
data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml new file mode 100644 index 000000000..b60a55f24 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp4-dep2-dep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + 
max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..eaf9f3baf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + 
enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..c7e6d5f07 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + 
VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..494c9c2b5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + 
+ prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml new file mode 100644 index 000000000..17dd9e876 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml @@ -0,0 +1,84 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tp4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true 
+ version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_ipc,cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_ipc,cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: false + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..6ebd1a963 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp4-dep2-tep4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 
+ interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..5668312cc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp4-dep2-tep4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", 
"kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..bbafa006c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp4-dep2-tep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": 
"fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..5f71f2496 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-dep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: 
"cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..8a6f41f56 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-tep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + 
gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml new file mode 100644 
index 000000000..fd7b98935 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp4-dep2-dep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 # 
Per DP rank: 2 workers x DP8 = 16 ranks. + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..c8d5913d8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp4-dep2-tep4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + 
enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c45a352a..65d5b3d1e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4190,3 +4190,12 @@ - "Update the DeepSeek-V4-Pro B200 disaggregated Dynamo-vLLM benchmark to the vllm/vllm-openai:v0.23.0 image" - "Lower max-num-batched-tokens to 16384 and gpu-memory-utilization to 0.9 on the high-throughput and max-throughput recipes to avoid OOM" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1899 + +- config-keys: + - minimaxm3-fp4-b300-dynamo-vllm + description: + - "Add MiniMax-M3 NVFP4 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP (no MTP)" + - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode" + - "Overlay vllm-project/vllm#46380 at commit 6c08558112acd2fd8b4bfc270104d556eb77f9bf and retain the MiniMax-M3 MSA contiguous top-k fix" + pr-link: XXX diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index b4802b049..a14d739a4 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -47,11 +47,14 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION 
== "fp4" && $FRAMEWORK == " elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/data/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" +elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp4" && $FRAMEWORK == "dynamo-vllm" ]]; then + export MODEL_PATH="/scratch/models/MiniMax-M3-NVFP4" + export SRT_SLURM_MODEL_PREFIX="nvidia/MiniMax-M3-NVFP4" elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/data/models/MiniMax-M3-MXFP8" export SRT_SLURM_MODEL_PREFIX="MiniMaxAI/MiniMax-M3-MXFP8" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp4 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm" exit 1 fi @@ -85,13 +88,17 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS git checkout main mkdir -p recipes/vllm/minimax-m2.5-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8" recipes/vllm/minimax-m2.5-fp8 -elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && ( $PRECISION == "fp4" || $PRECISION == "fp8" ) ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 - SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh" + if [[ $PRECISION == "fp4" 
]]; then + SRTCTL_SETUP_SCRIPT="minimax-m3-nvfp4-vllm-fixes.sh" + else + SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh" + fi # NVIDIA/srt-slurm#38 git show 22d46ba9971615016d2339c9ffbc7b4597accfad --format= -- src/srtctl/core/ip_utils/get_node_ip.sh | git apply - || exit 1 cp \ From 75f3c54e67b8687afb90a16859d4514ef003a9ed Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 25 Jun 2026 18:52:31 +0800 Subject: [PATCH 2/4] chore: link PR 1931 in performance changelog --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 65d5b3d1e..bc7042ea7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4198,4 +4198,4 @@ - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode" - "Overlay vllm-project/vllm#46380 at commit 6c08558112acd2fd8b4bfc270104d556eb77f9bf and retain the MiniMax-M3 MSA contiguous top-k fix" - pr-link: XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931 From ab22a9067bb62d81998449e8e60c6e69d50fc3a9 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 25 Jun 2026 21:50:04 +0800 Subject: [PATCH 3/4] fix: disable FlashInfer autotune for MiniMax-M3 B300 --- .../vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml | 2 ++ 
.../vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml | 2 ++ 15 files changed, 30 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml index 0c030cecc..213b650aa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -53,6 +54,7 @@ backend: max-num-batched-tokens: 2048 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml index 20dc0494f..df131dd7b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml @@ -38,6 +38,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -54,6 +55,7 @@ backend: max-num-batched-tokens: 
2048 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 4 enable-expert-parallel: false trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml index 5850e8910..8e550297c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -53,6 +54,7 @@ backend: max-num-batched-tokens: 2048 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 4 data-parallel-rpc-port: 13345 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml index b60a55f24..1e54eda1b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -53,6 +54,7 @@ backend: max-num-batched-tokens: 2048 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml index eaf9f3baf..24155688d 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -53,6 +54,7 @@ backend: max-num-batched-tokens: 2048 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml index c7e6d5f07..a283f08a5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -53,6 +54,7 @@ backend: max-num-batched-tokens: 2048 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml index 494c9c2b5..d9de76e30 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -53,6 +54,7 @@ backend: max-num-batched-tokens: 2048 decode: + 
no-enable-flashinfer-autotune: true tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml index 17dd9e876..ae2eca8b6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml @@ -38,6 +38,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -56,6 +57,7 @@ backend: max-num-batched-tokens: 16384 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 4 enable-expert-parallel: false trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml index 6ebd1a963..52e66e20b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -55,6 +56,7 @@ backend: max-num-batched-tokens: 16384 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 4 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml index 5668312cc..2c24d5206 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -55,6 +56,7 @@ backend: max-num-batched-tokens: 16384 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 4 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml index bbafa006c..67872b436 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -55,6 +56,7 @@ backend: max-num-batched-tokens: 16384 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml index 5f71f2496..fcf641f4e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -55,6 +56,7 @@ backend: max-num-batched-tokens: 16384 decode: + 
no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml index 8a6f41f56..b16b7b7c0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -55,6 +56,7 @@ backend: max-num-batched-tokens: 16384 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml index fd7b98935..7c1ad991b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -55,6 +56,7 @@ backend: max-num-batched-tokens: 16384 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml index c8d5913d8..674bbceaf 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml @@ -37,6 +37,7 @@ backend: vllm_config: prefill: + no-enable-flashinfer-autotune: true tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -55,6 +56,7 @@ backend: max-num-batched-tokens: 16384 decode: + no-enable-flashinfer-autotune: true tensor-parallel-size: 4 enable-expert-parallel: true trust-remote-code: true From 0297969d530b98f65a0f908d29af80d89025ba2d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 26 Jun 2026 10:50:57 +0800 Subject: [PATCH 4/4] fix: use updated MiniMax-M3 NVFP4 image --- .github/configs/nvidia-master.yaml | 5 +-- .../configs/minimax-m3-nvfp4-vllm-fixes.sh | 35 ------------------- .../b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml | 2 +- .../b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml | 2 +- .../b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml | 2 +- .../b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml | 2 +- .../b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml | 2 +- .../b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml | 2 +- .../b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml | 2 +- .../b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml | 2 +- .../b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml | 2 +- .../b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml | 2 +- .../b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml | 2 +- .../b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml | 2 +- .../b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml | 2 +- .../b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml | 2 +- .../b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml | 2 +- perf-changelog.yaml | 3 +- runners/launch_b300-nv.sh | 12 +++---- 19 files changed, 25 insertions(+), 60 deletions(-) delete mode 100755 benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5978874be..9eb019a18 100644 --- 
a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11956,9 +11956,10 @@ minimaxm3-fp8-b300-dynamo-vllm: dp-attn: false # MiniMax-M3 NVFP4 disagg sweep on the same B300 topology matrix as the MXFP8 -# baseline above. vLLM PR #46380 is overlaid by the runner setup script. +# baseline above. The image includes vLLM PR #46380, so no runtime patch is +# needed. minimaxm3-fp4-b300-dynamo-vllm: - image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223 + image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 model: nvidia/MiniMax-M3-NVFP4 model-prefix: minimaxm3 runner: b300 diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh deleted file mode 100755 index 27a954008..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-nvfp4-vllm-fixes.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# MiniMax-M3 ModelOpt NVFP4 support from vllm-project/vllm#46380. 
-VLLM_NVFP4_COMMIT="6c08558112acd2fd8b4bfc270104d556eb77f9bf" -VLLM_ROOT="$(python3 -c 'import os, vllm; print(os.path.dirname(vllm.__file__))')" -for file in \ - model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py \ - model_executor/layers/quantization/modelopt.py \ - model_executor/layers/quantization/utils/flashinfer_utils.py -do - curl -fsSL \ - "https://raw.githubusercontent.com/vllm-project/vllm/${VLLM_NVFP4_COMMIT}/vllm/${file}" \ - -o "${VLLM_ROOT}/${file}" -done - -python3 - <<'PYEOF' -from importlib.util import find_spec -from pathlib import Path - -spec = find_spec("vllm") -if not spec or not spec.origin: - raise RuntimeError("vllm is not installed") -path = Path(spec.origin).parent / "models/minimax_m3/nvidia/sparse_attention_msa.py" -source = path.read_text() -old = " prefill_topk = topk[:, nd:num_tokens, :]\n" -new = " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n" -if new not in source: - if source.count(old) != 1: - raise RuntimeError(f"missing or ambiguous patch anchor in {path}") - path.write_text(source.replace(old, new, 1)) -PYEOF - -python3 -c \ - "from vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe import TrtLlmNvFp4ExpertsModular; print('[nvfp4-patch] OK')" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml index 213b650aa..486af0557 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tep8-1k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml index df131dd7b..532b78a10 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tp4-1k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml index 8e550297c..fde8442a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p2d-fp4-dep2-dep4-1k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml index 1e54eda1b..ed3b5f995 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p1d-fp4-dep2-dep8-1k1k" model: path: 
"nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml index 24155688d..0784283b9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p1d-fp4-dep2-tep8-1k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml index a283f08a5..59c52da00 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-tep8-1k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml index d9de76e30..7e9f7dec3 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-3p2d-fp4-dep2-tep8-1k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml index ae2eca8b6..be2683d0c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tp4-8k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml index 52e66e20b..5be198f11 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p2d-fp4-dep2-tep4-8k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml index 2c24d5206..90d688f61 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p4d-fp4-dep2-tep4-8k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml index 67872b436..215474282 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p4d-fp4-dep2-tep8-8k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml index fcf641f4e..c49fd1ccb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-dep8-8k1k" model: path: 
"nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml index b16b7b7c0..1b8dfd627 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-tep8-8k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml index 7c1ad991b..73473aac9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-4p2d-fp4-dep2-dep8-8k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml index 674bbceaf..23c99d328 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-4p2d-fp4-dep2-tep4-8k1k" model: path: "nvidia/MiniMax-M3-NVFP4" - container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" precision: "fp4" resources: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index bc7042ea7..d3aae6aa5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4195,7 +4195,6 @@ - minimaxm3-fp4-b300-dynamo-vllm description: - "Add MiniMax-M3 NVFP4 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP (no MTP)" - - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223" + - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed" - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode" - - "Overlay vllm-project/vllm#46380 at commit 6c08558112acd2fd8b4bfc270104d556eb77f9bf and retain the MiniMax-M3 MSA contiguous top-k fix" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index a14d739a4..65275b232 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -94,16 +94,16 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && ( $PRECIS git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 - if [[ $PRECISION == "fp4" ]]; then - 
SRTCTL_SETUP_SCRIPT="minimax-m3-nvfp4-vllm-fixes.sh" - else + if [[ $PRECISION == "fp8" ]]; then SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh" fi # NVIDIA/srt-slurm#38 git show 22d46ba9971615016d2339c9ffbc7b4597accfad --format= -- src/srtctl/core/ip_utils/get_node_ip.sh | git apply - || exit 1 - cp \ - "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ - "configs/$SRTCTL_SETUP_SCRIPT" + if [[ -n "${SRTCTL_SETUP_SCRIPT:-}" ]]; then + cp \ + "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ + "configs/$SRTCTL_SETUP_SCRIPT" + fi else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1