diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f8bc53075..a47e0e665 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2827,6 +2827,56 @@ minimaxm3-fp8-mi355x-atom-disagg:
           additional-settings:
           - "DECODE_NODES=1"
 
+minimaxm3-fp4-mi355x-atom-disagg:
+  image: rocm/atom-dev:MiniMax-M3-20260622
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp4
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+
 # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
 # MI355X serving shape, but retain the default BF16 KV cache because this
 # checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 5ecb85ec2..ccc864030 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -200,7 +200,6 @@ INFO
 # rank 1 .. (NODE_OFFSET-1) -> remaining prefill nodes
 # rank NODE_OFFSET .. -> decode nodes
 # =============================================================================
-
 if [ "$NODE_RANK" -eq 0 ]; then
     # ──────────────────────────────────────────────────────────────────────────
     # Node 0: prefill server (producer) + atomesh router
diff --git a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
new file mode 100644
index 000000000..505f74319
--- /dev/null
+++ b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+#
+# Submit the MiniMax-M3 MXFP4 disaggregated prefill/decode benchmark for
+# MI355X through submit.sh in benchmarks/multi_node/amd_utils.
+#
+# All inputs are read from the environment (see check_env_vars below).
+# Prints the stdout of submit.sh on success; exits 1 if submission fails.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up launch script-specific environment variables
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH="$MODEL_PATH"
+export MODEL_NAME="$MODEL_NAME"
+export CONTAINER_IMAGE="$IMAGE"
+
+# Each *_ENABLE_EP flag is true only when the matching EP value is not 1
+# (an unset or empty EP value counts as 1).
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+# No MTP for MiniMax-M3
+export SPEC_DECODING="none"
+export DECODE_MTP_SIZE=0
+
+# Serving defaults; every value except MAX_MODEL_LEN can be overridden from
+# the caller's environment.
+export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}"
+export BLOCK_SIZE="${BLOCK_SIZE:-128}"
+export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}"
+export MAX_MODEL_LEN=32768
+export MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
+export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
+
+# Launch jobs based on ISL/OSL
+# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
+# expects the concurrencies.
+if ! JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \
+    "$PREFILL_NUM_WORKERS" \
+    "$DECODE_NODES" \
+    "$DECODE_NUM_WORKERS" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
+    "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
+    "$PREFILL_TP" "$DECODE_TP" \
+    "$RANDOM_RANGE_RATIO"); then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1651f1d10..296556d60 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4089,6 +4089,14 @@
     - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862
 
+- config-keys:
+    - minimaxm3-fp4-mi355x-atom-disagg
+  description:
+    - "Add minimaxm3-fp4-mi355x-atom-disagg CI script: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP4"
+    - "No MTP, KV_CACHE_DTYPE=auto (MXFP4 native, no fp8 override), MAX_MODEL_LEN=32768, MAX_NUM_BATCHED_TOKENS=32768"
+    - "server_atom.sh: conditional --kv_cache_dtype, MAX_MODEL_LEN/MAX_NUM_BATCHED_TOKENS/CUDAGRAPH_OPT support, syntax fixes"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1856
+
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description: