Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2632,8 +2632,8 @@ minimaxm3-fp8-mi355x-atom-mtp:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }

minimaxm3-fp8-mi355x-atom-disagg:
image: rocm/atom-dev:MiniMax-M3-20260622
model: amd/MiniMax-M3-MXFP8
image: rocm/atom-dev:MiniMax-M3-20260623
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x-disagg
precision: fp8
Expand Down
26 changes: 7 additions & 19 deletions benchmarks/multi_node/amd_utils/env_atom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,36 +32,24 @@ else
fi
export IBDEVICES

export SAFETENSORS_FAST_GPU=1
export VLLM_LOG_LEVEL=WARNING
export ATOM_LOG_LEVEL=WARNING
export AITER_LOG_LEVEL=WARNING
export LOG_LEVEL=WARNING
export LOGLEVEL=WARNING

# =============================================================================
# ATOM/mooncake-specific environment
# =============================================================================

# mooncake RDMA KV transfer library path
export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-}


# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
# faster model loading (safetensors only)
export SAFETENSORS_FAST_GPU=1

# aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting)
export VLLM_LOG_LEVEL=WARNING
export ATOM_LOG_LEVEL=WARNING
export AITER_LOG_LEVEL=WARNING

if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
# ATOM MoE gather/scatter interleave optimization
export ATOM_MOE_GU_ITLV=1
# Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro)
export AITER_BF16_FP8_MOE_BOUND=0
fi

# Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf)
# No env var needed; documented here for reference.
export LOG_LEVEL=WARNING
export LOGLEVEL=WARNING

set +x

# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake"
68 changes: 26 additions & 42 deletions benchmarks/multi_node/amd_utils/models_atom.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Model-specific SGLang server configurations for disaggregated inference.
# Model-specific ATOM server configurations for disaggregated inference.
#
# Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR).
#
Expand All @@ -7,50 +7,34 @@
#
# Schema:
# <model-name>:
# base_flags: str # Common flags for both prefill and decode
# mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0
# dp_flags: str # Appended when DP is enabled (prefill or decode)
# prefill:
# mem_fraction_static: float
# disable_radix_cache: bool
# dp: # Config when data-parallel attention is enabled
# max_running_requests: int
# chunked_prefill_size: str # Can be integer or bash arithmetic expression
# cuda_graph_bs: str # Space-separated values
# no_dp: # Config when data-parallel attention is disabled
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str # "start-end" expanded via seq
# decode:
# mem_fraction_static: float
# prefill_round_robin_balance: bool
# dp:
# max_running_requests: int
# chunked_prefill_size: str
# cuda_graph_bs_range: str
# ep_only: # Config when EP is enabled but DP is disabled
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str
# no_dp:
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str
# env: str # Space-separated KEY=VALUE pairs exported unconditionally
# hf_overrides: str # JSON string passed to --hf-overrides
# tp_dp_flags: str # Parallel flags for TP+DPA case (must include --enable-dp-attention)
# tp_dp_env: str # Space-separated KEY=VALUE pairs exported only in TP+DPA mode
# ep_dp_flags: str # Parallel flags for EP+DPA case (must include --enable-expert-parallel --enable-dp-attention)
# ep_dp_env: str # Space-separated KEY=VALUE pairs exported only in EP+DPA mode
# mtp_flags: str # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens")
# kv_cache_flags: str # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none)

DeepSeek-V4-Pro:
# ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS
# directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by
# server_atom.sh; they are kept here for documentation and potential future use.
base_flags: ""
mtp_flags: ""
dp_flags: ""
env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0"
kv_cache_flags: "--kv_cache_dtype fp8"
tp_dp_flags: "--enable-dp-attention --enable-tbo"
tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1"
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
mtp_flags: "--method mtp --num-speculative-tokens"
hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'

MiniMax-M3-MXFP4:
base_flags: ""
mtp_flags: ""
dp_flags: ""
env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
kv_cache_flags: "--kv_cache_dtype fp8"
tp_dp_flags: "--enable-dp-attention"
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"

MiniMax-M3-MXFP8:
base_flags: ""
mtp_flags: ""
dp_flags: ""
env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
kv_cache_flags: "--kv_cache_dtype fp8"
tp_dp_flags: "--enable-dp-attention"
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
Comment thread
seungrokj marked this conversation as resolved.
89 changes: 49 additions & 40 deletions benchmarks/multi_node/amd_utils/server_atom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"

# ATOM server tuning (from reference script defaults)
MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}"
KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
BLOCK_SIZE="${BLOCK_SIZE:-16}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
Expand Down Expand Up @@ -78,6 +77,24 @@ if [[ -z "$host_ip" ]]; then
fi
host_name=$(hostname)

# =============================================================================
# Model-Specific Configuration from YAML
# =============================================================================
# Load model-specific config from YAML (single parse for all fields)
# Emit one shell assignment per YAML field, then eval the result.
# - The YAML path and model name are passed as argv instead of being spliced
#   into the Python source, so quotes or spaces in them cannot break the parse.
# - Every value is escaped with shlex.quote so eval reproduces it verbatim.
#   Wrapping values in plain double quotes stripped the inner quotes from JSON
#   values such as hf_overrides (yielding invalid JSON) and let eval expand
#   any dollar sign or backtick present in the YAML.
# A model with no entry in the YAML yields an empty string for every field.
# If the parse fails, nothing is assigned (as before) but a warning is logged.
if _model_cfg_assignments="$(python3 -c '
import shlex
import sys

import yaml

cfg_path, model_name = sys.argv[1], sys.argv[2]
with open(cfg_path) as f:
    models = yaml.safe_load(f) or {}
m = models.get(model_name) or {}

# (shell variable, YAML key) pairs; order matches the YAML schema comment.
fields = (
    ("MODEL_ENVS", "env"),
    ("MODEL_TP_DP_FLAGS", "tp_dp_flags"),
    ("MODEL_EP_DP_FLAGS", "ep_dp_flags"),
    ("MODEL_TP_DP_ENV", "tp_dp_env"),
    ("MODEL_EP_DP_ENV", "ep_dp_env"),
    ("MODEL_MTP_FLAGS", "mtp_flags"),
    ("MODEL_KV_ARG", "kv_cache_flags"),
    ("_HF_OVERRIDES", "hf_overrides"),
)
for var, key in fields:
    print(var + "=" + shlex.quote(str(m.get(key, ""))))
' "${ATOM_WS_PATH}/models_atom.yaml" "${MODEL_NAME}")"; then
  eval "$_model_cfg_assignments"
else
  echo "[WARN] could not load model config for '${MODEL_NAME}' from ${ATOM_WS_PATH}/models_atom.yaml" >&2
fi
unset _model_cfg_assignments

# =============================================================================
# Cluster Topology Configuration
# =============================================================================
Expand Down Expand Up @@ -114,53 +131,48 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
# Parallel args
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
if [ "$PREFILL_ENABLE_DP" = "true" ]; then
if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
else #TP+DPA+TBO
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo )
export GPU_MAX_HW_QUEUES=5
export ATOM_CPU_AFFINITY=1
else #TP+DPA
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
fi
if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS})
for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
else #TP+DPA
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS})
for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
fi
fi
fi

# (srok), split DPA & TBO cases
DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
if [ "$DECODE_ENABLE_DP" = "true" ]; then
if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
else #TP+DPA+TBO
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo )
export GPU_MAX_HW_QUEUES=5
export ATOM_CPU_AFFINITY=1
else #TP+DPA
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
fi
if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS})
for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
else #TP+DPA
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS})
for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
fi
fi

# MTP args
SPEC_ARGS=() #TP
if [ "$SPEC_DECODING" = "mtp" ]; then
SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
fi
unset _dp_env_pair

# HF overrides (single-quoted JSON preserved through eval)
HF_OVERRIDES_ARG=""
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
if [[ -n "$_HF_OVERRIDES" ]]; then
HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'"
fi
unset _HF_OVERRIDES

for _env_pair in ${MODEL_ENVS}; do
export "$_env_pair"
done
unset _env_pair

# KV cache dtype (skip if unset or 'auto')
KV_CACHE_ARG=""
if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then
KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}"
# MTP args
SPEC_ARGS=()
if [ "$SPEC_DECODING" = "mtp" ]; then
SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE")
fi

# KV cache arg - full flag string from YAML
KV_CACHE_ARG="${MODEL_KV_ARG}"

# Optional model length / batched-token cap
MODEL_LEN_ARGS=""
if [[ -n "$MAX_MODEL_LEN" ]]; then
Expand All @@ -170,9 +182,6 @@ if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then
MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"
fi

if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
fi

cat <<INFO
=== Configuration ===
Expand All @@ -183,7 +192,7 @@ MODEL : ${MODEL_NAME}
BACKEND : atom (PD mooncake KV transfer)
MTP : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE}
xP/yD : ${xP} / ${yD}
KV cache : dtype=${KV_CACHE_DTYPE:-auto} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
KV cache : ${KV_CACHE_ARG:-none} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
Comment thread
seungrokj marked this conversation as resolved.
Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NUM_BATCHED_TOKENS:-unset}
Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
Decode args : ${DECODE_PARALLEL_ARGS[*]}
Expand Down
1 change: 0 additions & 1 deletion benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ export SPEC_DECODING="none"
export DECODE_MTP_SIZE=0

# Block size 128
export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}"
export BLOCK_SIZE="${BLOCK_SIZE:-128}"
export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}"
export MAX_MODEL_LEN=32768
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4183,6 +4183,14 @@
- "server_atom.sh: fix _MAX_CONC assignment before cudagraph size check; gate ATOM_MOE_GU_ITLV/AITER_BF16_FP8_MOE_BOUND on DeepSeek-V4-Pro only"
- "Search space: ISL=8192 and ISL=1024, 1P1D TP4, conc 1-512"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1927

- config-keys:
- minimaxm3-fp8-mi355x-atom-disagg
description:
- "Refactor server_atom.sh: eliminate all hardcoded model-name checks; drive all model-specific config (env vars, parallel flags, MTP flags, KV cache flags, HF overrides) from models_atom.yaml"
- "models_atom.yaml: add MiniMax-M3-MXFP4 and MiniMax-M3-MXFP8 entries with EAGLE3 MTP flags; add DeepSeek-V4-Pro with TBO/cpu-affinity TP+DPA env and MTP flags; add tp_dp_flags, ep_dp_flags, tp_dp_env, ep_dp_env, kv_cache_flags, mtp_flags, hf_overrides fields"
- "Image bump for minimaxm3-fp8-mi355x-atom-disagg: rocm/atom-dev:MiniMax-M3-20260622 -> rocm/atom-dev:MiniMax-M3-20260623"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1930

- config-keys:
- dsv4-fp4-b200-dynamo-vllm
Expand Down
Loading