diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a3f352ada..301fd4c02 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2632,8 +2632,8 @@ minimaxm3-fp8-mi355x-atom-mtp: - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } minimaxm3-fp8-mi355x-atom-disagg: - image: rocm/atom-dev:MiniMax-M3-20260622 - model: amd/MiniMax-M3-MXFP8 + image: rocm/atom-dev:MiniMax-M3-20260623 + model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x-disagg precision: fp8 diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh index f2b906312..d84aca9af 100644 --- a/benchmarks/multi_node/amd_utils/env_atom.sh +++ b/benchmarks/multi_node/amd_utils/env_atom.sh @@ -32,13 +32,6 @@ else fi export IBDEVICES -export SAFETENSORS_FAST_GPU=1 -export VLLM_LOG_LEVEL=WARNING -export ATOM_LOG_LEVEL=WARNING -export AITER_LOG_LEVEL=WARNING -export LOG_LEVEL=WARNING -export LOGLEVEL=WARNING - # ============================================================================= # ATOM/mooncake-specific environment # ============================================================================= @@ -46,22 +39,17 @@ export LOGLEVEL=WARNING # mooncake RDMA KV transfer library path export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-} - -# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) +# faster model loading (safetensors only) +export SAFETENSORS_FAST_GPU=1 # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting) +export VLLM_LOG_LEVEL=WARNING +export ATOM_LOG_LEVEL=WARNING export AITER_LOG_LEVEL=WARNING - -if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - # ATOM MoE gather/scatter interleave optimization - export ATOM_MOE_GU_ITLV=1 - # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro) - export AITER_BF16_FP8_MOE_BOUND=0 -fi - -# Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf) -# No env var needed; documented here for reference. +export LOG_LEVEL=WARNING +export LOGLEVEL=WARNING set +x +# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake" diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 85771eeaa..620aaf6c6 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -1,4 +1,4 @@ -# Model-specific SGLang server configurations for disaggregated inference. +# Model-specific ATOM server configurations for disaggregated inference. # # Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR). # @@ -7,50 +7,34 @@ # # Schema: # : -# base_flags: str # Common flags for both prefill and decode -# mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0 -# dp_flags: str # Appended when DP is enabled (prefill or decode) -# prefill: -# mem_fraction_static: float -# disable_radix_cache: bool -# dp: # Config when data-parallel attention is enabled -# max_running_requests: int -# chunked_prefill_size: str # Can be integer or bash arithmetic expression -# cuda_graph_bs: str # Space-separated values -# no_dp: # Config when data-parallel attention is disabled -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str # "start-end" expanded via seq -# decode: -# mem_fraction_static: float -# prefill_round_robin_balance: bool -# dp: -# max_running_requests: int -# chunked_prefill_size: str -# cuda_graph_bs_range: str -# ep_only: # Config when EP is enabled but DP is disabled -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str -# no_dp: -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str +# env: str # Space-separated KEY=VALUE pairs exported unconditionally +# hf_overrides: str # JSON string passed to --hf-overrides +# tp_dp_flags: str # Parallel flags for TP+DPA case (must include --enable-dp-attention) +# tp_dp_env: str # Space-separated KEY=VALUE pairs exported only in TP+DPA mode +# ep_dp_flags: str # Parallel flags for EP+DPA case (must include --enable-expert-parallel --enable-dp-attention) +# ep_dp_env: str # Space-separated KEY=VALUE pairs exported only in EP+DPA mode +# mtp_flags: str # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens") +# kv_cache_flags: str # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none) DeepSeek-V4-Pro: - # ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS - # directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by - # server_atom.sh; they are kept here for documentation and potential future use. - base_flags: "" - mtp_flags: "" - dp_flags: "" + env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention --enable-tbo" + tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method mtp --num-speculative-tokens" + hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' MiniMax-M3-MXFP4: - base_flags: "" - mtp_flags: "" - dp_flags: "" + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" MiniMax-M3-MXFP8: - base_flags: "" - mtp_flags: "" - dp_flags: "" \ No newline at end of file + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 5ecb85ec2..ceab32f65 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -47,7 +47,6 @@ HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" # ATOM server tuning (from reference script defaults) MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" -KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" BLOCK_SIZE="${BLOCK_SIZE:-16}" MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" @@ -78,6 +77,24 @@ if [[ -z "$host_ip" ]]; then fi host_name=$(hostname) +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +# Load model-specific config from YAML (single parse for all fields) +eval "$(python3 -c " +import yaml +with open('${ATOM_WS_PATH}/models_atom.yaml') as f: + m = yaml.safe_load(f).get('${MODEL_NAME}', {}) +print(f'MODEL_ENVS=\"{m.get(\"env\", \"\")}\"') +print(f'MODEL_TP_DP_FLAGS=\"{m.get(\"tp_dp_flags\", \"\")}\"') +print(f'MODEL_EP_DP_FLAGS=\"{m.get(\"ep_dp_flags\", \"\")}\"') +print(f'MODEL_TP_DP_ENV=\"{m.get(\"tp_dp_env\", \"\")}\"') +print(f'MODEL_EP_DP_ENV=\"{m.get(\"ep_dp_env\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_KV_ARG=\"{m.get(\"kv_cache_flags\", \"\")}\"') +print(f'_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"') +")" + # ============================================================================= # Cluster Topology Configuration # ============================================================================= @@ -114,53 +131,48 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" # Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$PREFILL_ENABLE_DP" = "true" ]; then - if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #TP+DPA+TBO - if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) - export GPU_MAX_HW_QUEUES=5 - export ATOM_CPU_AFFINITY=1 - else #TP+DPA - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) - fi + if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done + else #TP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done fi -fi +fi -# (srok), split DPA & TBO cases -DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP +DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then - if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #TP+DPA+TBO - if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo ) - export GPU_MAX_HW_QUEUES=5 - export ATOM_CPU_AFFINITY=1 - else #TP+DPA - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) - fi + if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done + else #TP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done fi -fi - -# MTP args -SPEC_ARGS=() #TP -if [ "$SPEC_DECODING" = "mtp" ]; then - SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") fi +unset _dp_env_pair # HF overrides (single-quoted JSON preserved through eval) HF_OVERRIDES_ARG="" -if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" +if [[ -n "$_HF_OVERRIDES" ]]; then + HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'" fi +unset _HF_OVERRIDES + +for _env_pair in ${MODEL_ENVS}; do + export "$_env_pair" +done +unset _env_pair -# KV cache dtype (skip if unset or 'auto') -KV_CACHE_ARG="" -if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then - KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}" +# MTP args +SPEC_ARGS=() +if [ "$SPEC_DECODING" = "mtp" ]; then + SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE") fi +# KV cache arg - full flag string from YAML +KV_CACHE_ARG="${MODEL_KV_ARG}" + # Optional model length / batched-token cap MODEL_LEN_ARGS="" if [[ -n "$MAX_MODEL_LEN" ]]; then @@ -170,9 +182,6 @@ if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}" fi -if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then - export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -fi cat < rocm/atom-dev:MiniMax-M3-20260623" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1930 - config-keys: - dsv4-fp4-b200-dynamo-vllm