SemiAnalysisAI · functionstackx · Jun 26, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
@@ -2827,6 +2827,57 @@ minimaxm3-fp8-mi355x-atom-disagg:
           additional-settings:
           - "DECODE_NODES=1"
 
+minimaxm3-fp4-mi355x-atom-disagg:
+  image: rocm/atom-dev:MiniMax-M3-20260622
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp4
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4: one prefill worker and one decode worker, each TP4 with one node per role
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+    # 1k/1k: same 1P1D TP4 search space at ISL 1024 / OSL 1024
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP4: one prefill worker and one decode worker, each TP4 with one node per role
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+
 # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
 # MI355X serving shape, but retain the default BF16 KV cache because this
 # checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -200,7 +200,6 @@ INFO
 #   rank 1 .. (NODE_OFFSET-1)       -> remaining prefill nodes
 #   rank NODE_OFFSET ..             -> decode nodes
 # =============================================================================
-
 if [ "$NODE_RANK" -eq 0 ]; then
     # ──────────────────────────────────────────────────────────────────────────
     # Node 0: prefill server (producer) + atomesh router

diff --git a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+# Submit the MiniMax-M3 FP4 ATOM disaggregated (prefill/decode) benchmark job via submit.sh.
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Exported for submit.sh and the ATOM launch scripts it drives (presumably; consumers not shown here)
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Map EP sizes and DP-attention flags onto the true/false switches passed to submit.sh.
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+# No MTP for MiniMax-M3
+export SPEC_DECODING="none"
+export DECODE_MTP_SIZE=0
+
+# Serving knobs. MAX_MODEL_LEN is pinned here; every other knob keeps a value
+# already present in the environment and otherwise falls back to the default shown.
+export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}"
+export BLOCK_SIZE="${BLOCK_SIZE:-128}"
+export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}"
+export MAX_MODEL_LEN=32768
+export MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
+export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
+
+# Launch jobs based on ISL/OSL.
+# Replace ' ' in CONC_LIST with 'x' so the concurrency list is passed as numbers
+# delimited by 'x', which is the form the underlying launch script expects.
+# Arguments are quoted so each value reaches submit.sh as exactly one positional parameter.
+if ! JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \
+    "$PREFILL_NUM_WORKERS" \
+    "$DECODE_NODES" \
+    "$DECODE_NUM_WORKERS" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
+    "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
+    "$PREFILL_TP" "$DECODE_TP" \
+    "$RANDOM_RANGE_RATIO"); then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -4089,6 +4089,14 @@
     - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862
 
+- config-keys:
+    - minimaxm3-fp4-mi355x-atom-disagg
+  description:
+    - "Add minimaxm3-fp4-mi355x-atom-disagg CI script: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP4"
+    - "No MTP, KV_CACHE_DTYPE=auto (MXFP4 native, no fp8 override), MAX_MODEL_LEN=32768, MAX_NUM_BATCHED_TOKENS=32768"
+    - "server_atom.sh: conditional --kv_cache_dtype, MAX_MODEL_LEN/MAX_NUM_BATCHED_TOKENS/CUDAGRAPH_OPT support, syntax fixes"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1856
+
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description: