SemiAnalysisAI · functionstackx · Jun 30, 2026 · Jun 30, 2026 · Jun 30, 2026 · claude
@@ -2669,10 +2669,15 @@ minimaxm3-fp4-mi355x-vllm:
 
 # EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the
 # amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft
-# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
-# FP4 sweep at extreme concurrency where speculative decoding loses value.
+# tokens.
+#
+# EP / dp-attn configs disable AITER fused MoE (incompatible with expert
+# parallelism) but keep the general AITER backend on, so MXFP4 weight dequant
+# uses AITER instead of the Quark path (quark.torch.kernel.mx), which fails in
+# the nightly pinned below (ModuleNotFoundError: torch.ao.quantization.pt2e).
+# See the EP branch in minimaxm3_fp4_mi355x_vllm_mtp.sh (run 28422097175, PR #1958).
 minimaxm3-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
@@ -2693,9 +2698,7 @@ minimaxm3-fp4-mi355x-vllm-mtp:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
 
 # MiniMax-M3 MXFP4 MI355X atom recipe:
 # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
@@ -5,6 +5,8 @@
 # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from
 # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD
 # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed.
+# MoE serving mirrors minimaxm3_fp4_mi355x_vllm.sh (AITER MoE, vllm#46419),
+# except AITER MoE is gated off when expert parallelism is enabled (see below).
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -37,6 +39,26 @@ SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
+# AITER MoE accelerates the non-EP MoE path but is incompatible with expert
+# parallelism, so disable AITER *fused MoE* when EP is in use (DP attention
+# enabled or EP_SIZE > 1). The general AITER backend stays enabled in that
+# case: it routes the MXFP4 weight dequant through AITER instead of the Quark
+# path (mxfp4_utils._dequant_mxfp4 -> `from quark.torch.kernel import mx`),
+# which fails in the pinned nightly (ModuleNotFoundError:
+# torch.ao.quantization.pt2e). Fully disabling AITER here would fall back to
+# that broken Quark dequant and crash engine-core startup on every EP config.
+# https://github.com/SemiAnalysisAI/InferenceX/pull/1955#discussion_r3495386866
+MOE_ARGS=()
+# General AITER backend is on in both branches; only fused MoE is gated on EP.
+export VLLM_ROCM_USE_AITER=1
+if [ "${DP_ATTENTION}" = "true" ] || [ "${EP_SIZE:-1}" -gt 1 ]; then
+    export VLLM_ROCM_USE_AITER_MOE=0
+else
+    export VLLM_ROCM_USE_AITER_MOE=1
+    export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
+    MOE_ARGS=(--moe-backend aiter)
+fi
+
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
 fi
@@ -65,6 +87,7 @@ vllm serve "$MODEL" --port "$PORT" \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
+    "${MOE_ARGS[@]}" \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -4323,3 +4323,12 @@
     - "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM STP: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter."
     - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954
+
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm-mtp
+  description:
+    - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter."
+    - "EP and DP-attention configs disable AITER fused MoE (VLLM_ROCM_USE_AITER_MOE=0) since AITER MoE is incompatible with expert parallelism (vLLM #46419), but keep the general AITER backend on (VLLM_ROCM_USE_AITER=1) so MXFP4 weight dequant uses AITER instead of the Quark path (mxfp4_utils._dequant_mxfp4), which is broken in this nightly (ModuleNotFoundError: torch.ao.quantization.pt2e)."
+    - "Drop EP and DP-attention search-space entries for 8k1k (those EP>1 points are off the Pareto curve); 1k1k keeps its EP and DP-attention coverage."
+    - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1958