diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index cbfc09f81..995e1717f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2669,10 +2669,15 @@ minimaxm3-fp4-mi355x-vllm: # EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the # amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft -# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base -# FP4 sweep at extreme concurrency where speculative decoding loses value. +# tokens. +# +# EP / dp-attn configs disable AITER fused MoE (incompatible with expert +# parallelism) but keep the general AITER backend on so MXFP4 weight dequant +# uses AITER instead of the Quark path (quark.torch.kernel.mx), which is broken +# in the current nightly (torch.ao.quantization.pt2e removed). See the EP branch +# in minimaxm3_fp4_mi355x_vllm_mtp.sh and run 28422097175 (PR #1958). minimaxm3-fp4-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2693,9 +2698,7 @@ minimaxm3-fp4-mi355x-vllm-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } # MiniMax-M3 MXFP4 MI355X atom recipe: # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh index 96a560493..591eefba5 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh +++ 
b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh @@ -5,6 +5,8 @@ # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed. +# MoE serving mirrors minimaxm3_fp4_mi355x_vllm.sh (AITER MoE, vllm#46419), +# except AITER MoE is gated off when expert parallelism is enabled (see below). source "$(dirname "$0")/../../benchmark_lib.sh" @@ -37,6 +39,26 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +# AITER MoE accelerates the dense (non-EP) MoE path but is incompatible with +# expert parallelism, so disable AITER *fused MoE* when EP is enabled (DP +# attention or EP > 1). We still keep the general AITER backend enabled in that +# case: it routes the MXFP4 weight dequant through AITER instead of the Quark +# path (mxfp4_utils._dequant_mxfp4 -> `from quark.torch.kernel import mx`), +# which is broken in the current nightly (ModuleNotFoundError: +# torch.ao.quantization.pt2e). Fully disabling AITER here would fall back to +# that broken Quark dequant and crash engine-core startup on every EP config. 
+# https://github.com/SemiAnalysisAI/InferenceX/pull/1955#discussion_r3495386866 +MOE_ARGS=() +if [ "${DP_ATTENTION}" = "true" ] || [ "${EP_SIZE:-1}" -gt 1 ]; then + export VLLM_ROCM_USE_AITER=1 + export VLLM_ROCM_USE_AITER_MOE=0 +else + export VLLM_ROCM_USE_AITER=1 + export VLLM_ROCM_USE_AITER_MOE=1 + export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 + MOE_ARGS=(--moe-backend aiter) +fi + if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context fi @@ -65,6 +87,7 @@ vllm serve "$MODEL" --port "$PORT" \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ + "${MOE_ARGS[@]}" \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --enable-auto-tool-choice \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3ef8c37db..f98722fee 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4323,3 +4323,12 @@ - "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM STP: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter." - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954 + +- config-keys: + - minimaxm3-fp4-mi355x-vllm-mtp + description: + - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter." 
+ - "EP and DP-attention configs disable AITER fused MoE (VLLM_ROCM_USE_AITER_MOE=0) since AITER MoE is incompatible with expert parallelism (vLLM #46419), but keep the general AITER backend on (VLLM_ROCM_USE_AITER=1) so MXFP4 weight dequant uses AITER instead of the Quark path (mxfp4_utils._dequant_mxfp4), which is broken in this nightly (ModuleNotFoundError: torch.ao.quantization.pt2e)." + - "Drop EP and DP-attention search-space entries for 8k1k (those EP>1 points are off the Pareto curve); 1k1k keeps its EP and DP-attention coverage." + - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1958