Skip to content

Commit 79db8ec

Browse files
committed
Merge branch 'main' into eval-8k1k-server-restart
# Conflicts: # benchmarks/single_node/kimik2.5_fp4_mi355x.sh # perf-changelog.yaml # runners/launch_mi300x-amd.sh # runners/launch_mi300x-cr.sh
2 parents bec4dba + da23bb4 commit 79db8ec

File tree

7 files changed

+55
-58
lines changed

7 files changed

+55
-58
lines changed

.github/configs/amd-master.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ glm5-fp8-mi355x-sglang:
294294
- { tp: 8, conc-start: 4, conc-end: 64 }
295295

296296
kimik2.5-int4-mi355x-vllm:
297-
image: vllm/vllm-openai-rocm:v0.15.1
297+
image: vllm/vllm-openai-rocm:v0.18.0
298298
model: moonshotai/Kimi-K2.5
299299
model-prefix: kimik2.5
300300
runner: mi355x
@@ -338,7 +338,7 @@ kimik2.5-int4-mi325x-vllm:
338338
- { tp: 8, conc-start: 4, conc-end: 64 }
339339

340340
kimik2.5-fp4-mi355x-vllm:
341-
image: vllm/vllm-openai-rocm:v0.16.0
341+
image: vllm/vllm-openai-rocm:v0.18.0
342342
model: amd/Kimi-K2.5-MXFP4
343343
model-prefix: kimik2.5
344344
runner: mi355x
@@ -350,14 +350,18 @@ kimik2.5-fp4-mi355x-vllm:
350350
osl: 1024
351351
search-space:
352352
- { tp: 8, conc-start: 4, conc-end: 64 }
353+
- { tp: 4, conc-start: 4, conc-end: 64 }
353354
- isl: 1024
354355
osl: 8192
355356
search-space:
356357
- { tp: 8, conc-start: 4, conc-end: 64 }
358+
- { tp: 4, conc-start: 4, conc-end: 64 }
359+
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
357360
- isl: 8192
358361
osl: 1024
359362
search-space:
360363
- { tp: 8, conc-start: 4, conc-end: 64 }
364+
- { tp: 4, conc-start: 4, conc-end: 64 }
361365

362366
minimaxm2.5-fp8-mi355x-vllm:
363367
image: vllm/vllm-openai-rocm:v0.18.0

.github/configs/runners.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,6 @@ b200-multinode:
6666
- 'b200-dgxc-slurm_7'
6767
- 'b200-dgxc-slurm_8'
6868
mi300x:
69-
- 'mi300x-amd_0'
70-
- 'mi300x-amd_1'
71-
- 'mi300x-amd_2'
72-
- 'mi300x-amd_3'
73-
- 'mi300x-amd_4'
7469
- 'mi300x-amds_0'
7570
- 'mi300x-amds_1'
7671
- 'mi300x-amds_2'

benchmarks/single_node/kimik2.5_fp4_mi355x.sh

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,29 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3636
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
3737
fi
3838

39-
# do not enable aiter due to Aiter MLA not currently supporting num_heads=8
40-
# https://github.com/vllm-project/vllm/issues/35641
41-
# export VLLM_ROCM_USE_AITER=1
39+
# If the machine runs a MEC FW older than 177, RCCL
40+
# cannot reclaim some memory.
41+
# Disable that feature to avoid crashes.
42+
# This is related to the changes in the driver at:
43+
# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
44+
version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
45+
if [[ "$version" == "" || $version -lt 177 ]]; then
46+
export HSA_NO_SCRATCH_RECLAIM=1
47+
fi
48+
49+
export VLLM_ROCM_USE_AITER=1
50+
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
51+
52+
# Disable AITER RMSNorm for TP < 8 due to accuracy issues
53+
if [ "${TP}" -lt 8 ]; then
54+
export VLLM_ROCM_USE_AITER_RMSNORM=0
55+
fi
56+
57+
if [ "${EP_SIZE:-0}" -gt 1 ]; then
58+
EP=" --enable-expert-parallel"
59+
else
60+
EP=" "
61+
fi
4262

4363
# following AMD Andy Luo's recipe
4464
# https://x.com/linluo77/status/2017024513595301985
@@ -49,10 +69,11 @@ start_gpu_monitor
4969
set -x
5070
vllm serve $MODEL --port $PORT \
5171
--tensor-parallel-size=$TP \
52-
--gpu-memory-utilization 0.95 \
72+
$EP \
73+
--gpu-memory-utilization 0.90 \
5374
--max-model-len $MAX_MODEL_LEN \
54-
--block-size=64 \
55-
--disable-log-requests \
75+
--block-size=1 \
76+
--no-enable-prefix-caching \
5677
--trust-remote-code \
5778
--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &
5879

@@ -76,7 +97,7 @@ run_benchmark_serving \
7697

7798
# After throughput, run evaluation only if RUN_EVAL is true
7899
if [ "${RUN_EVAL}" = "true" ]; then
79-
run_eval --framework lm-eval --port "$PORT"
100+
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
80101
append_lm_eval_summary
81102
fi
82103

benchmarks/single_node/kimik2.5_int4_mi355x.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,14 @@ fi
3434
start_gpu_monitor
3535

3636
set -x
37+
export VLLM_ROCM_USE_AITER=1
3738
vllm serve $MODEL --port $PORT \
3839
--tensor-parallel-size=$TP \
3940
--gpu-memory-utilization 0.95 \
4041
--max-model-len $MAX_MODEL_LEN \
4142
--block-size=64 \
42-
--disable-log-requests \
4343
--trust-remote-code \
44+
--max-num-seqs 256 \
4445
--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &
4546

4647
SERVER_PID=$!
@@ -63,7 +64,7 @@ run_benchmark_serving \
6364

6465
# After throughput, run evaluation only if RUN_EVAL is true
6566
if [ "${RUN_EVAL}" = "true" ]; then
66-
run_eval --framework lm-eval --port "$PORT"
67+
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
6768
append_lm_eval_summary
6869
fi
6970

perf-changelog.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,24 @@
10681068
- "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130"
10691069
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943
10701070

1071+
- config-keys:
1072+
- kimik2.5-fp4-mi355x-vllm
1073+
description:
1074+
- "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0"
1075+
- "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)"
1076+
- "Add expert parallel, TP4, and TP4/EP4 search spaces"
1077+
- "Switch block-size from 64 to 1; gpu-memory-utilization from 0.95 to 0.90"
1078+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936
1079+
1080+
- config-keys:
1081+
- kimik2.5-int4-mi355x-vllm
1082+
description:
1083+
- "Upgrade vLLM ROCm image from v0.15.1 to v0.18.0"
1084+
- "Enable AITER MLA, export VLLM_ROCM_USE_AITER=1, https://github.com/vllm-project/vllm/issues/35641"
1085+
- "Triton Fused Moe Tuning https://github.com/vllm-project/vllm/pull/35093"
1086+
- "Add --max-num-seqs 256, remove --disable-log-requests"
1087+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950
1088+
10711089
- config-keys:
10721090
# NVIDIA single-node
10731091
- dsr1-fp4-b200-sglang

runners/launch_mi300x-amd.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.

runners/launch_mi300x-cr.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.

0 commit comments

Comments
 (0)