Skip to content

Commit 79db8ec

Browse files
committed
Merge branch 'main' into eval-8k1k-server-restart
# Conflicts: # benchmarks/single_node/kimik2.5_fp4_mi355x.sh # perf-changelog.yaml # runners/launch_mi300x-amd.sh # runners/launch_mi300x-cr.sh
2 parents bec4dba + da23bb4 commit 79db8ec

File tree

7 files changed

+55
-58
lines changed

7 files changed

+55
-58
lines changed

.github/configs/amd-master.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ glm5-fp8-mi355x-sglang:
294294
- { tp: 8, conc-start: 4, conc-end: 64 }
295295

296296
kimik2.5-int4-mi355x-vllm:
297-
image: vllm/vllm-openai-rocm:v0.15.1
297+
image: vllm/vllm-openai-rocm:v0.18.0
298298
model: moonshotai/Kimi-K2.5
299299
model-prefix: kimik2.5
300300
runner: mi355x
@@ -338,7 +338,7 @@ kimik2.5-int4-mi325x-vllm:
338338
- { tp: 8, conc-start: 4, conc-end: 64 }
339339

340340
kimik2.5-fp4-mi355x-vllm:
341-
image: vllm/vllm-openai-rocm:v0.16.0
341+
image: vllm/vllm-openai-rocm:v0.18.0
342342
model: amd/Kimi-K2.5-MXFP4
343343
model-prefix: kimik2.5
344344
runner: mi355x
@@ -350,14 +350,18 @@ kimik2.5-fp4-mi355x-vllm:
350350
osl: 1024
351351
search-space:
352352
- { tp: 8, conc-start: 4, conc-end: 64 }
353+
- { tp: 4, conc-start: 4, conc-end: 64 }
353354
- isl: 1024
354355
osl: 8192
355356
search-space:
356357
- { tp: 8, conc-start: 4, conc-end: 64 }
358+
- { tp: 4, conc-start: 4, conc-end: 64 }
359+
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
357360
- isl: 8192
358361
osl: 1024
359362
search-space:
360363
- { tp: 8, conc-start: 4, conc-end: 64 }
364+
- { tp: 4, conc-start: 4, conc-end: 64 }
361365

362366
minimaxm2.5-fp8-mi355x-vllm:
363367
image: vllm/vllm-openai-rocm:v0.18.0

.github/configs/runners.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,6 @@ b200-multinode:
6666
- 'b200-dgxc-slurm_7'
6767
- 'b200-dgxc-slurm_8'
6868
mi300x:
69-
- 'mi300x-amd_0'
70-
- 'mi300x-amd_1'
71-
- 'mi300x-amd_2'
72-
- 'mi300x-amd_3'
73-
- 'mi300x-amd_4'
7469
- 'mi300x-amds_0'
7570
- 'mi300x-amds_1'
7671
- 'mi300x-amds_2'

benchmarks/single_node/kimik2.5_fp4_mi355x.sh

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,29 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3636
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
3737
fi
3838

39-
# do not enable aiter due to Aiter MLA not currently supporting num_heads=8
40-
# https://github.com/vllm-project/vllm/issues/35641
41-
# export VLLM_ROCM_USE_AITER=1
39+
# If the machine runs a MEC FW older than 177, RCCL
40+
# cannot reclaim some memory.
41+
# Disable that feature to avoid crashes.
42+
# This is related to the changes in the driver at:
43+
# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
44+
version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
45+
if [[ "$version" == "" || $version -lt 177 ]]; then
46+
export HSA_NO_SCRATCH_RECLAIM=1
47+
fi
48+
49+
export VLLM_ROCM_USE_AITER=1
50+
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
51+
52+
# Disable AITER RMSNorm for TP < 8 due to accuracy issues
53+
if [ "${TP}" -lt 8 ]; then
54+
export VLLM_ROCM_USE_AITER_RMSNORM=0
55+
fi
56+
57+
if [ "${EP_SIZE:-0}" -gt 1 ]; then
58+
EP=" --enable-expert-parallel"
59+
else
60+
EP=" "
61+
fi
4262

4363
# following AMD Andy Luo's recipe
4464
# https://x.com/linluo77/status/2017024513595301985
@@ -49,10 +69,11 @@ start_gpu_monitor
4969
set -x
5070
vllm serve $MODEL --port $PORT \
5171
--tensor-parallel-size=$TP \
52-
--gpu-memory-utilization 0.95 \
72+
$EP \
73+
--gpu-memory-utilization 0.90 \
5374
--max-model-len $MAX_MODEL_LEN \
54-
--block-size=64 \
55-
--disable-log-requests \
75+
--block-size=1 \
76+
--no-enable-prefix-caching \
5677
--trust-remote-code \
5778
--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &
5879

@@ -76,7 +97,7 @@ run_benchmark_serving \
7697

7798
# After throughput, run evaluation only if RUN_EVAL is true
7899
if [ "${RUN_EVAL}" = "true" ]; then
79-
run_eval --framework lm-eval --port "$PORT"
100+
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
80101
append_lm_eval_summary
81102
fi
82103

benchmarks/single_node/kimik2.5_int4_mi355x.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,14 @@ fi
3434
start_gpu_monitor
3535

3636
set -x
37+
export VLLM_ROCM_USE_AITER=1
3738
vllm serve $MODEL --port $PORT \
3839
--tensor-parallel-size=$TP \
3940
--gpu-memory-utilization 0.95 \
4041
--max-model-len $MAX_MODEL_LEN \
4142
--block-size=64 \
42-
--disable-log-requests \
4343
--trust-remote-code \
44+
--max-num-seqs 256 \
4445
--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &
4546

4647
SERVER_PID=$!
@@ -63,7 +64,7 @@ run_benchmark_serving \
6364

6465
# After throughput, run evaluation only if RUN_EVAL is true
6566
if [ "${RUN_EVAL}" = "true" ]; then
66-
run_eval --framework lm-eval --port "$PORT"
67+
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
6768
append_lm_eval_summary
6869
fi
6970

perf-changelog.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,24 @@
10681068
- "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130"
10691069
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943
10701070

1071+
- config-keys:
1072+
- kimik2.5-fp4-mi355x-vllm
1073+
description:
1074+
- "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0"
1075+
- "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)"
1076+
- "Add expert parallel, TP4, and TP4/EP4 search spaces"
1077+
- "Switch block-size from 64 to 1; gpu-memory-utilization from 0.95 to 0.90"
1078+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936
1079+
1080+
- config-keys:
1081+
- kimik2.5-int4-mi355x-vllm
1082+
description:
1083+
- "Upgrade vLLM ROCm image from v0.15.1 to v0.18.0"
1084+
- "Enable AITER MLA, export VLLM_ROCM_USE_AITER=1, https://github.com/vllm-project/vllm/issues/35641"
1085+
- "Triton Fused Moe Tuning https://github.com/vllm-project/vllm/pull/35093"
1086+
- "Add --max-num-seqs 256, remove --disable-log-requests"
1087+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950
1088+
10711089
- config-keys:
10721090
# NVIDIA single-node
10731091
- dsr1-fp4-b200-sglang

runners/launch_mi300x-amd.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.

runners/launch_mi300x-cr.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.

0 commit comments

Comments
 (0)