17 changes: 17 additions & 0 deletions .github/workflows/gptoss-tmpl.yml
@@ -39,6 +39,23 @@ on:
       required: true
 
 jobs:
+  bmk-h200:
+    if: ${{ inputs.use_h200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: h200
+      image: 'vllm/vllm-openai:v0.10.2'
+      model: 'openai/gpt-oss-120b'
+      tp-list: '[1, 2, 4, 8]'
+      framework: 'vllm'
+      precision: 'fp4'
+
   bmk-h200-trt:
     if: ${{ inputs.use_h200 }}
     uses: ./.github/workflows/benchmark-tmpl.yml
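
The new bmk-h200 job calls the reusable benchmark template once with tp-list '[1, 2, 4, 8]', which the template presumably fans out into one run per tensor-parallel degree; the fp4 precision label matches gpt-oss-120b's natively MXFP4-quantized weights. A minimal shell sketch of that fan-out, assuming jq is available (the template's matrix logic is not part of this diff):

# Hypothetical fan-out of the tp-list string into one run per TP degree.
TP_LIST='[1, 2, 4, 8]'
for TP in $(echo "$TP_LIST" | jq -r '.[]'); do
  echo "would launch a benchmark run with --tensor-parallel-size $TP"
done
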
1 change: 1 addition & 0 deletions .github/workflows/runner-test.yml
@@ -46,6 +46,7 @@ on:
         options:
           - 'kedarpotdar147/vllm0.1:latest'
           - 'kedarpotdar147/vllm:05'
+          - 'vllm/vllm-openai:v0.10.2'
           - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1'
           - 'rocm/vllm-dev:open-mi300-08052025'
           - 'rocm/vllm-dev:open-mi355-08052025'
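
The added option exposes the stock vLLM v0.10.2 image in the runner test's image picker. A hedged way to sanity-check that image outside CI, assuming a GPU host with Docker, the NVIDIA container toolkit, and an exported HF_TOKEN (the official vllm-openai image forwards its arguments to vllm serve):

# Illustrative smoke test of the image; TP and model choice are assumptions.
docker run --rm --gpus all -p 8000:8000 -e HF_TOKEN \
  vllm/vllm-openai:v0.10.2 \
  --model openai/gpt-oss-120b --tensor-parallel-size 4
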
68 changes: 67 additions & 1 deletion benchmarks/gptoss_fp4_h200_slurm.sh
@@ -1 +1,67 @@
echo "dummy script"
#!/usr/bin/env bash

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# MODEL
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT_OFFSET

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

set -x
hf download $MODEL
pip install datasets pandas

# Create config.yaml with server flags; keys mirror vllm serve CLI flags,
# and the unquoted heredoc expands $MAX_MODEL_LEN from the environment.
cat > config.yaml << EOF
async-scheduling: true
no-enable-prefix-caching: true
cuda-graph-sizes: 1024
max-num-batched-tokens: 8192
max-model-len: $MAX_MODEL_LEN
EOF

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
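# Offset the port per job so concurrent jobs on one node don't collide.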
PORT=$(( 8888 + $PORT_OFFSET ))

export TORCH_CUDA_ARCH_LIST="9.0"  # limit any JIT kernel builds to Hopper (H200 is SM 9.0)

PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
--disable-log-requests > $SERVER_LOG 2>&1 &

set +x
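# Follow the server log until "Application startup complete"; if an error
# line appears first, dump the log tail and fail the job.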
while IFS= read -r line; do
printf '%s\n' "$line"
# Ignore intel_extension_for_pytorch import errors
if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then
sleep 5
tail -n100 $SERVER_LOG
echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
exit 1
fi
if [[ "$line" == *"Application startup complete"* ]]; then
break
fi
done < <(tail -F -n0 "$SERVER_LOG")

set -x
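# Server is ready: fetch the benchmark client and replay random prompts at fixed concurrency.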
git clone https://github.com/kimbochen/bench_serving.git
python3 bench_serving/benchmark_serving.py \
--model $MODEL --backend vllm \
--base-url http://0.0.0.0:$PORT \
--dataset-name random \
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
--request-rate inf --ignore-eos \
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir /workspace/ \
--result-filename $RESULT_FILENAME.json
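
For reference, a hypothetical SLURM submission of this script. Every value below is illustrative, the resource flags depend on the cluster, and IMAGE plus container setup are assumed to be handled by the surrounding wrapper that CI uses:

export HF_TOKEN=hf_xxx                 # placeholder token
export HF_HUB_CACHE=/scratch/hf-cache  # placeholder cache path
sbatch --nodes=1 --gres=gpu:8 \
  --export=ALL,MODEL='openai/gpt-oss-120b',ISL=1024,OSL=1024,MAX_MODEL_LEN=10240,RANDOM_RANGE_RATIO=0.8,TP=4,CONC=64,RESULT_FILENAME=gptoss_fp4_tp4_c64,PORT_OFFSET=0 \
  benchmarks/gptoss_fp4_h200_slurm.sh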