diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml
index c65dcd754..21ec4e200 100644
--- a/.github/workflows/gptoss-tmpl.yml
+++ b/.github/workflows/gptoss-tmpl.yml
@@ -39,6 +39,23 @@ on:
         required: true

 jobs:
+  bmk-h200:
+    if: ${{ inputs.use_h200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: h200
+      image: 'vllm/vllm-openai:v0.10.2'
+      model: 'openai/gpt-oss-120b'
+      tp-list: '[1, 2, 4, 8]'
+      framework: 'vllm'
+      precision: 'fp4'
+
   bmk-h200-trt:
     if: ${{ inputs.use_h200 }}
     uses: ./.github/workflows/benchmark-tmpl.yml
diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml
index fbe73d246..afe7cb676 100644
--- a/.github/workflows/runner-test.yml
+++ b/.github/workflows/runner-test.yml
@@ -46,6 +46,7 @@ on:
         options:
           - 'kedarpotdar147/vllm0.1:latest'
           - 'kedarpotdar147/vllm:05'
+          - 'vllm/vllm-openai:v0.10.2'
           - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1'
           - 'rocm/vllm-dev:open-mi300-08052025'
           - 'rocm/vllm-dev:open-mi355-08052025'
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index 4c6b9e9f5..8a082d466 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -1 +1,67 @@
-echo "dummy script"
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+set -x
+hf download $MODEL
+pip install datasets pandas
+
+# Create config.yaml
+cat > config.yaml << EOF
+async-scheduling: true
+no-enable-prefix-caching: true
+cuda-graph-sizes: 1024
+max-num-batched-tokens: 8192
+max-model-len: $MAX_MODEL_LEN
+EOF
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+
+export TORCH_CUDA_ARCH_LIST="9.0"
+
+PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
+    --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
+    --disable-log-requests > $SERVER_LOG 2>&1 &
+
+set +x
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    # Ignore intel_extension_for_pytorch import errors
+    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then
+        sleep 5
+        tail -n100 $SERVER_LOG
+        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
+        exit 1
+    fi
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        break
+    fi
+done < <(tail -F -n0 "$SERVER_LOG")
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+    --model $MODEL --backend vllm \
+    --base-url http://0.0.0.0:$PORT \
+    --dataset-name random \
+    --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
+    --num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
+    --request-rate inf --ignore-eos \
+    --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+    --result-dir /workspace/ \
+    --result-filename $RESULT_FILENAME.json
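
For reference, a minimal sketch of how the new Slurm script could be run by hand, assuming benchmark-tmpl.yml exports the required env vars listed at the top of the script. All values, the result filename, and the sbatch flags below are illustrative placeholders, not taken from this PR:

    # Hypothetical manual run -- every value here is a placeholder.
    export HF_TOKEN=<your-token> HF_HUB_CACHE=/shared/hf-cache
    export IMAGE='vllm/vllm-openai:v0.10.2' MODEL='openai/gpt-oss-120b'
    export ISL=1024 OSL=1024 MAX_MODEL_LEN=10240 RANDOM_RANGE_RATIO=0.8
    export TP=4 CONC=64 RESULT_FILENAME=gptoss_fp4_tp4_conc64 PORT_OFFSET=0
    sbatch --gpus=$TP --export=ALL benchmarks/gptoss_fp4_h200_slurm.sh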