17 changes: 17 additions & 0 deletions .github/workflows/gptoss-tmpl.yml
@@ -39,6 +39,23 @@ on:
       required: true
 
 jobs:
+  bmk-h200:
+    if: ${{ inputs.use_h200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: h200
+      image: 'vllm/vllm-openai:v0.10.2'
+      model: 'openai/gpt-oss-120b'
+      tp-list: '[1, 2, 4, 8]'
+      framework: 'vllm'
+      precision: 'fp4'
+
   bmk-h200-trt:
     if: ${{ inputs.use_h200 }}
     uses: ./.github/workflows/benchmark-tmpl.yml
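
The new bmk-h200 job calls the reusable benchmark template once with tp-list '[1, 2, 4, 8]', which the template presumably fans out into one run per tensor-parallel degree; the fp4 precision label matches gpt-oss-120b's natively MXFP4-quantized weights. A minimal shell sketch of that fan-out, assuming jq is available (the template's matrix logic is not part of this diff):

# Hypothetical fan-out of the tp-list string into one run per TP degree.
TP_LIST='[1, 2, 4, 8]'
for TP in $(echo "$TP_LIST" | jq -r '.[]'); do
  echo "would launch a benchmark run with --tensor-parallel-size $TP"
done
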
1 change: 1 addition & 0 deletions .github/workflows/runner-test.yml
@@ -46,6 +46,7 @@ on:
         options:
           - 'kedarpotdar147/vllm0.1:latest'
           - 'kedarpotdar147/vllm:05'
+          - 'vllm/vllm-openai:v0.10.2'
           - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1'
           - 'rocm/vllm-dev:open-mi300-08052025'
           - 'rocm/vllm-dev:open-mi355-08052025'
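
The added option exposes the stock vLLM v0.10.2 image in the runner test's image picker. A hedged way to sanity-check that image outside CI, assuming a GPU host with Docker, the NVIDIA container toolkit, and an exported HF_TOKEN (the official vllm-openai image forwards its arguments to vllm serve):

# Illustrative smoke test of the image; TP and model choice are assumptions.
docker run --rm --gpus all -p 8000:8000 -e HF_TOKEN \
  vllm/vllm-openai:v0.10.2 \
  --model openai/gpt-oss-120b --tensor-parallel-size 4
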
68 changes: 67 additions & 1 deletion benchmarks/gptoss_fp4_h200_slurm.sh
@@ -1 +1,67 @@
echo "dummy script"
#!/usr/bin/env bash

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# MODEL
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT_OFFSET

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

set -x
hf download $MODEL
pip install datasets pandas

# Create config.yaml with server flags; keys mirror vllm serve CLI flags,
# and the unquoted heredoc expands $MAX_MODEL_LEN from the environment.
cat > config.yaml << EOF
async-scheduling: true
no-enable-prefix-caching: true
cuda-graph-sizes: 1024
max-num-batched-tokens: 8192
max-model-len: $MAX_MODEL_LEN
EOF

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
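# Offset the port per job so concurrent jobs on one node don't collide.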
PORT=$(( 8888 + $PORT_OFFSET ))

export TORCH_CUDA_ARCH_LIST="9.0"  # limit any JIT kernel builds to Hopper (H200 is SM 9.0)

PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
--disable-log-requests > $SERVER_LOG 2>&1 &

set +x
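# Follow the server log until "Application startup complete"; if an error
# line appears first, dump the log tail and fail the job.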
while IFS= read -r line; do
printf '%s\n' "$line"
# Ignore intel_extension_for_pytorch import errors
if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then
sleep 5
tail -n100 $SERVER_LOG
echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
exit 1
fi
if [[ "$line" == *"Application startup complete"* ]]; then
break
fi
done < <(tail -F -n0 "$SERVER_LOG")

set -x
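# Server is ready: fetch the benchmark client and replay random prompts at fixed concurrency.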
git clone https://github.com/kimbochen/bench_serving.git
python3 bench_serving/benchmark_serving.py \
--model $MODEL --backend vllm \
--base-url http://0.0.0.0:$PORT \
--dataset-name random \
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
--request-rate inf --ignore-eos \
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir /workspace/ \
--result-filename $RESULT_FILENAME.json
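
For reference, a hypothetical SLURM submission of this script. Every value below is illustrative, the resource flags depend on the cluster, and IMAGE plus container setup are assumed to be handled by the surrounding wrapper that CI uses:

export HF_TOKEN=hf_xxx                 # placeholder token
export HF_HUB_CACHE=/scratch/hf-cache  # placeholder cache path
sbatch --nodes=1 --gres=gpu:8 \
  --export=ALL,MODEL='openai/gpt-oss-120b',ISL=1024,OSL=1024,MAX_MODEL_LEN=10240,RANDOM_RANGE_RATIO=0.8,TP=4,CONC=64,RESULT_FILENAME=gptoss_fp4_tp4_c64,PORT_OFFSET=0 \
  benchmarks/gptoss_fp4_h200_slurm.sh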