Skip to content
240 changes: 179 additions & 61 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3723,7 +3723,7 @@ dsr1-fp8-gb200-dynamo-trt:


dsr1-fp8-gb200-dynamo-sglang:
image: lmsysorg/sglang:v0.5.5.post2
image: lmsysorg/sglang:v0.5.8.post1-cu130
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: gb200
Expand All @@ -3735,114 +3735,232 @@ dsr1-fp8-gb200-dynamo-sglang:
- isl: 1024
osl: 1024
search-space:
# "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 4096 ]
# "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
- conc-list: [4, 8]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml
- "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48)
- conc-list: [1024, 2048, 4096]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml
- "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml"
decode:
num-worker: 1
tp: 48
ep: 48
dp-attn: true

# "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [1024, 2048, 4096, 6144]
prefill:
num-worker: 2
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml
- "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml"
decode:
num-worker: 1
tp: 1
tp: 32
ep: 32
dp-attn: true
additional-settings:
- "DECODE_NODES=8"

# "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4)
- spec-decoding: "none"
conc-list: [ 2, 4, 8, 16, 64, 128 ]
# "Ultra throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8)
- conc-list: [4096]
prefill:
num-worker: 1
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
ep: 4
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-low-latency"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml
- "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml"
decode:
num-worker: 4
tp: 1
ep: 4
num-worker: 1
tp: 8
ep: 8
dp-attn: true

- isl: 8192
osl: 1024
search-space:
# "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8)
- conc-list: [4, 8, 16]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=4"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml
- "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false

# "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48)
- spec-decoding: "none"
conc-list: [ 1024, 2048, 4096 ]
# "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [512, 1024, 2048, 6144]
prefill:
num-worker: 3
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
num-worker: 5
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=6"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml
- "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml"
decode:
num-worker: 1
tp: 1
ep: 48
tp: 32
ep: 32
dp-attn: true

# "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
- conc-list: [2048, 4096, 6144]
prefill:
num-worker: 6
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=12"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml
- "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml"
decode:
num-worker: 1
tp: 24
ep: 24
dp-attn: true

- isl: 8192
dsr1-fp8-gb300-dynamo-sglang:
image: lmsysorg/sglang:v0.5.8.post1-cu130
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: gb300
precision: fp8
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
- spec-decoding: "none"
conc-list: [ 4, 8, 16, 32 ]
# "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4)
- conc-list: [4, 8, 16, 32]
prefill:
num-worker: 1
tp: 1
ep: 4
tp: 4
ep: 1
dp-attn: false
additional-settings:
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml
- "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml"
decode:
num-worker: 4
tp: 4
ep: 1
dp-attn: false

# "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [1024, 2048, 4096, 6144]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-low-latency"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml
- "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml"
decode:
num-worker: 1
tp: 1
ep: 4
tp: 32
ep: 32
dp-attn: true

# "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8)
- conc-list: [4096, 7168, 7680]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml
- "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true

# Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 512, 1024, 2048, 6144 ]
- isl: 8192
osl: 1024
search-space:
# "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
- conc-list: [4, 8]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml
- "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [128, 256, 512, 1024]
prefill:
num-worker: 5
tp: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml
- "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml"
decode:
num-worker: 1
tp: 1
tp: 32
ep: 32
dp-attn: true

# "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
- conc-list: [2048, 4096]
prefill:
num-worker: 6
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=8"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml
- "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml"
decode:
num-worker: 1
tp: 24
ep: 24
dp-attn: true

dsr1-fp4-gb200-dynamo-sglang:
image: "lmsysorg/sglang:v0.5.8-cu130"
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -474,3 +474,12 @@
- "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)"
- "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/633

- config-keys:
- dsr1-fp8-gb200-dynamo-sglang
- dsr1-fp8-gb300-dynamo-sglang
description:
- "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode"
- "Image: lmsysorg/sglang:v0.5.8.post1-cu130"
- "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/635
3 changes: 2 additions & 1 deletion runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
export SRT_SLURM_MODEL_PREFIX="dsr1"
export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/"
export SRT_SLURM_MODEL_PREFIX="dsr1-fp4"
Expand Down Expand Up @@ -98,6 +98,7 @@ PY
exit 0
fi


echo "Cloning srt-slurm repository..."
SRT_REPO_DIR="srt-slurm"
if [ -d "$SRT_REPO_DIR" ]; then
Expand Down
1 change: 1 addition & 0 deletions runners/launch_gb300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ model_paths:
"${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}"
containers:
dynamo-trtllm: ${SQUASH_FILE}
dynamo-sglang: ${SQUASH_FILE}
use_segment_sbatch_directive: false
EOF

Expand Down