diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2362ae632..5bcf8f171 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3723,7 +3723,7 @@ dsr1-fp8-gb200-dynamo-trt: dsr1-fp8-gb200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.5.post2 + image: lmsysorg/sglang:v0.5.8.post1-cu130 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: gb200 @@ -3735,114 +3735,232 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32) - - spec-decoding: "none" - conc-list: [ 4096 ] + # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) + - conc-list: [4, 8] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [1024, 2048, 4096, 6144] prefill: num-worker: 2 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=4" - - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1k1k-max-tpt" + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" decode: num-worker: 1 - tp: 1 + tp: 32 ep: 32 dp-attn: true - additional-settings: - - "DECODE_NODES=8" - # "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4) - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 64, 128 ] + # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) + - conc-list: [4096] prefill: num-worker: 1 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 - ep: 4 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1k1k-low-latency" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" decode: - num-worker: 4 - tp: 1 - ep: 4 + num-worker: 1 + tp: 8 + ep: 8 dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false additional-settings: - - "DECODE_NODES=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false - # "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48) - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] + # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [512, 1024, 2048, 6144] prefill: - num-worker: 3 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # 
https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 + num-worker: 5 + tp: 8 ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=6" - - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1k1k-max-tpt" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" decode: num-worker: 1 - tp: 1 - ep: 48 + tp: 32 + ep: 32 + dp-attn: true + + # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) + - conc-list: [2048, 4096, 6144] + prefill: + num-worker: 6 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=12" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true - - isl: 8192 +dsr1-fp8-gb300-dynamo-sglang: + image: lmsysorg/sglang:v0.5.8.post1-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: gb300 + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 osl: 1024 search-space: - # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4) - - spec-decoding: "none" - conc-list: [ 4, 8, 16, 32 ] + # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) + - conc-list: [4, 8, 16, 32] prefill: num-worker: 1 - tp: 1 - ep: 4 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + + # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [1024, 2048, 4096, 6144] + prefill: + num-worker: 2 + tp: 8 + ep: 8 dp-attn: true 
additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - - "SCRIPT_MODE=8k1k-low-latency" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" decode: num-worker: 1 - tp: 1 - ep: 4 + tp: 32 + ep: 32 + dp-attn: true + + # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) + - conc-list: [4096, 7168, 7680] + prefill: + num-worker: 1 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true - # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32) - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 6144 ] + - isl: 8192 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) + - conc-list: [4, 8] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [128, 256, 512, 1024] prefill: num-worker: 5 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=10" - - "N_ADDITIONAL_FRONTENDS=8" - - "SCRIPT_MODE=8k1k-max-tpt" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" decode: num-worker: 1 - tp: 1 + tp: 32 ep: 32 dp-attn: true + + # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) + - conc-list: [2048, 4096] + 
prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true dsr1-fp4-gb200-dynamo-sglang: image: "lmsysorg/sglang:v0.5.8-cu130" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 486c3b1e3..6a339b7b3 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -474,3 +474,12 @@ - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/633 + +- config-keys: + - dsr1-fp8-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + description: + - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130" + - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/635 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f71ea72d4..c4d1ed7af 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -11,7 +11,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" - export SRT_SLURM_MODEL_PREFIX="dsr1" + export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" @@ -98,6 +98,7 @@ PY exit 0 fi + echo "Cloning srt-slurm repository..." 
SRT_REPO_DIR="srt-slurm" if [ -d "$SRT_REPO_DIR" ]; then diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 0fe24b891..80df5347e 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -72,6 +72,7 @@ model_paths: "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" containers: dynamo-trtllm: ${SQUASH_FILE} + dynamo-sglang: ${SQUASH_FILE} use_segment_sbatch_directive: false EOF