SemiAnalysisAI · kedarpotdar-nv · Feb 9, 2026 · Feb 5, 2026 · Feb 5, 2026 · Feb 5, 2026
@@ -3723,7 +3723,7 @@ dsr1-fp8-gb200-dynamo-trt:
 
 
 dsr1-fp8-gb200-dynamo-sglang:
-  image: lmsysorg/sglang:v0.5.5.post2
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: gb200
@@ -3735,114 +3735,232 @@ dsr1-fp8-gb200-dynamo-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32)
-    - spec-decoding: "none"
-      conc-list: [ 4096 ]
+    # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
+    - conc-list: [4, 8]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+
+    # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48)
+    - conc-list: [1024, 2048, 4096]
+      prefill:
+        num-worker: 3
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml"
+      decode:
+        num-worker: 1
+        tp: 48
+        ep: 48
+        dp-attn: true
+
+    # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [1024, 2048, 4096, 6144]
       prefill:
         num-worker: 2
-        # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
-        # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=4"
-        - "N_ADDITIONAL_FRONTENDS=9"
-        - "SCRIPT_MODE=1k1k-max-tpt"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml"
       decode:
         num-worker: 1
-        tp: 1
+        tp: 32
         ep: 32
         dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=8"
 
-    # "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4)
-    - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 64, 128 ]
+    # "Ultra throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8)
+    - conc-list: [4096]
       prefill:
         num-worker: 1
-        # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
-        # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
-        tp: 1
-        ep: 4
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
-        - "N_ADDITIONAL_FRONTENDS=9"
-        - "SCRIPT_MODE=1k1k-low-latency"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml"
       decode:
-        num-worker: 4
-        tp: 1
-        ep: 4
+        num-worker: 1
+        tp: 8
+        ep: 8
         dp-attn: true
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8)
+    - conc-list: [4, 8, 16]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "DECODE_NODES=4"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
 
-    # "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48)
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
+    # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [512, 1024, 2048, 6144]
       prefill:
-        num-worker: 3
-        # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
-        # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
-        tp: 1
+        num-worker: 5
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=6"
-        - "N_ADDITIONAL_FRONTENDS=9"
-        - "SCRIPT_MODE=1k1k-max-tpt"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 48
+        tp: 32
+        ep: 32
+        dp-attn: true
+
+    # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
+    - conc-list: [2048, 4096, 6144]
+      prefill:
+        num-worker: 6
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=12"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml"
+      decode:
+        num-worker: 1
+        tp: 24
+        ep: 24
+        dp-attn: true
 
-  - isl: 8192
+dsr1-fp8-gb300-dynamo-sglang:
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: gb300
+  precision: fp8
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
     osl: 1024
     search-space:
-    # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
-    - spec-decoding: "none"
-      conc-list: [ 4, 8, 16, 32 ]
+    # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4)
+    - conc-list: [4, 8, 16, 32]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 4
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml"
+      decode:
+        num-worker: 4
+        tp: 4
+        ep: 1
+        dp-attn: false
+
+    # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [1024, 2048, 4096, 6144]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
-        - "N_ADDITIONAL_FRONTENDS=8"
-        - "SCRIPT_MODE=8k1k-low-latency"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 4
+        tp: 32
+        ep: 32
+        dp-attn: true
+
+    # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8)
+    - conc-list: [4096, 7168, 7680]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=1"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
 
-    # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048, 6144 ]
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
+    - conc-list: [4, 8]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+
+    # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [128, 256, 512, 1024]
       prefill:
         num-worker: 5
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=10"
-        - "N_ADDITIONAL_FRONTENDS=8"
-        - "SCRIPT_MODE=8k1k-max-tpt"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml"
       decode:
         num-worker: 1
-        tp: 1
+        tp: 32
         ep: 32
         dp-attn: true
+
+    # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
+    - conc-list: [2048, 4096]
+      prefill:
+        num-worker: 6
+        tp: 8
+        ep: 8
+        dp-attn: true
         additional-settings:
-        - "DECODE_NODES=8"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml"
+      decode:
+        num-worker: 1
+        tp: 24
+        ep: 24
+        dp-attn: true
 
 dsr1-fp4-gb200-dynamo-sglang:
   image: "lmsysorg/sglang:v0.5.8-cu130"

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -474,3 +474,12 @@
     - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)"
     - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/633  
+
+- config-keys:
+    - dsr1-fp8-gb200-dynamo-sglang
+    - dsr1-fp8-gb300-dynamo-sglang
+  description:
+    - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode"
+    - "Image: lmsysorg/sglang:v0.5.8.post1-cu130"
+    - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k"
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/635
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
@@ -11,7 +11,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
     export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
     if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
-        export SRT_SLURM_MODEL_PREFIX="dsr1"
+        export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
     elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/"
         export SRT_SLURM_MODEL_PREFIX="dsr1-fp4"
@@ -98,6 +98,7 @@ PY
     exit 0
 fi
 
+
 echo "Cloning srt-slurm repository..."
 SRT_REPO_DIR="srt-slurm"
 if [ -d "$SRT_REPO_DIR" ]; then

diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh
@@ -72,6 +72,7 @@ model_paths:
   "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}"
 containers:
   dynamo-trtllm: ${SQUASH_FILE}
+  dynamo-sglang: ${SQUASH_FILE}
 use_segment_sbatch_directive: false
 EOF