Merged
158 changes: 1 addition & 157 deletions .github/configs/nvidia-master.yaml
@@ -2266,160 +2266,4 @@ gptoss-fp4-gb200-dynamo-trt:
- "DECODE_MAX_NUM_TOKENS=20000"
- "DECODE_MAX_BATCH_SIZE=512"
- "DECODE_GPU_MEM_FRACTION=0.9"

dsr1-fp8-h200-dynamo-sglang:
image: lmsysorg/sglang:v0.5.8-cu130-runtime
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: h200-multinode-slurm
precision: fp8
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# Aggregated mode (single node TEP)
- conc-list: [1, 4, 16, 32, 64, 128, 256, 512]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml"
decode:
num-worker: 0
tp: 8
ep: 1
dp-attn: false
# Low latency (1 prefill, 9 decode, TEP)
- conc-list: [1, 4, 8, 16, 32, 64, 128, 256]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml"
decode:
num-worker: 9
tp: 8
ep: 1
dp-attn: false
# High throughput TEP (1 prefill, 6 decode)
- conc-list: [512, 1024, 2048]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml"
decode:
num-worker: 6
tp: 8
ep: 1
dp-attn: false
# High throughput DEP (1 prefill, 6 decode, dp-attention)
- conc-list: [128, 256, 512, 1024, 2048]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml"
decode:
num-worker: 6
tp: 8
ep: 8
dp-attn: true
- isl: 8192
osl: 1024
search-space:
# Aggregated mode (single node TEP)
- conc-list: [1, 4, 16, 32, 64, 128, 256]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/8k1k/bs128-agg-tp.yaml"
decode:
num-worker: 0
tp: 8
ep: 1
dp-attn: false
# Low latency TEP (1 prefill, 7 decode)
- conc-list: [1, 4, 8]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml"
decode:
num-worker: 7
tp: 8
ep: 1
dp-attn: false
# TEP (1 prefill, 6 decode)
- conc-list: [4, 8, 16]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml"
decode:
num-worker: 6
tp: 8
ep: 1
dp-attn: false
# TEP (1 prefill, 3 decode)
- conc-list: [8, 16, 32]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml"
decode:
num-worker: 3
tp: 8
ep: 1
dp-attn: false
# TEP (2 prefill, 3 decode)
- conc-list: [32, 64, 128]
prefill:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml"
decode:
num-worker: 3
tp: 8
ep: 1
dp-attn: false
# High throughput DEP (1 prefill, 1 decode, dp-attention)
- conc-list: [64, 128, 256]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true

86 changes: 0 additions & 86 deletions AGENT.md
@@ -179,92 +179,6 @@ When working with benchmark configurations, use these valid values:
2. Create launcher script in `runners/` directory
3. Update relevant master config with new runner type

### Registering Recipes from srtslurm

For disaggregated multi-node configurations (dynamo-sglang, dynamo-trt), recipes are stored in the external [srtslurm](https://github.com/ishandhanani/srt-slurm) repository. To register these recipes in InferenceMAX:

**1. Locate source recipes in srtslurm:**
```bash
# Example: H200 sglang disagg recipes
ls /path/to/srtslurm/recipes/h200/
# 1k1k/ 8k1k/
```

**2. Analyze recipe structure:**
Each recipe YAML contains:
- `name`: Recipe identifier
- `model`: Model path/container info
- `resources`: GPU type, prefill/decode node/worker counts
- `backend.sglang_config`: Prefill and decode configuration (tp-size, dp-size, ep-size, dp-attention, etc.)
- `benchmark`: ISL/OSL and concurrency settings
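
A hypothetical recipe sketch with these fields looks like the following. Field names follow the list above; the recipe name, values, and exact nesting are illustrative, not copied from the srtslurm repo:

```yaml
# Illustrative srtslurm recipe sketch (values are made up)
name: bs128-agg-tp
model:
  path: deepseek-ai/DeepSeek-R1-0528
resources:
  gpu_type: h200
  prefill_workers: 1
  decode_workers: 0
backend:
  sglang_config:
    prefill:
      tp-size: 8
      ep-size: 1
      enable-dp-attention: false
benchmark:
  isl: 1024
  osl: 1024
  concurrencies: "1,4,16,32"
```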

**3. Add config to nvidia-master.yaml:**
```yaml
dsr1-fp8-h200-dynamo-sglang:
image: lmsysorg/sglang:v0.5.8-cu130-runtime
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: h200-multinode-slurm
precision: fp8
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- conc-list: [1, 4, 16, 32, 64, 128, 256, 512]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml"
decode:
num-worker: 0
tp: 8
ep: 1
dp-attn: false
```

**4. Key mapping from srtslurm to nvidia-master.yaml:**

| srtslurm field | nvidia-master.yaml field |
|----------------|-------------------------|
| `resources.prefill_workers` | `prefill.num-worker` |
| `resources.decode_workers` | `decode.num-worker` |
| `sglang_config.prefill.tp-size` | `prefill.tp` |
| `sglang_config.prefill.ep-size` | `prefill.ep` |
| `sglang_config.prefill.enable-dp-attention` | `prefill.dp-attn` |
| `benchmark.concurrencies` (parsed) | `conc-list` |
| Recipe file path | `additional-settings: CONFIG_FILE=...` |
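
As a sketch of the table above, the following helper converts one srtslurm-style recipe dict into a nvidia-master.yaml search-space entry. The function name and the exact dict layout are assumptions for illustration, not part of InferenceMAX tooling:

```python
# Illustrative only: field names follow the mapping table above,
# but this helper is not part of the InferenceMAX codebase.
def srtslurm_to_search_space(recipe: dict, config_file: str) -> dict:
    """Map one srtslurm recipe to a nvidia-master.yaml search-space entry."""
    res = recipe["resources"]
    cfg = recipe["backend"]["sglang_config"]
    entry = {"conc-list": recipe["benchmark"]["concurrencies"]}
    for role, workers_key in (("prefill", "prefill_workers"),
                              ("decode", "decode_workers")):
        role_cfg = cfg[role]
        entry[role] = {
            "num-worker": res[workers_key],
            "tp": role_cfg["tp-size"],
            "ep": role_cfg["ep-size"],
            "dp-attn": role_cfg.get("enable-dp-attention", False),
        }
    # The recipe file path becomes a CONFIG_FILE additional setting
    entry["prefill"]["additional-settings"] = [f"CONFIG_FILE={config_file}"]
    return entry
```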

**5. Common patterns:**
- **Aggregated (AGG)**: Single node, `num-worker: 1` for prefill, `num-worker: 0` for decode
- **TEP (Tensor-Expert Parallel)**: `dp-attn: false`, `ep: 1`
- **DEP (Data-Expert Parallel)**: `dp-attn: true`, `ep: 8` (typically)
- **Low latency**: More decode workers (e.g., 9), lower concurrencies
- **High throughput**: Fewer decode workers, higher concurrencies
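
The patterns above can be expressed as a small heuristic. This classifier is an illustrative sketch (not part of the repo) that labels a search-space entry from its decode settings:

```python
# Illustrative heuristic matching the pattern list above.
# Input dicts use nvidia-master.yaml field names (num-worker, dp-attn).
def classify_pattern(prefill: dict, decode: dict) -> str:
    """Label a search-space entry as AGG, DEP, or TEP."""
    if decode["num-worker"] == 0:
        return "AGG"   # aggregated: no separate decode workers
    if decode.get("dp-attn"):
        return "DEP"   # dp-attention enabled on decode
    return "TEP"       # disaggregated without dp-attention
```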

**6. Add perf-changelog entry:**
```yaml
- config-keys:
- dsr1-fp8-h200-dynamo-sglang
description:
- "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration"
- "Image: lmsysorg/sglang:v0.5.8-cu130-runtime"
- "Recipes sourced from srtslurm repo (recipes/h200/)"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/XXX
```

**7. Validate configuration:**
```bash
python utils/matrix_logic/generate_sweep_configs.py full-sweep \
--master-config .github/configs/nvidia-master.yaml \
--framework dynamo-sglang
```

### Updating Docker Images

When upgrading Docker images in benchmark scripts and master config `.yaml` files:
12 changes: 0 additions & 12 deletions perf-changelog.yaml
@@ -283,15 +283,3 @@
- "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths"
- "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/570

- config-keys:
- dsr1-fp8-h200-dynamo-sglang
description:
- "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration"
- "Image: lmsysorg/sglang:v0.5.8-cu130-runtime"
- "Runner: h200-multinode-slurm with multinode and disagg enabled"
- "Recipes sourced from srtslurm repo (recipes/h200/)"
- "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)"
- "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)"
- "Concurrency levels range from 1 to 2048 depending on configuration"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/TBD