From 643e75b53982042e96fa419f6f06c24c612802a9 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Tue, 27 Jan 2026 10:13:35 -0800 Subject: [PATCH 01/11] b200 fp4 Signed-off-by: jthomson04 --- .github/configs/nvidia-master.yaml | 360 +++++++++++++++++++++++++++++ .github/configs/runners.yaml | 2 + runners/launch_b200-dgxc-slurm.sh | 166 +++++++++++++ 3 files changed, 528 insertions(+) create mode 100644 runners/launch_b200-dgxc-slurm.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 44a0f1044..de3b47697 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,3 +1,363 @@ +dsr1-fp4-b200-dynamo-trt: + image: /lustre/fsw/containers/jwillthomson+dynamo-0.8.1.post2-test+latest.sqsh + model: deepseek-r1-fp4 + model-prefix: dsr1 + runner: b200-multinode-slurm + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [1214] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [875] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [6] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [10, 15, 25, 45, 90, 180] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - 
"CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 4968 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [10860] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + + # Non-MTP configurations + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [1365] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [6] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [10, 15, 25, 45, 90, 180] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 
+ dp-attn: false + - conc-list: [450] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: false + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [90] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [66] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [6] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [10, 15, 30, 60] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [548] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1096, 1691] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: 
"mtp" + conc-list: [658] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP configurations + - conc-list: [6] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [10, 15, 25, 50, 100] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [370] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1606] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [837] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2222] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + dsr1-fp4-b300-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0 model: deepseek-r1-fp4 diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml 
index 7754dda67..b0e187c7c 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -62,3 +62,5 @@ gb200: - gb200-nv_0 b300: - 'b300-nv_0' +b200-multinode-slurm: +- 'b200-dgxc-slurm_0' \ No newline at end of file diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh new file mode 100644 index 000000000..8cbd246db --- /dev/null +++ b/runners/launch_b200-dgxc-slurm.sh @@ -0,0 +1,166 @@ +#!/usr/bin/bash + +set -x + +SRT_REPO_DIR="srt-slurm" +echo "Cloning $SRT_REPO_DIR repository..." +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout jthomson04/trtllm + +echo "Installing srtctl..." +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +export SLURM_PARTITION="gpu" +export SLURM_ACCOUNT="root" + +if [[ $MODEL_PREFIX == "dsr1" ]]; then + if [[ $PRECISION == "fp4" ]]; then + export MODEL_PATH="/lustre/fsw/models/dsr1-0528-nvfp4-v2" + elif [[ $PRECISION == "fp8" ]]; then + export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8" + else + echo "Unsupported precision: $PRECISION. Supported precisions are: fp4, fp8" + exit 1 + fi + export SERVED_MODEL_NAME=$MODEL +else + echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: dsr1" + exit 1 +fi + +export ISL="$ISL" +export OSL="$OSL" + +# Create srtslurm.yaml for srtctl +echo "Creating srtslurm.yaml configuration..." 
+cat > srtslurm.yaml <&1) +echo "$SRTCTL_OUTPUT" + +# Extract JOB_ID from srtctl output +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +# Wait for this specific job to complete +echo "Waiting for job $JOB_ID to complete..." +while [ -n "$(squeue -j $JOB_ID --noheader 2>/dev/null)" ]; do + echo "Job $JOB_ID still running..." + squeue -j $JOB_ID + sleep 30 +done +echo "Job $JOB_ID completed!" + +echo "Collecting results..." + +# Use the JOB_ID to find the logs directory +# srtctl creates logs in outputs/JOB_ID/logs/ +LOGS_DIR="outputs/$JOB_ID/logs" + +if [ ! -d "$LOGS_DIR" ]; then + echo "Warning: Logs directory not found at $LOGS_DIR" + exit 1 +fi + +echo "Found logs directory: $LOGS_DIR" +cat $LOGS_DIR/sweep_$JOB_ID.log + +for file in $LOGS_DIR/*; do + if [ -f "$file" ]; then + tail -n 500 $file + fi +done + +# Find all result subdirectories +RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + +if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" +else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 
's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done +fi + +echo "All result files processed" + +# Cleanup +echo "Cleaning up..." +deactivate 2>/dev/null || true +rm -rf .venv +echo "Cleanup complete" From 010b48a51bee1779c10f3f20fded3e83417e5de4 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Tue, 27 Jan 2026 14:55:13 -0800 Subject: [PATCH 02/11] fix runners Signed-off-by: jthomson04 --- .github/configs/runners.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 3fddc3d99..95bacc78d 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -60,7 +60,5 @@ mi355x-disagg: - 'mi355x-amds_2' gb200: - gb200-nv_0 -b300: -- 'b300-nv_0' b200-multinode-slurm: - 'b200-dgxc-slurm_0' From ab854286d58803795b7cbc3f5878886f3e3d17ad Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Tue, 27 Jan 2026 15:28:30 -0800 Subject: [PATCH 03/11] fix branch + container Signed-off-by: jthomson04 --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_b200-dgxc-slurm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c1ff905b3..c4388a6a1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,5 +1,5 @@ dsr1-fp4-b200-dynamo-trt: - image: /lustre/fsw/containers/jwillthomson+dynamo-0.8.1.post2-test+latest.sqsh + image: /scratch/fsw/containers/dynamo-0.8.1.post1.sqsh model: deepseek-r1-fp4 model-prefix: dsr1 runner: b200-multinode-slurm diff --git 
a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 8cbd246db..dd102c192 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -11,7 +11,7 @@ fi git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" -git checkout jthomson04/trtllm +git checkout sa-submission-q1-2026 echo "Installing srtctl..." curl -LsSf https://astral.sh/uv/install.sh | sh From ffae7b4822a0f7748010721c22996427bb0e0237 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Tue, 27 Jan 2026 16:34:00 -0800 Subject: [PATCH 04/11] dynamically pull sqsh files Signed-off-by: jthomson04 --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_b200-dgxc-slurm.sh | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c4388a6a1..6fe491d56 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,5 +1,5 @@ dsr1-fp4-b200-dynamo-trt: - image: /scratch/fsw/containers/dynamo-0.8.1.post1.sqsh + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 model: deepseek-r1-fp4 model-prefix: dsr1 runner: b200-multinode-slurm diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index dd102c192..2eee25d81 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -49,6 +49,14 @@ fi export ISL="$ISL" export OSL="$OSL" +NGINX_IMAGE="nginx:1.27.4" + +SQUASH_FILE="/scratch/fsw/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="/scratch/fsw/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" +srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" + # Create srtslurm.yaml for srtctl echo "Creating srtslurm.yaml configuration..." 
cat > srtslurm.yaml < Date: Tue, 27 Jan 2026 16:50:35 -0800 Subject: [PATCH 05/11] recipies to recipes Signed-off-by: jthomson04 --- .github/configs/nvidia-master.yaml | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6fe491d56..7d251923c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -19,7 +19,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" decode: num-worker: 2 tp: 8 @@ -33,7 +33,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 5 tp: 8 @@ -47,7 +47,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 tp: 8 @@ -61,7 +61,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 5 tp: 8 @@ -75,7 +75,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" decode: num-worker: 4 tp: 8 @@ -89,7 +89,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true 
additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" decode: num-worker: 5 tp: 4 @@ -104,7 +104,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 @@ -117,7 +117,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 tp: 8 @@ -130,7 +130,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 8 @@ -143,7 +143,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 8 @@ -156,7 +156,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 8 @@ -169,7 +169,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" decode: 
num-worker: 6 tp: 8 @@ -187,7 +187,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 8 @@ -201,7 +201,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 3 tp: 8 @@ -215,7 +215,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 tp: 8 @@ -229,7 +229,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 5 tp: 8 @@ -243,7 +243,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 8 @@ -257,7 +257,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" decode: num-worker: 1 tp: 8 @@ -271,7 +271,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" + - 
"CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 2 tp: 8 @@ -286,7 +286,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 8 @@ -299,7 +299,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 8 @@ -312,7 +312,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 8 @@ -325,7 +325,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 @@ -338,7 +338,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 3 tp: 8 @@ -351,7 +351,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipies/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 tp: 8 From 3e30c2595b1a34b9b42b9247253b86a8fab613ee Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Tue, 27 Jan 
2026 16:55:53 -0800 Subject: [PATCH 06/11] update sqshfile path Signed-off-by: jthomson04 --- runners/launch_b200-dgxc-slurm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 2eee25d81..e9e749dd8 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -51,8 +51,8 @@ export OSL="$OSL" NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/scratch/fsw/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/scratch/fsw/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/scratch/fsw/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="/scratch/fsw/containers/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" From 1611b1edbc567ff3f02c8d99497c9d751f4742d7 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Tue, 27 Jan 2026 17:03:10 -0800 Subject: [PATCH 07/11] fix sqshfile path again Signed-off-by: jthomson04 --- runners/launch_b200-dgxc-slurm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index e9e749dd8..ed192cd57 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -51,8 +51,8 @@ export OSL="$OSL" NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/scratch/fsw/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/scratch/fsw/containers/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="/home/sa-shared/containers/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import 
-o $SQUASH_FILE docker://$IMAGE" srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" From 3993a92470bce33c3b86209c93195c00a8e9cd15 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Wed, 28 Jan 2026 13:25:53 -0800 Subject: [PATCH 08/11] add URL comments above CONFIG_FILE entries in dsr1-fp4-b200-dynamo-trt Co-Authored-By: Claude Opus 4.5 --- .github/configs/nvidia-master.yaml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7d251923c..12442ad16 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -19,6 +19,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" decode: num-worker: 2 @@ -33,6 +34,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -47,6 +49,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -61,6 +64,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml - 
"CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -75,6 +79,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" decode: num-worker: 4 @@ -89,6 +94,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" decode: num-worker: 5 @@ -104,6 +110,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -117,6 +124,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -130,6 +138,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -143,6 +152,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -156,6 +166,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -169,6 +180,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 6 @@ -187,6 +199,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -201,6 +214,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -215,6 +229,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -229,6 +244,7 @@ 
dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -243,6 +259,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -257,6 +274,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -271,6 +289,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 2 @@ -286,6 +305,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -299,6 +319,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml - 
"CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -312,6 +333,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -325,6 +347,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -338,6 +361,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -351,6 +375,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 From 7983467ba29821d255a200d7013f431fc986e283 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Wed, 28 Jan 2026 13:26:36 -0800 Subject: [PATCH 09/11] description of results format Signed-off-by: jthomson04 --- runners/launch_b200-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index ed192cd57..7a6d31288 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -148,6 +148,7 @@ else for result_file in 
$RESULT_FILES; do if [ -f "$result_file" ]; then # Extract metadata from filename + # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" filename=$(basename "$result_file") concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') From 4357ff52d7a82f558cbca188b4fdd55ad1bebb8a Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Wed, 28 Jan 2026 16:17:05 -0800 Subject: [PATCH 10/11] perf changelog Signed-off-by: jthomson04 --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index cf1c6035f..fc54f9995 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -273,3 +273,10 @@ description: - "Add DSR1 FP4 B300 Dynamo TRT configurations" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/585 +- config-keys: + - dsr1-fp4-b200-dynamo-trt + description: + - "Update DSR1 FP4 B200 Dynamo TRT configurations" + - "Update TRTLLM version to 1.2.0rc6.post2" + - "Transform to use srt-slurm recipes" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/588 \ No newline at end of file From d8d1aa60c56653e57a0e44d4de8d240334c01b4d Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Wed, 28 Jan 2026 16:17:58 -0800 Subject: [PATCH 11/11] perf changelog newline Signed-off-by: jthomson04 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fc54f9995..474681feb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -279,4 +279,4 @@ - "Update DSR1 FP4 B200 Dynamo TRT configurations" - "Update TRTLLM version to 1.2.0rc6.post2" - "Transform to use srt-slurm recipes" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/588 \ No newline at end of file + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/588