diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 683ba28d0..6949525c4 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -33,17 +33,17 @@ dsr1-fp4-mi355x-atom:
   - isl: 1024
     osl: 1024
     search-space:
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 128 }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+      - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
   - isl: 1024
     osl: 8192
     search-space:
-      - { tp: 4, ep: 1, conc-start: 128, conc-end: 128 }
+      - { tp: 4, ep: 1, conc-start: 128, conc-end: 256 }
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
   - isl: 8192
     osl: 1024
     search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
 
 dsr1-fp4-mi355x-atom-mtp:
@@ -64,11 +64,13 @@ dsr1-fp4-mi355x-atom-mtp:
   - isl: 1024
     osl: 8192
     search-space:
-      - { tp: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+      # - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
-      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
 
 dsr1-fp8-mi300x-sglang:
   image: lmsysorg/sglang:v0.5.8-rocm700-mi30x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9490cb4b9..cc864f765 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -712,3 +712,14 @@
     - "Image: vllm/vllm-openai:v0.15.1"
     - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735
+
+- config-keys:
+    - dsr1-fp4-mi355x-atom
+    - dsr1-fp4-mi355x-atom-mtp
+  description:
+    - "Update search-space configurations for DSR1 FP4 MI355X ATOM and ATOM-MTP"
+    - "Comment out TP=4 configs, consolidate to TP=8 only"
+    - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)"
+    - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699
+