From 7f7c12cfcd78eb461bd184d5ab06a25de0164e82 Mon Sep 17 00:00:00 2001 From: Pekka Heikura Date: Wed, 17 Jun 2026 16:08:22 +0300 Subject: [PATCH] perf(cuda): packed multi-prompt prefill for continuous batching (#193) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CudaForwardPass.PrefillPackedMulti prefilled the S pending prompts' chunks sequentially — a loop over PrefillWithCache, each chunk a separate batched-trunk forward pass that re-read every weight. With several prompts admitted concurrently their prefill GEMMs were not amortized across prompts (unlike the CPU ForwardPass.PrefillPackedMulti, #183 Gap 2). Pack the S chunks token-major into the batched-trunk scratch and run ONE forward pass over N = Σ chunk_len so every trunk + output GEMM amortizes its weight read across all prompts. Per-sequence RoPE / QK-norm / KV-append / varlen attention run on each chunk's slice at its absolute startPos against its own per-sequence cache (cu_seqlens-style — no cross-sequence attention, no padding), mirroring the CPU packed path and the single-sequence PrefillBatchedTrunk. - Extract GpuPrefillAppendAttention from GpuLayerBatchedTrunk so the packed path drives the IDENTICAL KV-append + attention dispatch per sub-sequence (argmax-stable with the single-sequence trunk; verified the extraction is byte-for-byte the prior inline dispatch — Qwen3 dense + Gemma4 SWA/shared-KV prefill oracles all pass). - New GpuLayerPackedTrunk (dense-only packed layer) + PrefillPackedTrunkMulti (driver) + AllChunksPackable gate (mirrors the single-seq #162 attention cap). - PrefillPackedMulti packs when S>=2 && IsBatchedPrefillSupported() && AllChunksPackable, else falls back to the (always-correct) sequential loop. - Dense-only assertion in PrefillPackedTrunkMulti self-enforces the contract (ThrowIfBatchingUnsupported is the real Gemma-4/softcap gate, not IsBatchedPrefillSupported which Gemma 4 satisfies). 
Tests (Qwen3-8B Q4_K, 4070 Ti): packed-vs-sequential final-token logits, chunked packed prefill vs whole-prompt (cross-chunk KV + S<2 fallback), and packed prefill -> batched decode vs single-user prefill+decode. All argmax-stable within the cross-path tolerance the batched-trunk oracles hold. Co-Authored-By: Claude Opus 4.8 --- src/SharpInference.Engine/CudaForwardPass.cs | 345 +++++++++++++++--- .../CudaBatchForwardMultiTests.cs | 172 +++++++++ 2 files changed, 465 insertions(+), 52 deletions(-) diff --git a/src/SharpInference.Engine/CudaForwardPass.cs b/src/SharpInference.Engine/CudaForwardPass.cs index a5d226c..56c8ad4 100644 --- a/src/SharpInference.Engine/CudaForwardPass.cs +++ b/src/SharpInference.Engine/CudaForwardPass.cs @@ -3023,20 +3023,88 @@ void ApplyRopeBatched() _gpu.HeadNormPureBatched(vAll, layerKv, layerHd, N, _hp.RmsNormEps); ApplyRopeBatched(); - if (!kvShared) + // Append-target ring (this layer's own KV) and attention-source ring (the effective + // layer's — same as `layer` unless shared-KV aliases it). SWA layers wrap a window- + // sized ring; everything else is full-context. + int appendCtx = isSwa && window > 0 ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen; + int effLayerCtx = (_hp.IsSwaLayer is { } swaEff && swaEff[effLayer] && window > 0) + ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen; + GpuPrefillAppendAttention( + qAll, kAll, vAll, attnAll, + kvShared ? null : _gpuKCache[layer], kvShared ? null : _gpuVCache[layer], + _gpuKCache[effLayer], _gpuVCache[effLayer], + _numHeads, layerKv, layerHd, startPos, N, + isSwa, window, appendCtx, effLayerCtx); + + GpuMatMulBatched(_bpHidden!, _wo[layer], attnAll, N); + if (_wPostAttnNorm is not null) + _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostAttnNorm[layer], N, _embDim, _hp.RmsNormEps); + _gpu.AddInPlace(_bpHidden!, _bpResidual!); + + // FFN. 
+ _gpu.CopyDevice(_bpResidual!, _bpHidden!); + _gpu.RmsNormBatched(_bpNorm!, _bpHidden!, _wFfnNorm[layer], N, _embDim, _hp.RmsNormEps); + GpuMatMulBatched(_bpFfnGate!, _wGate[layer], _bpNorm!, N); + GpuMatMulBatched(_bpFfnUp!, _wUp[layer], _bpNorm!, N); + // SwiGLU (Silu, Qwen/Llama) vs GEGLU (GeluApprox, Gemma 4). Both are + // elementwise over the whole N·intermDim buffer, so the batched call is + // identical to the per-token one bar the activation. + if (_hp.FfnActivation == FfnActivation.GeluApprox) + _gpu.GeluTanhMul(_bpFfnGate!, _bpFfnUp!); + else + _gpu.SiLuMul(_bpFfnGate!, _bpFfnUp!); + GpuMatMulBatched(_bpHidden!, _wDown[layer], _bpFfnGate!, N); + if (_wPostFfwNorm is not null) + _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostFfwNorm[layer], N, _embDim, _hp.RmsNormEps); + _gpu.AddInPlace(_bpHidden!, _bpResidual!); + + // PLE injection, batched: gate = inp_gate @ hidden; gelu * proj-slice; + // proj @; post-norm; add. proj-slice read with per-token stride via the + // strided gelu, so no gather of the [N × L*pleWidth] projection buffer. + if (_hp.HasPerLayerTokenEmbd) { - int layerCtx = isSwa && window > 0 ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen; + GpuMatMulBatched(_bpPleGate!, _gpuInpGate![layer], _bpHidden!, N); + _gpu.GeluTanhMulStrided(_bpPleGate!, _bpProjAll!, _pleWidth, + (long)_hp.NumLayers * _pleWidth, (long)layer * _pleWidth, N); + GpuMatMulBatched(_bpPleY!, _gpuPleProj![layer], _bpPleGate!, N); + _gpu.RmsNormBatched(_bpPleY!, _bpPleY!, _gpuPlePostNorm![layer], N, _embDim, _hp.RmsNormEps); + _gpu.AddInPlace(_bpHidden!, _bpPleY!); + } + + if (_layerOutputScale is not null) + _gpu.ScaleInPlace(_bpHidden!, _layerOutputScale[layer]); + + _gpu.Free(qAll); _gpu.Free(kAll); _gpu.Free(vAll); _gpu.Free(attnAll); + } + + /// + /// Shared KV-append + attention dispatch for a contiguous span of + /// query rows at absolute positions [, startPos+nTok). 
When + /// / are non-null the rows' K/V are + /// appended into that ring (skipped for Gemma-4 shared-KV layers, which reuse the source + /// layer's cache); the queries then attend against / over [0, startPos+i] (causal, SWA-windowed when ). + /// Factored out of so the packed multi-prompt prefill + /// (issue #193) drives the IDENTICAL dispatch per sub-sequence — guaranteeing it is + /// argmax-stable with the single-sequence batched trunk it amortizes across prompts. + /// + private void GpuPrefillAppendAttention( + Tensor qAll, Tensor kAll, Tensor vAll, Tensor attnAll, + Tensor? kAppend, Tensor? vAppend, Tensor kAttn, Tensor vAttn, + int numHeads, int layerKv, int layerHd, int startPos, int nTok, + bool isSwa, int window, int appendCtx, int effLayerCtx) + { + if (kAppend is not null && vAppend is not null) + { + int kvDimL = layerKv * layerHd; if (_kvDType == DType.BFloat16) - _gpu.KvAppendBatchedBf16(kAll, vAll, _gpuKCache[layer], _gpuVCache[layer], kvDimL, startPos, layerCtx, N); + _gpu.KvAppendBatchedBf16(kAll, vAll, kAppend, vAppend, kvDimL, startPos, appendCtx, nTok); else if (_kvDType == DType.Q8_0) - _gpu.KvAppendBatchedQ8_0(kAll, vAll, _gpuKCache[layer], _gpuVCache[layer], kvDimL, startPos, layerCtx, N); + _gpu.KvAppendBatchedQ8_0(kAll, vAll, kAppend, vAppend, kvDimL, startPos, appendCtx, nTok); else - _gpu.KvAppendBatched(kAll, vAll, _gpuKCache[layer], _gpuVCache[layer], kvDimL, startPos, layerCtx, N); + _gpu.KvAppendBatched(kAll, vAll, kAppend, vAppend, kvDimL, startPos, appendCtx, nTok); } - int effLayerCtx = (_hp.IsSwaLayer is { } swaEff && swaEff[effLayer] && window > 0) - ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen; - if (s_prefillProfile) { _gpu.Synchronize(); _profSw.Restart(); } // Gemma 4: attention_scale = 1.0, passed explicitly (kernel skips its rsqrtf). // Other models pass _attnScale = -1 so the kernel derives 1/sqrt(head_dim). 
@@ -3050,47 +3118,120 @@ void ApplyRopeBatched() // have no narrowed thunk yet — a trivial follow-up only a non-%64 head_dim model // past 4096 would need. if (PrefillFlashTcEnabled && !_forceFlashTc1 && (layerHd & 63) == 0) - _gpu.FlashAttentionPrefillTc2(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N, + _gpu.FlashAttentionPrefillTc2(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, nTok, attnScale: _attnScale, kvCacheType: _kvDType); else if (isSwa && _kvDType == DType.BFloat16) - _gpu.AttentionSwaBatchedBf16(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, window, effLayerCtx, N, attnScale: _attnScale); + _gpu.AttentionSwaBatchedBf16(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, window, effLayerCtx, nTok, attnScale: _attnScale); else if (isSwa) - _gpu.AttentionSwaBatchedQ8_0(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, window, effLayerCtx, N, attnScale: _attnScale); + _gpu.AttentionSwaBatchedQ8_0(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, window, effLayerCtx, nTok, attnScale: _attnScale); else if (_kvDType == DType.BFloat16) - _gpu.AttentionBatchedBf16(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, effLayerCtx, N, attnScale: _attnScale); + _gpu.AttentionBatchedBf16(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, effLayerCtx, nTok, attnScale: _attnScale); else - _gpu.AttentionBatchedQ8_0(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, effLayerCtx, N, attnScale: _attnScale); + _gpu.AttentionBatchedQ8_0(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, effLayerCtx, nTok, attnScale: _attnScale); } else if (PrefillFlashTcEnabled && 
(layerHd & 15) == 0) { // #147 multi-warp/d-split when head_dim is a multiple of 64 (W·16); else the // #146 single-warp kernel. SHARPI_PREFILL_FLASH_TC1=1 forces single-warp (A/B). if (!_forceFlashTc1 && (layerHd & 63) == 0) - _gpu.FlashAttentionPrefillTc2(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N, attnScale: _attnScale); + _gpu.FlashAttentionPrefillTc2(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, nTok, attnScale: _attnScale); else - _gpu.FlashAttentionPrefillTc(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N, attnScale: _attnScale); + _gpu.FlashAttentionPrefillTc(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, nTok, attnScale: _attnScale); } else if (PrefillFlashAttnEnabled) - _gpu.FlashAttentionPrefill(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N, attnScale: _attnScale); + _gpu.FlashAttentionPrefill(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, isSwa ? 
window : 0, effLayerCtx, nTok, attnScale: _attnScale); else if (isSwa) - _gpu.AttentionSwaBatched(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, window, effLayerCtx, N, attnScale: _attnScale); + _gpu.AttentionSwaBatched(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, window, effLayerCtx, nTok, attnScale: _attnScale); else - _gpu.AttentionBatched(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll, - _numHeads, layerKv, layerHd, startPos, effLayerCtx, N, attnScale: _attnScale); + _gpu.AttentionBatched(qAll, kAttn, vAttn, attnAll, + numHeads, layerKv, layerHd, startPos, effLayerCtx, nTok, attnScale: _attnScale); if (s_prefillProfile) { _gpu.Synchronize(); _profAttnMs += _profSw.Elapsed.TotalMilliseconds; } + } + + /// + /// One transformer layer of the PACKED multi-prompt prefill (issue #193): the dense subset + /// of with the attention block run per sub-sequence. The + /// RmsNorm / QKV / O / FFN GEMMs run batched over the full packed N (= Σ chunk_len, so each + /// weight read amortizes across all prompts); QK-norm / RoPE / KV-append / attention run per + /// sub-sequence on a slice of the packed Q/K/V buffers at that sequence's absolute startPos + /// against its own cache (cu_seqlens-style varlen — no cross-sequence attention, no padding). + /// Dense-only: the caller's (via + /// ) rejects every Gemma-4 / softcap model, so no + /// PLE / SWA / per-layer-head_dim / shared-KV / k_eq_v / sandwich-norm / layer_output_scale / + /// softcap path is reachable here. ( alone is NOT + /// enough — Gemma 4 satisfies it; asserts the stronger + /// contract.) 
+ /// + private void GpuLayerPackedTrunk(int layer, int[] off, int[] startPos, CudaSequenceKvCache[] caches, int S, int N) + { + int qDim = _numHeads * _headDim; + int kvDim = _numKvHeads * _headDim; + + var qAll = _gpu.View(_bpQ!, 0, (long)N * qDim); + var kAll = _gpu.View(_bpK!, 0, (long)N * kvDim); + var vAll = _gpu.View(_bpV!, 0, (long)N * kvDim); + var attnAll = _gpu.View(_bpAttnOut!, 0, (long)N * qDim); + + _gpu.CopyDevice(_bpResidual!, _bpHidden!); + _gpu.RmsNormBatched(_bpNorm!, _bpHidden!, _wAttnNorm[layer], N, _embDim, _hp.RmsNormEps); + + GpuMatMulBatched(qAll, _wq[layer], _bpNorm!, N); + GpuMatMulBatched(kAll, _wk[layer], _bpNorm!, N); + GpuMatMulBatched(vAll, _wv[layer]!, _bpNorm!, N); + + bool useRoPE = _hp.NoRopeLayerStep == 0 || (layer + 1) % _hp.NoRopeLayerStep != 0; + float ropeTheta = _hp.RopeTheta; + + // Per sub-sequence: QK-norm before RoPE (#157), then KV-append + varlen attention into + // that sequence's own cache via the shared dispatch. Each op acts on the chunk's slice + // of the packed buffers at its absolute startPos[s]. 
+ for (int s = 0; s < S; s++) + { + int len = off[s + 1] - off[s]; + var qS = _gpu.View(_bpQ!, (long)off[s] * qDim, (long)len * qDim); + var kS = _gpu.View(_bpK!, (long)off[s] * kvDim, (long)len * kvDim); + var vS = _gpu.View(_bpV!, (long)off[s] * kvDim, (long)len * kvDim); + var aS = _gpu.View(_bpAttnOut!, (long)off[s] * qDim, (long)len * qDim); + try + { + if (_hasQkNorm && !_hp.UseL2QkNorm) + _gpu.HeadNormQkBatched(qS, _wqNorm![layer], kS, _wkNorm![layer], + _numHeads, _numKvHeads, _headDim, len, _hp.RmsNormEps, _hp.IsPerChannelQkNorm); + if (useRoPE) + { + if (_gpuRopeFreqs is { } rfTbl) + { + _gpu.RoPEWithFactorsBatched(qS, startPos[s], _headDim, ropeTheta, rfTbl, _numHeads, len); + _gpu.RoPEWithFactorsBatched(kS, startPos[s], _headDim, ropeTheta, rfTbl, _numKvHeads, len); + } + else + { + _gpu.RoPEPartialBatched(qS, startPos[s], _headDim, _headDim, ropeTheta, _numHeads, len, neox: true); + _gpu.RoPEPartialBatched(kS, startPos[s], _headDim, _headDim, ropeTheta, _numKvHeads, len, neox: true); + } + } + var kc = caches[s].K[layer]; + var vc = caches[s].V[layer]; + GpuPrefillAppendAttention(qS, kS, vS, aS, kc, vc, kc, vc, + _numHeads, _numKvHeads, _headDim, startPos[s], len, + isSwa: false, window: 0, appendCtx: _maxSeqLen, effLayerCtx: _maxSeqLen); + } + finally + { + _gpu.Free(qS); _gpu.Free(kS); _gpu.Free(vS); _gpu.Free(aS); + } + } GpuMatMulBatched(_bpHidden!, _wo[layer], attnAll, N); - if (_wPostAttnNorm is not null) - _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostAttnNorm[layer], N, _embDim, _hp.RmsNormEps); _gpu.AddInPlace(_bpHidden!, _bpResidual!); // FFN. @@ -3098,35 +3239,118 @@ void ApplyRopeBatched() _gpu.RmsNormBatched(_bpNorm!, _bpHidden!, _wFfnNorm[layer], N, _embDim, _hp.RmsNormEps); GpuMatMulBatched(_bpFfnGate!, _wGate[layer], _bpNorm!, N); GpuMatMulBatched(_bpFfnUp!, _wUp[layer], _bpNorm!, N); - // SwiGLU (Silu, Qwen/Llama) vs GEGLU (GeluApprox, Gemma 4). 
Both are - // elementwise over the whole N·intermDim buffer, so the batched call is - // identical to the per-token one bar the activation. if (_hp.FfnActivation == FfnActivation.GeluApprox) _gpu.GeluTanhMul(_bpFfnGate!, _bpFfnUp!); else _gpu.SiLuMul(_bpFfnGate!, _bpFfnUp!); GpuMatMulBatched(_bpHidden!, _wDown[layer], _bpFfnGate!, N); - if (_wPostFfwNorm is not null) - _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostFfwNorm[layer], N, _embDim, _hp.RmsNormEps); _gpu.AddInPlace(_bpHidden!, _bpResidual!); - // PLE injection, batched: gate = inp_gate @ hidden; gelu * proj-slice; - // proj @; post-norm; add. proj-slice read with per-token stride via the - // strided gelu, so no gather of the [N × L*pleWidth] projection buffer. - if (_hp.HasPerLayerTokenEmbd) + _gpu.Free(qAll); _gpu.Free(kAll); _gpu.Free(vAll); _gpu.Free(attnAll); + } + + /// <summary> + /// True packed multi-prompt prefill (issue #193): concatenate the S chunks token-major into + /// the trunk scratch and run one forward pass over N = Σ chunk_len so every trunk + output + /// GEMM amortizes its weight read across all prompts (mirrors the CPU + /// <see cref="ForwardPass.PrefillPackedMulti"/> and the single-sequence + /// <see cref="PrefillBatchedTrunk"/>). Per-sequence RoPE / QK-norm / KV-append / varlen + /// attention keep each chunk attending only to its own cache. Advances each cache's logical + /// length and returns the final-token logits per sequence where + /// <paramref name="wantLogits"/> (the chunk completes that prompt), null otherwise. Dense-only; gated by the caller. + /// </summary> + private float[]?[] PrefillPackedTrunkMulti( + ReadOnlyMemory[] chunks, int[] startPos, CudaSequenceKvCache[] caches, bool[] wantLogits) + { + // Defense-in-depth: the packed trunk is dense-only — it omits every Gemma-4 step (PLE, + // SWA rings, per-layer head_dim, shared-KV aliasing, k_eq_v, sandwich norms, + // layer_output_scale) and the final-logit softcap. 
The caller's + // ThrowIfBatchingUnsupported (via DenseBatchedDecodeSupported) already rejects all of + // these, so this is unreachable in production; assert it loudly rather than silently + // emitting garbage if a future caller skips that guard or relaxes it for another batched + // feature. IsBatchedPrefillSupported alone is NOT sufficient — Gemma 4 satisfies it. + if (_isGemma4Like || _hp.HasPerLayerTokenEmbd || _hp.LayerHeadDim is not null + || _hp.SlidingWindowSize > 0 || _hp.KvSourceLayer is not null || _hp.FinalLogitSoftcap > 0f) + throw new InvalidOperationException( + "PrefillPackedTrunkMulti is dense-only; this model requires the Gemma-4 / softcap " + + "path that ThrowIfBatchingUnsupported should have rejected before reaching here."); + + int S = chunks.Length; + var off = new int[S + 1]; + for (int s = 0; s < S; s++) + off[s + 1] = off[s] + chunks[s].Length; + int N = off[S]; + int embDim = _embDim; + + EnsureBatchedTrunkScratch(N); + + // 1. Embed every token token-major into _bpHidden (sequence s's row i at off[s]+i). + // EmbedTokenGpu is the same lookup the single-user prefill uses, so this is + // bit-identical to prefilling each chunk on its own. 
+ for (int s = 0; s < S; s++) { - GpuMatMulBatched(_bpPleGate!, _gpuInpGate![layer], _bpHidden!, N); - _gpu.GeluTanhMulStrided(_bpPleGate!, _bpProjAll!, _pleWidth, - (long)_hp.NumLayers * _pleWidth, (long)layer * _pleWidth, N); - GpuMatMulBatched(_bpPleY!, _gpuPleProj![layer], _bpPleGate!, N); - _gpu.RmsNormBatched(_bpPleY!, _bpPleY!, _gpuPlePostNorm![layer], N, _embDim, _hp.RmsNormEps); - _gpu.AddInPlace(_bpHidden!, _bpPleY!); + var span = chunks[s].Span; + for (int i = 0; i < span.Length; i++) + { + EmbedTokenGpu(span[i]); // writes _hidden + _gpu.CopyDeviceRegion(_bpHidden!, (long)(off[s] + i) * embDim * sizeof(float), + _hidden, 0, (long)embDim * sizeof(float)); + } } + if (_hp.EmbeddingScale != 1f) + _gpu.ScaleInPlace(_bpHidden!, _hp.EmbeddingScale); - if (_layerOutputScale is not null) - _gpu.ScaleInPlace(_bpHidden!, _layerOutputScale[layer]); + // 2. Transformer layers: batched GEMMs over N, per-sequence attention. + for (int layer = 0; layer < _hp.NumLayers; layer++) + GpuLayerPackedTrunk(layer, off, startPos, caches, S, N); - _gpu.Free(qAll); _gpu.Free(kAll); _gpu.Free(vAll); _gpu.Free(attnAll); + // 3. Per-sequence final norm + output projection on each completed prompt's last token. + // The output GEMM is per-sequence (last rows are at scattered offsets), but it runs + // only for the prompts whose final chunk landed this pass, so it's at most S small + // matvecs against the already-resident lm-head weights. + var result = new float[]?[S]; + int vocab = _hp.VocabSize; + for (int s = 0; s < S; s++) + { + // Lengths advance only here, after the whole layer loop — a throw in the layer loop + // leaves every cache length untouched. A throw within THIS step-3 loop can leave + // earlier sequences advanced and later ones not, but that is safe: the only caller + // (ContinuousBatchingEngine.RunPrefillStep) fails and disposes EVERY involved cache + // on any throw, so a partially-committed batch is never observed. 
+ caches[s].Length = startPos[s] + (off[s + 1] - off[s]); + if (!wantLogits[s]) continue; + var lastHidden = _gpu.View(_bpHidden!, (long)(off[s + 1] - 1) * embDim, embDim); + _gpu.RmsNorm(_hidden, lastHidden, _wOutputNorm, _hp.RmsNormEps); + _gpu.Free(lastHidden); + GpuMatMul(_logits, _wOutput, _hidden); + // No softcap: ThrowIfBatchingUnsupported (via DenseBatchedDecodeSupported) rejects + // any FinalLogitSoftcap model before this method is reached, and the dense-only + // assertion at the top re-checks it. + _gpu.Download(_logits, _logitsBuf); + _gpu.Synchronize(); + result[s] = _logitsBuf.AsSpan(0, vocab).ToArray(); + } + return result; + } + + /// + /// Whether the packed multi-prompt prefill can run: every sub-sequence's attention range + /// [0, startPos+len) must fit the active prefill attention kernel's cap. The non-flash + /// shared-scores AttentionBatched kernel throws above startPos+len=4096; the streaming flash + /// kernels have no cap when they cover all layers. Mirrors the single-sequence + /// gate (issue #162); dense models have no SWA, so canChunkPast4096 + /// reduces to "flash covers all layers". + /// + private bool AllChunksPackable(ReadOnlyMemory[] chunks, int[] startPos) + { + bool flashCoversAll = _kvDType is DType.BFloat16 or DType.Q8_0 + ? NarrowedFlashTc2CoversAllLayers() + : PrefillFlashAttnEnabled; + bool canChunkPast4096 = flashCoversAll && (_hp.IsSwaLayer is not null || _hp.SlidingWindowSize <= 0); + int cap = canChunkPast4096 ? _maxSeqLen : 4096; + for (int s = 0; s < chunks.Length; s++) + if (startPos[s] + chunks[s].Length > cap) return false; + return true; } /// @@ -3496,10 +3720,15 @@ internal ReadOnlySpan PrefillWithCache(IReadOnlyList tokens, CudaSeq } /// - /// Prefill several pending sequences' chunks. 
Cross-prompt packing into one forward pass - /// is a follow-up (issue #190); for now each chunk prefills sequentially into its own - /// per-sequence cache via — still correct and still - /// amortizing the batched-trunk GEMMs within each chunk, just not across prompts. + /// Prefill several pending sequences' chunks in ONE packed forward pass (issue #193): + /// the S chunks are concatenated token-major and run through the trunk over N = Σ chunk_len + /// so every weight read amortizes across all prompts (like + /// does for decode), with per-sequence varlen attention into each chunk's own cache. Falls + /// back to the sequential per-sequence loop (still correct, + /// just not cross-prompt amortized) when the batched trunk can't run this model + /// ( — e.g. attn bias / L2 QK-norm / non-NEOX RoPE), + /// when any sub-sequence's attention range exceeds the kernel cap + /// (), or for a single chunk (nothing to amortize across). /// internal float[]?[] PrefillPackedMulti( ReadOnlyMemory[] chunks, int[] startPos, CudaSequenceKvCache[] caches, bool[] wantLogits) @@ -3513,6 +3742,18 @@ internal ReadOnlySpan PrefillWithCache(IReadOnlyList tokens, CudaSeq if (S == 0) return Array.Empty(); if (startPos.Length != S || caches.Length != S || wantLogits.Length != S) throw new ArgumentException("chunks/startPos/caches/wantLogits lengths must match."); + for (int s = 0; s < S; s++) + { + if (chunks[s].IsEmpty) + throw new ArgumentException($"Chunk for sequence {s} is empty.", nameof(chunks)); + if (startPos[s] < 0 || (long)startPos[s] + chunks[s].Length > _maxSeqLen) + throw new ArgumentException( + $"Sequence {s}: startPos {startPos[s]} + chunk {chunks[s].Length} exceeds maxSeqLen {_maxSeqLen}.", + nameof(startPos)); + } + + if (S >= 2 && IsBatchedPrefillSupported() && AllChunksPackable(chunks, startPos)) + return PrefillPackedTrunkMulti(chunks, startPos, caches, wantLogits); var result = new float[]?[S]; for (int s = 0; s < S; s++) diff --git 
a/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs b/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs index 47d19c7..b178191 100644 --- a/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs +++ b/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs @@ -504,6 +504,178 @@ public void Qwen3_8B_PrefillWithCache_Chunked_MatchesFull() Assert.True(maxAbs < 1.0f, $"Chunked-vs-full prefill maxAbs={maxAbs}."); } + /// <summary> + /// Issue #193: <see cref="CudaForwardPass.PrefillPackedMulti"/> packs two prompts into ONE + /// forward pass (weights amortized across both). Each sequence's final-token logits must + /// reproduce the per-sequence <see cref="CudaForwardPass.PrefillWithCache"/> oracle (the + /// sequential path #193 replaces) — same kernels via the shared dispatch, only the trunk + /// GEMMs now batch across prompts, so argmax-stable within the cross-path tolerance. + /// </summary> + [Fact] + public void Qwen3_8B_PrefillPackedMulti_N2_MatchesSequential() + { + using var gpu = TryCreate(); + if (gpu is null) return; + var path = FindModelPath(); + if (path is null) return; + + using var model = GgufModel.Open(path); + var hp = ModelHyperparams.FromGgufMetadata(model.Metadata, model); + Assert.Null(hp.LayerHeadDim); + using var fwd = NewFwd(model, gpu, hp); + + // Sequential reference: each prompt prefilled on its own (the per-sequence loop). + using var refA = fwd.CreateCache(); + using var refB = fwd.CreateCache(); + float[] seqA = fwd.PrefillWithCache(PromptA, refA).ToArray(); + float[] seqB = fwd.PrefillWithCache(PromptB, refB).ToArray(); + + // Packed: both prompts concatenated into one pass. 
+ using var packA = fwd.CreateCache(); + using var packB = fwd.CreateCache(); + float[]?[] packed = fwd.PrefillPackedMulti( + [PromptA, PromptB], + [0, 0], + [packA, packB], + [true, true]); + + Assert.Equal(2, packed.Length); + Assert.NotNull(packed[0]); + Assert.NotNull(packed[1]); + Assert.Equal(PromptA.Length, packA.Length); + Assert.Equal(PromptB.Length, packB.Length); + + var (maxAbsA, overlapA) = Compare(seqA, packed[0]!); + Assert.Equal(Argmax(seqA), Argmax(packed[0]!)); + Assert.True(overlapA >= 4, $"Seq A packed top-5 overlap {overlapA}/5 (maxAbs={maxAbsA})."); + Assert.True(maxAbsA < 1.0f, $"Seq A packed vs sequential maxAbs={maxAbsA}."); + + var (maxAbsB, overlapB) = Compare(seqB, packed[1]!); + Assert.Equal(Argmax(seqB), Argmax(packed[1]!)); + Assert.True(overlapB >= 4, $"Seq B packed top-5 overlap {overlapB}/5 (maxAbs={maxAbsB})."); + Assert.True(maxAbsB < 1.0f, $"Seq B packed vs sequential maxAbs={maxAbsB}."); + } + + /// + /// Issue #193: chunked packed prefill — two prompts of different lengths advance together in + /// chunk steps (mirroring the engine's RunPrefillStep), and only each prompt's final + /// chunk requests logits. The result must match a single whole-prompt prefill, validating + /// that later chunks read prior chunks' KV correctly AND that a step left with one prompt + /// (the shorter one finished) falls back through the S<2 sequential path cleanly. 
+ /// + [Fact] + public void Qwen3_8B_PrefillPackedMulti_Chunked_MatchesWhole() + { + using var gpu = TryCreate(); + if (gpu is null) return; + var path = FindModelPath(); + if (path is null) return; + + using var model = GgufModel.Open(path); + var hp = ModelHyperparams.FromGgufMetadata(model.Metadata, model); + Assert.Null(hp.LayerHeadDim); + using var fwd = NewFwd(model, gpu, hp); + + int[] longA = { 9707, 11, 1879, 0, 358, 1079, 264, 4108, 1614, 13, 220, 17, 18, 19 }; + int[] longB = { 1079, 264, 4108, 1614, 13, 220, 17, 9707, 11, 1879 }; + + using var refA = fwd.CreateCache(); + using var refB = fwd.CreateCache(); + float[] wholeA = fwd.PrefillWithCache(longA, refA).ToArray(); + float[] wholeB = fwd.PrefillWithCache(longB, refB).ToArray(); + + using var packA = fwd.CreateCache(); + using var packB = fwd.CreateCache(); + int cA = 0, cB = 0; + const int chunk = 4; + float[]? finalA = null, finalB = null; + while (cA < longA.Length || cB < longB.Length) + { + var chunks = new List>(); + var starts = new List(); + var caches = new List(); + var wants = new List(); + var which = new List(); + if (cA < longA.Length) + { + int take = Math.Min(chunk, longA.Length - cA); + chunks.Add(longA.AsMemory(cA, take)); starts.Add(cA); caches.Add(packA); + wants.Add(cA + take == longA.Length); which.Add(0); cA += take; + } + if (cB < longB.Length) + { + int take = Math.Min(chunk, longB.Length - cB); + chunks.Add(longB.AsMemory(cB, take)); starts.Add(cB); caches.Add(packB); + wants.Add(cB + take == longB.Length); which.Add(1); cB += take; + } + var res = fwd.PrefillPackedMulti(chunks.ToArray(), starts.ToArray(), caches.ToArray(), wants.ToArray()); + for (int i = 0; i < which.Count; i++) + if (res[i] is { } lg) { if (which[i] == 0) finalA = lg; else finalB = lg; } + } + + Assert.Equal(longA.Length, packA.Length); + Assert.Equal(longB.Length, packB.Length); + Assert.NotNull(finalA); + Assert.NotNull(finalB); + + Assert.Equal(Argmax(wholeA), Argmax(finalA!)); + var (maxAbsA, 
overlapA) = Compare(wholeA, finalA!); + Assert.True(overlapA >= 4, $"Chunked A top-5 overlap {overlapA}/5 (maxAbs={maxAbsA})."); + Assert.True(maxAbsA < 1.0f, $"Chunked A maxAbs={maxAbsA}."); + + Assert.Equal(Argmax(wholeB), Argmax(finalB!)); + var (maxAbsB, overlapB) = Compare(wholeB, finalB!); + Assert.True(overlapB >= 4, $"Chunked B top-5 overlap {overlapB}/5 (maxAbs={maxAbsB})."); + Assert.True(maxAbsB < 1.0f, $"Chunked B maxAbs={maxAbsB}."); + } + + /// <summary> + /// Issue #193 end-to-end: packed prefill must leave each per-sequence cache in the exact + /// state a batched decode step needs. Pack two prompts, then run one + /// <see cref="CudaForwardPass.BatchForwardMulti"/> decode step and require each sequence to + /// reproduce the single-user prefill+decode next-token logits — the real engine path + /// (packed admission → batched decode). + /// </summary> + [Fact] + public void Qwen3_8B_PrefillPackedMulti_ThenBatchedDecode_MatchesSingleUser() + { + using var gpu = TryCreate(); + if (gpu is null) return; + var path = FindModelPath(); + if (path is null) return; + + using var model = GgufModel.Open(path); + var hp = ModelHyperparams.FromGgufMetadata(model.Metadata, model); + Assert.Null(hp.LayerHeadDim); + using var fwd = NewFwd(model, gpu, hp); + + // Single-user reference: prefill, greedy token, one decode step. 
+ fwd.ResetCache(); + int tokA = Argmax(fwd.Prefill(PromptA)); + float[] refA = fwd.Forward(tokA, PromptA.Length).ToArray(); + fwd.ResetCache(); + int tokB = Argmax(fwd.Prefill(PromptB)); + float[] refB = fwd.Forward(tokB, PromptB.Length).ToArray(); + + using var packA = fwd.CreateCache(); + using var packB = fwd.CreateCache(); + var packed = fwd.PrefillPackedMulti([PromptA, PromptB], [0, 0], [packA, packB], [true, true]); + Assert.Equal(tokA, Argmax(packed[0]!)); + Assert.Equal(tokB, Argmax(packed[1]!)); + + var batch = fwd.BatchForwardMulti([tokA, tokB], [PromptA.Length, PromptB.Length], [packA, packB]); + + var (maxAbsA, overlapA) = Compare(refA, batch[0]); + Assert.Equal(Argmax(refA), Argmax(batch[0])); + Assert.True(overlapA >= 4, $"Seq A packed→decode top-5 overlap {overlapA}/5 (maxAbs={maxAbsA})."); + Assert.True(maxAbsA < 1.0f, $"Seq A packed→decode maxAbs={maxAbsA}."); + + var (maxAbsB, overlapB) = Compare(refB, batch[1]); + Assert.Equal(Argmax(refB), Argmax(batch[1])); + Assert.True(overlapB >= 4, $"Seq B packed→decode top-5 overlap {overlapB}/5 (maxAbs={maxAbsB})."); + Assert.True(maxAbsB < 1.0f, $"Seq B packed→decode maxAbs={maxAbsB}."); + } + /// Empty token list and empty batch are rejected / no-op, matching the CPU path. [Fact] public void Qwen3_8B_BatchForwardMulti_EmptyBatch_ReturnsEmpty()