From 7f7c12cfcd78eb461bd184d5ab06a25de0164e82 Mon Sep 17 00:00:00 2001
From: Pekka Heikura <pekkah@gmail.com>
Date: Wed, 17 Jun 2026 16:08:22 +0300
Subject: [PATCH] perf(cuda): packed multi-prompt prefill for continuous
 batching (#193)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CudaForwardPass.PrefillPackedMulti prefilled the S pending prompts'
chunks sequentially — a loop over PrefillWithCache, each chunk a separate
batched-trunk forward pass that re-read every weight. With several prompts
admitted concurrently their prefill GEMMs were not amortized across prompts
(unlike the CPU ForwardPass.PrefillPackedMulti, #183 Gap 2).

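Pack the S chunks token-major into the batched-trunk scratch and run ONE
forward pass over N = Σ chunk_len so every trunk GEMM amortizes its weight
read across all prompts; the final norm + lm-head projection still runs per
sequence, on the last token of each prompt that completes in this pass.
Per-sequence RoPE / QK-norm / KV-append /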
varlen attention run on each chunk's slice at its absolute startPos against
its own per-sequence cache (cu_seqlens-style — no cross-sequence attention,
no padding), mirroring the CPU packed path and the single-sequence
PrefillBatchedTrunk.

- Extract GpuPrefillAppendAttention from GpuLayerBatchedTrunk so the packed
  path drives the IDENTICAL KV-append + attention dispatch per sub-sequence
  (argmax-stable with the single-sequence trunk; verified the extraction is
  byte-for-byte the prior inline dispatch — Qwen3 dense + Gemma4 SWA/shared-KV
  prefill oracles all pass).
- New GpuLayerPackedTrunk (dense-only packed layer) + PrefillPackedTrunkMulti
  (driver) + AllChunksPackable gate (mirrors the single-seq #162 attention cap).
- PrefillPackedMulti packs when S>=2 && IsBatchedPrefillSupported() &&
  AllChunksPackable, else falls back to the (always-correct) sequential loop.
- Dense-only assertion in PrefillPackedTrunkMulti self-enforces the contract
  (ThrowIfBatchingUnsupported is the real Gemma-4/softcap gate, not
  IsBatchedPrefillSupported which Gemma 4 satisfies).

Tests (Qwen3-8B Q4_K, 4070 Ti): packed-vs-sequential final-token logits,
chunked packed prefill vs whole-prompt (cross-chunk KV + S<2 fallback), and
packed prefill -> batched decode vs single-user prefill+decode. All
argmax-stable within the cross-path tolerance the batched-trunk oracles hold.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/SharpInference.Engine/CudaForwardPass.cs  | 345 +++++++++++++++---
 .../CudaBatchForwardMultiTests.cs             | 172 +++++++++
 2 files changed, 465 insertions(+), 52 deletions(-)

diff --git a/src/SharpInference.Engine/CudaForwardPass.cs b/src/SharpInference.Engine/CudaForwardPass.cs
index a5d226c..56c8ad4 100644
--- a/src/SharpInference.Engine/CudaForwardPass.cs
+++ b/src/SharpInference.Engine/CudaForwardPass.cs
@@ -3023,20 +3023,88 @@ void ApplyRopeBatched()
             _gpu.HeadNormPureBatched(vAll, layerKv, layerHd, N, _hp.RmsNormEps);
         ApplyRopeBatched();
 
-        if (!kvShared)
+        // Append-target ring (this layer's own KV) and attention-source ring (the effective
+        // layer's — same as `layer` unless shared-KV aliases it). SWA layers wrap a window-
+        // sized ring; everything else is full-context.
+        int appendCtx = isSwa && window > 0 ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen;
+        int effLayerCtx = (_hp.IsSwaLayer is { } swaEff && swaEff[effLayer] && window > 0)
+            ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen;
+        GpuPrefillAppendAttention(
+            qAll, kAll, vAll, attnAll,
+            kvShared ? null : _gpuKCache[layer], kvShared ? null : _gpuVCache[layer],
+            _gpuKCache[effLayer], _gpuVCache[effLayer],
+            _numHeads, layerKv, layerHd, startPos, N,
+            isSwa, window, appendCtx, effLayerCtx);
+
+        GpuMatMulBatched(_bpHidden!, _wo[layer], attnAll, N);
+        if (_wPostAttnNorm is not null)
+            _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostAttnNorm[layer], N, _embDim, _hp.RmsNormEps);
+        _gpu.AddInPlace(_bpHidden!, _bpResidual!);
+
+        // FFN.
+        _gpu.CopyDevice(_bpResidual!, _bpHidden!);
+        _gpu.RmsNormBatched(_bpNorm!, _bpHidden!, _wFfnNorm[layer], N, _embDim, _hp.RmsNormEps);
+        GpuMatMulBatched(_bpFfnGate!, _wGate[layer], _bpNorm!, N);
+        GpuMatMulBatched(_bpFfnUp!,   _wUp[layer],   _bpNorm!, N);
+        // SwiGLU (Silu, Qwen/Llama) vs GEGLU (GeluApprox, Gemma 4). Both are
+        // elementwise over the whole N·intermDim buffer, so the batched call is
+        // identical to the per-token one bar the activation.
+        if (_hp.FfnActivation == FfnActivation.GeluApprox)
+            _gpu.GeluTanhMul(_bpFfnGate!, _bpFfnUp!);
+        else
+            _gpu.SiLuMul(_bpFfnGate!, _bpFfnUp!);
+        GpuMatMulBatched(_bpHidden!, _wDown[layer], _bpFfnGate!, N);
+        if (_wPostFfwNorm is not null)
+            _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostFfwNorm[layer], N, _embDim, _hp.RmsNormEps);
+        _gpu.AddInPlace(_bpHidden!, _bpResidual!);
+
+        // PLE injection, batched: gate = inp_gate @ hidden; gelu * proj-slice;
+        // proj @; post-norm; add. proj-slice read with per-token stride via the
+        // strided gelu, so no gather of the [N × L*pleWidth] projection buffer.
+        if (_hp.HasPerLayerTokenEmbd)
         {
-            int layerCtx = isSwa && window > 0 ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen;
+            GpuMatMulBatched(_bpPleGate!, _gpuInpGate![layer], _bpHidden!, N);
+            _gpu.GeluTanhMulStrided(_bpPleGate!, _bpProjAll!, _pleWidth,
+                (long)_hp.NumLayers * _pleWidth, (long)layer * _pleWidth, N);
+            GpuMatMulBatched(_bpPleY!, _gpuPleProj![layer], _bpPleGate!, N);
+            _gpu.RmsNormBatched(_bpPleY!, _bpPleY!, _gpuPlePostNorm![layer], N, _embDim, _hp.RmsNormEps);
+            _gpu.AddInPlace(_bpHidden!, _bpPleY!);
+        }
+
+        if (_layerOutputScale is not null)
+            _gpu.ScaleInPlace(_bpHidden!, _layerOutputScale[layer]);
+
+        _gpu.Free(qAll); _gpu.Free(kAll); _gpu.Free(vAll); _gpu.Free(attnAll);
+    }
+
+    /// <summary>
+    /// Shared KV-append + attention dispatch for a contiguous span of <paramref name="nTok"/>
+    /// query rows at absolute positions [<paramref name="startPos"/>, startPos+nTok). When
+    /// <paramref name="kAppend"/>/<paramref name="vAppend"/> are non-null the rows' K/V are
+    /// appended into that ring (skipped for Gemma-4 shared-KV layers, which reuse the source
+    /// layer's cache); the queries then attend against <paramref name="kAttn"/>/<paramref
+    /// name="vAttn"/> over [0, startPos+i] (causal, SWA-windowed when <paramref name="isSwa"/>).
+    /// Factored out of <see cref="GpuLayerBatchedTrunk"/> so the packed multi-prompt prefill
+    /// (issue #193) drives the IDENTICAL dispatch per sub-sequence — guaranteeing it is
+    /// argmax-stable with the single-sequence batched trunk it amortizes across prompts.
+    /// </summary>
+    private void GpuPrefillAppendAttention(
+        Tensor qAll, Tensor kAll, Tensor vAll, Tensor attnAll,
+        Tensor? kAppend, Tensor? vAppend, Tensor kAttn, Tensor vAttn,
+        int numHeads, int layerKv, int layerHd, int startPos, int nTok,
+        bool isSwa, int window, int appendCtx, int effLayerCtx)
+    {
+        if (kAppend is not null && vAppend is not null)
+        {
+            int kvDimL = layerKv * layerHd;
             if (_kvDType == DType.BFloat16)
-                _gpu.KvAppendBatchedBf16(kAll, vAll, _gpuKCache[layer], _gpuVCache[layer], kvDimL, startPos, layerCtx, N);
+                _gpu.KvAppendBatchedBf16(kAll, vAll, kAppend, vAppend, kvDimL, startPos, appendCtx, nTok);
             else if (_kvDType == DType.Q8_0)
-                _gpu.KvAppendBatchedQ8_0(kAll, vAll, _gpuKCache[layer], _gpuVCache[layer], kvDimL, startPos, layerCtx, N);
+                _gpu.KvAppendBatchedQ8_0(kAll, vAll, kAppend, vAppend, kvDimL, startPos, appendCtx, nTok);
             else
-                _gpu.KvAppendBatched(kAll, vAll, _gpuKCache[layer], _gpuVCache[layer], kvDimL, startPos, layerCtx, N);
+                _gpu.KvAppendBatched(kAll, vAll, kAppend, vAppend, kvDimL, startPos, appendCtx, nTok);
         }
 
-        int effLayerCtx = (_hp.IsSwaLayer is { } swaEff && swaEff[effLayer] && window > 0)
-            ? SwaRingSize(_maxSeqLen, window) : _maxSeqLen;
-
         if (s_prefillProfile) { _gpu.Synchronize(); _profSw.Restart(); }
         // Gemma 4: attention_scale = 1.0, passed explicitly (kernel skips its rsqrtf).
         // Other models pass _attnScale = -1 so the kernel derives 1/sqrt(head_dim).
@@ -3050,47 +3118,120 @@ void ApplyRopeBatched()
             // have no narrowed thunk yet — a trivial follow-up only a non-%64 head_dim model
             // past 4096 would need.
             if (PrefillFlashTcEnabled && !_forceFlashTc1 && (layerHd & 63) == 0)
-                _gpu.FlashAttentionPrefillTc2(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                    _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N,
+                _gpu.FlashAttentionPrefillTc2(qAll, kAttn, vAttn, attnAll,
+                    numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, nTok,
                     attnScale: _attnScale, kvCacheType: _kvDType);
             else if (isSwa && _kvDType == DType.BFloat16)
-                _gpu.AttentionSwaBatchedBf16(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                    _numHeads, layerKv, layerHd, startPos, window, effLayerCtx, N, attnScale: _attnScale);
+                _gpu.AttentionSwaBatchedBf16(qAll, kAttn, vAttn, attnAll,
+                    numHeads, layerKv, layerHd, startPos, window, effLayerCtx, nTok, attnScale: _attnScale);
             else if (isSwa)
-                _gpu.AttentionSwaBatchedQ8_0(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                    _numHeads, layerKv, layerHd, startPos, window, effLayerCtx, N, attnScale: _attnScale);
+                _gpu.AttentionSwaBatchedQ8_0(qAll, kAttn, vAttn, attnAll,
+                    numHeads, layerKv, layerHd, startPos, window, effLayerCtx, nTok, attnScale: _attnScale);
             else if (_kvDType == DType.BFloat16)
-                _gpu.AttentionBatchedBf16(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                    _numHeads, layerKv, layerHd, startPos, effLayerCtx, N, attnScale: _attnScale);
+                _gpu.AttentionBatchedBf16(qAll, kAttn, vAttn, attnAll,
+                    numHeads, layerKv, layerHd, startPos, effLayerCtx, nTok, attnScale: _attnScale);
             else
-                _gpu.AttentionBatchedQ8_0(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                    _numHeads, layerKv, layerHd, startPos, effLayerCtx, N, attnScale: _attnScale);
+                _gpu.AttentionBatchedQ8_0(qAll, kAttn, vAttn, attnAll,
+                    numHeads, layerKv, layerHd, startPos, effLayerCtx, nTok, attnScale: _attnScale);
         }
         else if (PrefillFlashTcEnabled && (layerHd & 15) == 0)
         {
             // #147 multi-warp/d-split when head_dim is a multiple of 64 (W·16); else the
             // #146 single-warp kernel. SHARPI_PREFILL_FLASH_TC1=1 forces single-warp (A/B).
             if (!_forceFlashTc1 && (layerHd & 63) == 0)
-                _gpu.FlashAttentionPrefillTc2(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                    _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N, attnScale: _attnScale);
+                _gpu.FlashAttentionPrefillTc2(qAll, kAttn, vAttn, attnAll,
+                    numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, nTok, attnScale: _attnScale);
             else
-                _gpu.FlashAttentionPrefillTc(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                    _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N, attnScale: _attnScale);
+                _gpu.FlashAttentionPrefillTc(qAll, kAttn, vAttn, attnAll,
+                    numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, nTok, attnScale: _attnScale);
         }
         else if (PrefillFlashAttnEnabled)
-            _gpu.FlashAttentionPrefill(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                _numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, N, attnScale: _attnScale);
+            _gpu.FlashAttentionPrefill(qAll, kAttn, vAttn, attnAll,
+                numHeads, layerKv, layerHd, startPos, isSwa ? window : 0, effLayerCtx, nTok, attnScale: _attnScale);
         else if (isSwa)
-            _gpu.AttentionSwaBatched(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                _numHeads, layerKv, layerHd, startPos, window, effLayerCtx, N, attnScale: _attnScale);
+            _gpu.AttentionSwaBatched(qAll, kAttn, vAttn, attnAll,
+                numHeads, layerKv, layerHd, startPos, window, effLayerCtx, nTok, attnScale: _attnScale);
         else
-            _gpu.AttentionBatched(qAll, _gpuKCache[effLayer], _gpuVCache[effLayer], attnAll,
-                _numHeads, layerKv, layerHd, startPos, effLayerCtx, N, attnScale: _attnScale);
+            _gpu.AttentionBatched(qAll, kAttn, vAttn, attnAll,
+                numHeads, layerKv, layerHd, startPos, effLayerCtx, nTok, attnScale: _attnScale);
         if (s_prefillProfile) { _gpu.Synchronize(); _profAttnMs += _profSw.Elapsed.TotalMilliseconds; }
+    }
+
+    /// <summary>
+    /// One transformer layer of the PACKED multi-prompt prefill (issue #193): the dense subset
+    /// of <see cref="GpuLayerBatchedTrunk"/> with the attention block run per sub-sequence. The
+    /// RmsNorm / QKV / O / FFN GEMMs run batched over the full packed N (= Σ chunk_len, so each
+    /// weight read amortizes across all prompts); QK-norm / RoPE / KV-append / attention run per
+    /// sub-sequence on a slice of the packed Q/K/V buffers at that sequence's absolute startPos
+    /// against its own cache (cu_seqlens-style varlen — no cross-sequence attention, no padding).
+    /// Dense-only: the caller's <see cref="ThrowIfBatchingUnsupported"/> (via
+    /// <see cref="DenseBatchedDecodeSupported"/>) rejects every Gemma-4 / softcap model, so no
+    /// PLE / SWA / per-layer-head_dim / shared-KV / k_eq_v / sandwich-norm / layer_output_scale /
+    /// softcap path is reachable here. (<see cref="IsBatchedPrefillSupported"/> alone is NOT
+    /// enough — Gemma 4 satisfies it; <see cref="PrefillPackedTrunkMulti"/> asserts the stronger
+    /// contract.)
+    /// </summary>
+    private void GpuLayerPackedTrunk(int layer, int[] off, int[] startPos, CudaSequenceKvCache[] caches, int S, int N)
+    {
+        int qDim = _numHeads * _headDim;
+        int kvDim = _numKvHeads * _headDim;
+
+        var qAll = _gpu.View(_bpQ!, 0, (long)N * qDim);
+        var kAll = _gpu.View(_bpK!, 0, (long)N * kvDim);
+        var vAll = _gpu.View(_bpV!, 0, (long)N * kvDim);
+        var attnAll = _gpu.View(_bpAttnOut!, 0, (long)N * qDim);
+
+        _gpu.CopyDevice(_bpResidual!, _bpHidden!);
+        _gpu.RmsNormBatched(_bpNorm!, _bpHidden!, _wAttnNorm[layer], N, _embDim, _hp.RmsNormEps);
+
+        GpuMatMulBatched(qAll, _wq[layer], _bpNorm!, N);
+        GpuMatMulBatched(kAll, _wk[layer], _bpNorm!, N);
+        GpuMatMulBatched(vAll, _wv[layer]!, _bpNorm!, N);
+
+        bool useRoPE = _hp.NoRopeLayerStep == 0 || (layer + 1) % _hp.NoRopeLayerStep != 0;
+        float ropeTheta = _hp.RopeTheta;
+
+        // Per sub-sequence: QK-norm before RoPE (#157), then KV-append + varlen attention into
+        // that sequence's own cache via the shared dispatch. Each op acts on the chunk's slice
+        // of the packed buffers at its absolute startPos[s].
+        for (int s = 0; s < S; s++)
+        {
+            int len = off[s + 1] - off[s];
+            var qS = _gpu.View(_bpQ!, (long)off[s] * qDim, (long)len * qDim);
+            var kS = _gpu.View(_bpK!, (long)off[s] * kvDim, (long)len * kvDim);
+            var vS = _gpu.View(_bpV!, (long)off[s] * kvDim, (long)len * kvDim);
+            var aS = _gpu.View(_bpAttnOut!, (long)off[s] * qDim, (long)len * qDim);
+            try
+            {
+                if (_hasQkNorm && !_hp.UseL2QkNorm)
+                    _gpu.HeadNormQkBatched(qS, _wqNorm![layer], kS, _wkNorm![layer],
+                        _numHeads, _numKvHeads, _headDim, len, _hp.RmsNormEps, _hp.IsPerChannelQkNorm);
+                if (useRoPE)
+                {
+                    if (_gpuRopeFreqs is { } rfTbl)
+                    {
+                        _gpu.RoPEWithFactorsBatched(qS, startPos[s], _headDim, ropeTheta, rfTbl, _numHeads, len);
+                        _gpu.RoPEWithFactorsBatched(kS, startPos[s], _headDim, ropeTheta, rfTbl, _numKvHeads, len);
+                    }
+                    else
+                    {
+                        _gpu.RoPEPartialBatched(qS, startPos[s], _headDim, _headDim, ropeTheta, _numHeads, len, neox: true);
+                        _gpu.RoPEPartialBatched(kS, startPos[s], _headDim, _headDim, ropeTheta, _numKvHeads, len, neox: true);
+                    }
+                }
+                var kc = caches[s].K[layer];
+                var vc = caches[s].V[layer];
+                GpuPrefillAppendAttention(qS, kS, vS, aS, kc, vc, kc, vc,
+                    _numHeads, _numKvHeads, _headDim, startPos[s], len,
+                    isSwa: false, window: 0, appendCtx: _maxSeqLen, effLayerCtx: _maxSeqLen);
+            }
+            finally
+            {
+                _gpu.Free(qS); _gpu.Free(kS); _gpu.Free(vS); _gpu.Free(aS);
+            }
+        }
 
         GpuMatMulBatched(_bpHidden!, _wo[layer], attnAll, N);
-        if (_wPostAttnNorm is not null)
-            _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostAttnNorm[layer], N, _embDim, _hp.RmsNormEps);
         _gpu.AddInPlace(_bpHidden!, _bpResidual!);
 
         // FFN.
@@ -3098,35 +3239,118 @@ void ApplyRopeBatched()
         _gpu.RmsNormBatched(_bpNorm!, _bpHidden!, _wFfnNorm[layer], N, _embDim, _hp.RmsNormEps);
         GpuMatMulBatched(_bpFfnGate!, _wGate[layer], _bpNorm!, N);
         GpuMatMulBatched(_bpFfnUp!,   _wUp[layer],   _bpNorm!, N);
-        // SwiGLU (Silu, Qwen/Llama) vs GEGLU (GeluApprox, Gemma 4). Both are
-        // elementwise over the whole N·intermDim buffer, so the batched call is
-        // identical to the per-token one bar the activation.
         if (_hp.FfnActivation == FfnActivation.GeluApprox)
             _gpu.GeluTanhMul(_bpFfnGate!, _bpFfnUp!);
         else
             _gpu.SiLuMul(_bpFfnGate!, _bpFfnUp!);
         GpuMatMulBatched(_bpHidden!, _wDown[layer], _bpFfnGate!, N);
-        if (_wPostFfwNorm is not null)
-            _gpu.RmsNormBatched(_bpHidden!, _bpHidden!, _wPostFfwNorm[layer], N, _embDim, _hp.RmsNormEps);
         _gpu.AddInPlace(_bpHidden!, _bpResidual!);
 
-        // PLE injection, batched: gate = inp_gate @ hidden; gelu * proj-slice;
-        // proj @; post-norm; add. proj-slice read with per-token stride via the
-        // strided gelu, so no gather of the [N × L*pleWidth] projection buffer.
-        if (_hp.HasPerLayerTokenEmbd)
+        _gpu.Free(qAll); _gpu.Free(kAll); _gpu.Free(vAll); _gpu.Free(attnAll);
+    }
+
+    /// <summary>
+    /// True packed multi-prompt prefill (issue #193): concatenate the S chunks token-major into
+    /// the trunk scratch and run one forward pass over N = Σ chunk_len so every trunk GEMM (not the
+    /// per-sequence lm-head) amortizes its weight read across all prompts (mirrors the CPU
+    /// <see cref="ForwardPass.PrefillPackedMulti"/> and the single-sequence
+    /// <see cref="PrefillBatchedTrunk"/>). Per-sequence RoPE / QK-norm / KV-append / varlen
+    /// attention keep each chunk attending only to its own cache. Advances each cache's logical
+    /// length and returns the final-token logits per sequence where <paramref name="wantLogits"/>
+    /// (the chunk completes that prompt), null otherwise. Dense-only; gated by the caller.
+    /// </summary>
+    private float[]?[] PrefillPackedTrunkMulti(
+        ReadOnlyMemory<int>[] chunks, int[] startPos, CudaSequenceKvCache[] caches, bool[] wantLogits)
+    {
+        // Defense-in-depth: the packed trunk is dense-only — it omits every Gemma-4 step (PLE,
+        // SWA rings, per-layer head_dim, shared-KV aliasing, k_eq_v, sandwich norms,
+        // layer_output_scale) and the final-logit softcap. The caller's
+        // ThrowIfBatchingUnsupported (via DenseBatchedDecodeSupported) already rejects all of
+        // these, so this is unreachable in production; assert it loudly rather than silently
+        // emitting garbage if a future caller skips that guard or relaxes it for another batched
+        // feature. IsBatchedPrefillSupported alone is NOT sufficient — Gemma 4 satisfies it.
+        if (_isGemma4Like || _hp.HasPerLayerTokenEmbd || _hp.LayerHeadDim is not null
+            || _hp.SlidingWindowSize > 0 || _hp.KvSourceLayer is not null || _hp.FinalLogitSoftcap > 0f)
+            throw new InvalidOperationException(
+                "PrefillPackedTrunkMulti is dense-only; this model requires the Gemma-4 / softcap " +
+                "path that ThrowIfBatchingUnsupported should have rejected before reaching here.");
+
+        int S = chunks.Length;
+        var off = new int[S + 1];
+        for (int s = 0; s < S; s++)
+            off[s + 1] = off[s] + chunks[s].Length;
+        int N = off[S];
+        int embDim = _embDim;
+
+        EnsureBatchedTrunkScratch(N);
+
+        // 1. Embed every token token-major into _bpHidden (sequence s's row i at off[s]+i).
+        //    EmbedTokenGpu is the same lookup the single-user prefill uses, so this is
+        //    bit-identical to prefilling each chunk on its own.
+        for (int s = 0; s < S; s++)
         {
-            GpuMatMulBatched(_bpPleGate!, _gpuInpGate![layer], _bpHidden!, N);
-            _gpu.GeluTanhMulStrided(_bpPleGate!, _bpProjAll!, _pleWidth,
-                (long)_hp.NumLayers * _pleWidth, (long)layer * _pleWidth, N);
-            GpuMatMulBatched(_bpPleY!, _gpuPleProj![layer], _bpPleGate!, N);
-            _gpu.RmsNormBatched(_bpPleY!, _bpPleY!, _gpuPlePostNorm![layer], N, _embDim, _hp.RmsNormEps);
-            _gpu.AddInPlace(_bpHidden!, _bpPleY!);
+            var span = chunks[s].Span;
+            for (int i = 0; i < span.Length; i++)
+            {
+                EmbedTokenGpu(span[i]);   // writes _hidden
+                _gpu.CopyDeviceRegion(_bpHidden!, (long)(off[s] + i) * embDim * sizeof(float),
+                                      _hidden, 0, (long)embDim * sizeof(float));
+            }
         }
+        if (_hp.EmbeddingScale != 1f)
+            _gpu.ScaleInPlace(_bpHidden!, _hp.EmbeddingScale);
 
-        if (_layerOutputScale is not null)
-            _gpu.ScaleInPlace(_bpHidden!, _layerOutputScale[layer]);
+        // 2. Transformer layers: batched GEMMs over N, per-sequence attention.
+        for (int layer = 0; layer < _hp.NumLayers; layer++)
+            GpuLayerPackedTrunk(layer, off, startPos, caches, S, N);
 
-        _gpu.Free(qAll); _gpu.Free(kAll); _gpu.Free(vAll); _gpu.Free(attnAll);
+        // 3. Per-sequence final norm + output projection on each completed prompt's last token.
+        //    The output GEMM is per-sequence (last rows are at scattered offsets), but it runs
+        //    only for the prompts whose final chunk landed this pass, so it's at most S small
+        //    matvecs against the already-resident lm-head weights.
+        var result = new float[]?[S];
+        int vocab = _hp.VocabSize;
+        for (int s = 0; s < S; s++)
+        {
+            // Lengths advance only here, after the whole layer loop — a throw in the layer loop
+            // leaves every cache length untouched. A throw within THIS step-3 loop can leave
+            // earlier sequences advanced and later ones not, but that is safe: the only caller
+            // (ContinuousBatchingEngine.RunPrefillStep) fails and disposes EVERY involved cache
+            // on any throw, so a partially-committed batch is never observed.
+            caches[s].Length = startPos[s] + (off[s + 1] - off[s]);
+            if (!wantLogits[s]) continue;
+            var lastHidden = _gpu.View(_bpHidden!, (long)(off[s + 1] - 1) * embDim, embDim);
+            _gpu.RmsNorm(_hidden, lastHidden, _wOutputNorm, _hp.RmsNormEps);
+            _gpu.Free(lastHidden);
+            GpuMatMul(_logits, _wOutput, _hidden);
+            // No softcap: ThrowIfBatchingUnsupported (via DenseBatchedDecodeSupported) rejects
+            // any FinalLogitSoftcap model before this method is reached, and the dense-only
+            // assertion at the top re-checks it.
+            _gpu.Download(_logits, _logitsBuf);
+            _gpu.Synchronize();
+            result[s] = _logitsBuf.AsSpan(0, vocab).ToArray();
+        }
+        return result;
+    }
+
+    /// <summary>
+    /// Whether the packed multi-prompt prefill can run: every sub-sequence's attention range
+    /// [0, startPos+len) must fit the active prefill attention kernel's cap. The non-flash
+    /// shared-scores AttentionBatched kernel throws above startPos+len=4096; the streaming flash
+    /// kernels have no cap when they cover all layers. Mirrors the single-sequence
+    /// <see cref="Prefill"/> gate (issue #162); dense models have no SWA, so canChunkPast4096
+    /// reduces to "flash covers all layers".
+    /// </summary>
+    private bool AllChunksPackable(ReadOnlyMemory<int>[] chunks, int[] startPos)
+    {
+        bool flashCoversAll = _kvDType is DType.BFloat16 or DType.Q8_0
+            ? NarrowedFlashTc2CoversAllLayers()
+            : PrefillFlashAttnEnabled;
+        bool canChunkPast4096 = flashCoversAll && (_hp.IsSwaLayer is not null || _hp.SlidingWindowSize <= 0);
+        int cap = canChunkPast4096 ? _maxSeqLen : 4096;
+        for (int s = 0; s < chunks.Length; s++)
+            if (startPos[s] + chunks[s].Length > cap) return false;
+        return true;
     }
 
     /// <summary>
@@ -3496,10 +3720,15 @@ internal ReadOnlySpan<float> PrefillWithCache(IReadOnlyList<int> tokens, CudaSeq
     }
 
     /// <summary>
-    /// Prefill several pending sequences' chunks. Cross-prompt packing into one forward pass
-    /// is a follow-up (issue #190); for now each chunk prefills sequentially into its own
-    /// per-sequence cache via <see cref="PrefillWithCache"/> — still correct and still
-    /// amortizing the batched-trunk GEMMs within each chunk, just not across prompts.
+    /// Prefill several pending sequences' chunks in ONE packed forward pass (issue #193):
+    /// the S chunks are concatenated token-major and run through the trunk over N = Σ chunk_len
+    /// so every weight read amortizes across all prompts (like <see cref="BatchForwardMulti"/>
+    /// does for decode), with per-sequence varlen attention into each chunk's own cache. Falls
+    /// back to the sequential per-sequence <see cref="PrefillWithCache"/> loop (still correct,
+    /// just not cross-prompt amortized) when the batched trunk can't run this model
+    /// (<see cref="IsBatchedPrefillSupported"/> — e.g. attn bias / L2 QK-norm / non-NEOX RoPE),
+    /// when any sub-sequence's attention range exceeds the kernel cap
+    /// (<see cref="AllChunksPackable"/>), or for a single chunk (nothing to amortize across).
     /// </summary>
     internal float[]?[] PrefillPackedMulti(
         ReadOnlyMemory<int>[] chunks, int[] startPos, CudaSequenceKvCache[] caches, bool[] wantLogits)
@@ -3513,6 +3742,18 @@ internal ReadOnlySpan<float> PrefillWithCache(IReadOnlyList<int> tokens, CudaSeq
         if (S == 0) return Array.Empty<float[]?>();
         if (startPos.Length != S || caches.Length != S || wantLogits.Length != S)
             throw new ArgumentException("chunks/startPos/caches/wantLogits lengths must match.");
+        for (int s = 0; s < S; s++)
+        {
+            if (chunks[s].IsEmpty)
+                throw new ArgumentException($"Chunk for sequence {s} is empty.", nameof(chunks));
+            if (startPos[s] < 0 || (long)startPos[s] + chunks[s].Length > _maxSeqLen)
+                throw new ArgumentException(
+                    $"Sequence {s}: startPos {startPos[s]} + chunk {chunks[s].Length} exceeds maxSeqLen {_maxSeqLen}.",
+                    nameof(startPos));
+        }
+
+        if (S >= 2 && IsBatchedPrefillSupported() && AllChunksPackable(chunks, startPos))
+            return PrefillPackedTrunkMulti(chunks, startPos, caches, wantLogits);
 
         var result = new float[]?[S];
         for (int s = 0; s < S; s++)
diff --git a/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs b/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs
index 47d19c7..b178191 100644
--- a/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs
+++ b/tests/SharpInference.Tests.ForwardPass/CudaBatchForwardMultiTests.cs
@@ -504,6 +504,178 @@ public void Qwen3_8B_PrefillWithCache_Chunked_MatchesFull()
         Assert.True(maxAbs < 1.0f, $"Chunked-vs-full prefill maxAbs={maxAbs}.");
     }
 
+    /// <summary>
+    /// Issue #193: <see cref="CudaForwardPass.PrefillPackedMulti"/> packs two prompts into ONE
+    /// forward pass (weights amortized across both). Each sequence's final-token logits must
+    /// reproduce the per-sequence <see cref="CudaForwardPass.PrefillWithCache"/> oracle (the
+    /// sequential path #193 replaces) — same kernels via the shared dispatch, only the trunk GEMMs
+    /// batch across prompts — within tolerance: equal argmax, top-5 overlap of 4+, maxAbs under 1.0.
+    /// </summary>
+    [Fact]
+    public void Qwen3_8B_PrefillPackedMulti_N2_MatchesSequential()
+    {
+        using var gpu = TryCreate();
+        if (gpu is null) return; // TryCreate gave no device: leave without asserting anything.
+        var path = FindModelPath();
+        if (path is null) return; // Likewise when no model file is found.
+
+        using var model = GgufModel.Open(path);
+        var hp = ModelHyperparams.FromGgufMetadata(model.Metadata, model);
+        Assert.Null(hp.LayerHeadDim); // NOTE(review): presumably pins a model without per-layer head dims — confirm.
+        using var fwd = NewFwd(model, gpu, hp);
+
+        // Sequential reference: each prompt prefilled on its own cache (the per-sequence loop).
+        using var refA = fwd.CreateCache();
+        using var refB = fwd.CreateCache();
+        float[] seqA = fwd.PrefillWithCache(PromptA, refA).ToArray(); // copy the returned span before the next call
+        float[] seqB = fwd.PrefillWithCache(PromptB, refB).ToArray();
+
+        // Packed: both prompts go through a single PrefillPackedMulti call, each with a fresh cache.
+        using var packA = fwd.CreateCache();
+        using var packB = fwd.CreateCache();
+        float[]?[] packed = fwd.PrefillPackedMulti(
+            [PromptA, PromptB], // chunks: each prompt passed whole
+            [0, 0], // startPos: both start at position 0
+            [packA, packB], // caches: one per sequence
+            [true, true]); // wantLogits: logits requested for both
+
+        Assert.Equal(2, packed.Length);
+        Assert.NotNull(packed[0]);
+        Assert.NotNull(packed[1]);
+        Assert.Equal(PromptA.Length, packA.Length); // each cache Length must equal its own prompt's token count
+        Assert.Equal(PromptB.Length, packB.Length);
+
+        var (maxAbsA, overlapA) = Compare(seqA, packed[0]!); // sequential vs packed logits for A
+        Assert.Equal(Argmax(seqA), Argmax(packed[0]!));
+        Assert.True(overlapA >= 4, $"Seq A packed top-5 overlap {overlapA}/5 (maxAbs={maxAbsA}).");
+        Assert.True(maxAbsA < 1.0f, $"Seq A packed vs sequential maxAbs={maxAbsA}.");
+
+        var (maxAbsB, overlapB) = Compare(seqB, packed[1]!); // same three checks for B
+        Assert.Equal(Argmax(seqB), Argmax(packed[1]!));
+        Assert.True(overlapB >= 4, $"Seq B packed top-5 overlap {overlapB}/5 (maxAbs={maxAbsB}).");
+        Assert.True(maxAbsB < 1.0f, $"Seq B packed vs sequential maxAbs={maxAbsB}.");
+    }
+
+    /// <summary>
+    /// Issue #193: chunked packed prefill — two prompts of different lengths (14 and 10 tokens)
+    /// advance together in 4-token steps (mirroring the engine's <c>RunPrefillStep</c>), and only each
+    /// prompt's final chunk requests logits. The result must match a single whole-prompt prefill,
+    /// validating that later chunks read prior chunks' KV correctly AND that a step left with one
+    /// prompt (the shorter one finished) falls back through the S&lt;2 sequential path cleanly.
+    /// </summary>
+    [Fact]
+    public void Qwen3_8B_PrefillPackedMulti_Chunked_MatchesWhole()
+    {
+        using var gpu = TryCreate();
+        if (gpu is null) return; // TryCreate gave no device: leave without asserting anything.
+        var path = FindModelPath();
+        if (path is null) return; // Likewise when no model file is found.
+
+        using var model = GgufModel.Open(path);
+        var hp = ModelHyperparams.FromGgufMetadata(model.Metadata, model);
+        Assert.Null(hp.LayerHeadDim); // NOTE(review): presumably pins a model without per-layer head dims — confirm.
+        using var fwd = NewFwd(model, gpu, hp);
+
+        int[] longA = { 9707, 11, 1879, 0, 358, 1079, 264, 4108, 1614, 13, 220, 17, 18, 19 }; // 14 tokens
+        int[] longB = { 1079, 264, 4108, 1614, 13, 220, 17, 9707, 11, 1879 }; // 10 tokens
+
+        using var refA = fwd.CreateCache();
+        using var refB = fwd.CreateCache();
+        float[] wholeA = fwd.PrefillWithCache(longA, refA).ToArray(); // whole-prompt reference logits
+        float[] wholeB = fwd.PrefillWithCache(longB, refB).ToArray();
+
+        using var packA = fwd.CreateCache();
+        using var packB = fwd.CreateCache();
+        int cA = 0, cB = 0; // tokens of A / B already submitted
+        const int chunk = 4; // steps 1-3 carry both prompts (B's last chunk is 2 tokens); step 4 carries A's 2-token tail alone
+        float[]? finalA = null, finalB = null;
+        while (cA < longA.Length || cB < longB.Length)
+        {
+            var chunks = new List<ReadOnlyMemory<int>>(); // the four argument lists are built in lockstep, one entry per active prompt
+            var starts = new List<int>();
+            var caches = new List<CudaSequenceKvCache>();
+            var wants = new List<bool>();
+            var which = new List<int>(); // slot owner for routing results: 0 = A, 1 = B
+            if (cA < longA.Length)
+            {
+                int take = Math.Min(chunk, longA.Length - cA);
+                chunks.Add(longA.AsMemory(cA, take)); starts.Add(cA); caches.Add(packA); // chunk starts at A's current offset
+                wants.Add(cA + take == longA.Length); which.Add(0); cA += take; // flag is computed before the cursor advances
+            }
+            if (cB < longB.Length)
+            {
+                int take = Math.Min(chunk, longB.Length - cB);
+                chunks.Add(longB.AsMemory(cB, take)); starts.Add(cB); caches.Add(packB); // chunk starts at B's current offset
+                wants.Add(cB + take == longB.Length); which.Add(1); cB += take; // logits only on B's final chunk
+            }
+            var res = fwd.PrefillPackedMulti(chunks.ToArray(), starts.ToArray(), caches.ToArray(), wants.ToArray());
+            for (int i = 0; i < which.Count; i++)
+                if (res[i] is { } lg) { if (which[i] == 0) finalA = lg; else finalB = lg; } // keep any non-null result, routed by slot owner
+        }
+
+        Assert.Equal(longA.Length, packA.Length); // every chunk must have landed in its own cache
+        Assert.Equal(longB.Length, packB.Length);
+        Assert.NotNull(finalA);
+        Assert.NotNull(finalB);
+
+        Assert.Equal(Argmax(wholeA), Argmax(finalA!));
+        var (maxAbsA, overlapA) = Compare(wholeA, finalA!); // whole-prompt vs chunked logits for A
+        Assert.True(overlapA >= 4, $"Chunked A top-5 overlap {overlapA}/5 (maxAbs={maxAbsA}).");
+        Assert.True(maxAbsA < 1.0f, $"Chunked A maxAbs={maxAbsA}.");
+
+        Assert.Equal(Argmax(wholeB), Argmax(finalB!));
+        var (maxAbsB, overlapB) = Compare(wholeB, finalB!); // same three checks for B
+        Assert.True(overlapB >= 4, $"Chunked B top-5 overlap {overlapB}/5 (maxAbs={maxAbsB}).");
+        Assert.True(maxAbsB < 1.0f, $"Chunked B maxAbs={maxAbsB}.");
+    }
+
+    /// <summary>
+    /// Issue #193 end-to-end: packed prefill must leave each per-sequence cache in the exact
+    /// state a batched decode step needs. Pack two prompts, then run one
+    /// <see cref="CudaForwardPass.BatchForwardMulti"/> decode step and require each sequence to
+    /// reproduce the single-user prefill+decode next-token logits (equal argmax, top-5 overlap of
+    /// 4+, maxAbs under 1.0) — the real engine path (packed admission → batched decode).
+    /// </summary>
+    [Fact]
+    public void Qwen3_8B_PrefillPackedMulti_ThenBatchedDecode_MatchesSingleUser()
+    {
+        using var gpu = TryCreate();
+        if (gpu is null) return; // TryCreate gave no device: leave without asserting anything.
+        var path = FindModelPath();
+        if (path is null) return; // Likewise when no model file is found.
+
+        using var model = GgufModel.Open(path);
+        var hp = ModelHyperparams.FromGgufMetadata(model.Metadata, model);
+        Assert.Null(hp.LayerHeadDim); // NOTE(review): presumably pins a model without per-layer head dims — confirm.
+        using var fwd = NewFwd(model, gpu, hp);
+
+        // Single-user reference: prefill, greedy token, one decode step.
+        fwd.ResetCache(); // NOTE(review): presumably clears the cache Prefill/Forward use internally — confirm.
+        int tokA = Argmax(fwd.Prefill(PromptA)); // greedy next token after prompt A
+        float[] refA = fwd.Forward(tokA, PromptA.Length).ToArray(); // second argument presumably the decode position — confirm
+        fwd.ResetCache(); // reset again so prompt B's reference run starts clean
+        int tokB = Argmax(fwd.Prefill(PromptB));
+        float[] refB = fwd.Forward(tokB, PromptB.Length).ToArray();
+
+        using var packA = fwd.CreateCache();
+        using var packB = fwd.CreateCache();
+        var packed = fwd.PrefillPackedMulti([PromptA, PromptB], [0, 0], [packA, packB], [true, true]);
+        Assert.Equal(tokA, Argmax(packed[0]!)); // packed prefill must pick the same greedy token as the single-user prefill
+        Assert.Equal(tokB, Argmax(packed[1]!));
+
+        var batch = fwd.BatchForwardMulti([tokA, tokB], [PromptA.Length, PromptB.Length], [packA, packB]); // one decode step over both packed caches
+
+        var (maxAbsA, overlapA) = Compare(refA, batch[0]); // single-user vs packed-then-batched logits for A
+        Assert.Equal(Argmax(refA), Argmax(batch[0]));
+        Assert.True(overlapA >= 4, $"Seq A packed→decode top-5 overlap {overlapA}/5 (maxAbs={maxAbsA}).");
+        Assert.True(maxAbsA < 1.0f, $"Seq A packed→decode maxAbs={maxAbsA}.");
+
+        var (maxAbsB, overlapB) = Compare(refB, batch[1]); // same three checks for B
+        Assert.Equal(Argmax(refB), Argmax(batch[1]));
+        Assert.True(overlapB >= 4, $"Seq B packed→decode top-5 overlap {overlapB}/5 (maxAbs={maxAbsB}).");
+        Assert.True(maxAbsB < 1.0f, $"Seq B packed→decode maxAbs={maxAbsB}.");
+    }
+
     /// <summary>Empty token list and empty batch are rejected / no-op, matching the CPU path.</summary>
     [Fact]
     public void Qwen3_8B_BatchForwardMulti_EmptyBatch_ReturnsEmpty()