pekkah · pekkah · Jun 17, 2026 · Jun 17, 2026
diff --git a/README.md b/README.md
@@ -167,7 +167,7 @@ sharpi-cli -m models/Qwen3.6-27B-MTP-Q4_K_M.gguf -g -1 \
 
 | Backend | Size | Prefill t/s | Decode t/s | Notes |
 |---|---:|---:|---:|---|
-| **CUDA** `-g -1 --no-thinking` (hybrid) | 16 GB | **10.0** | **10.4** | 22/64 dense FFN on GPU + GDN/attn KV resident, rest CPU mmap. 90% acceptance; folded k-token batched verify + GDN snapshot ring — **1.68× over MTP-off (6.4)** |
+| **CUDA** `-g -1 --no-thinking` (hybrid) | 16 GB | **10.0** | **12.3** | GDN/attn KV resident on GPU, dense FFN on CPU mmap (the k=4 ring reclaims the VRAM the old k=2 default spent on 22 GPU FFN layers). 84% acceptance; 4-input CPU-FFN `MatVec4In` (#209) moves the verify optimum from k=2 → k=4 — **1.9× over MTP-off (6.5)**, +22% over the old k=2 default (10.1) |
 | **CUDA** `-g -1 --no-thinking` `Q5_K_M` (hybrid) | 19 GB | 6.2 | **5.5** | 13/64 FFN on GPU, 51/64 CPU mmap. 98% acceptance; batched trunk (#119) bit-identical |
 | CPU `--no-thinking` | 16 GB | 3.0 | **3.6** | dense 27B GDN/attn + native MTP head; auto MTP self-spec (#25) at greedy + `--no-thinking`. 90% draft acceptance; folded k-token batched verify (#30/#207) — 1.2× over MTP-off (3.0) |
 | CPU `--no-thinking` `Q5_K_M` | 19 GB | 2.8 | **3.5** | ~10% slower than Q4_K_M; 100% acceptance |

diff --git a/scripts/bench-27b-mtp.ps1 b/scripts/bench-27b-mtp.ps1
@@ -1,6 +1,12 @@
 # One-shot bench harness for Qwen3.6-27B-MTP (issue #28). Runs both MTP-on (default)
 # and MTP-off (SHARPI_DISABLE_MTP=1) on CPU and CUDA-hybrid for one or more quants,
 # so the README can quote both numbers and the no-speedup gap is visible.
+#
+# Issue #209: the MTP-on rows now exercise the k=4 verify batch (the new default:
+# SHARPI_MTP_BATCH_MAX=4 / SHARPI_MTP_DRAFT_N=3) — the 4-input CPU-FFN MatVec4In
+# amortizes the dominant CPU mmap weight read four ways, moving the optimum out from
+# the old pairwise k=2. Measured Q4_K_M CUDA-hybrid: k=4 12.3 vs k=2 10.1 vs k=6 10.4
+# vs MTP-off 6.5 t/s. Set SHARPI_MTP_BATCH_MAX / SHARPI_MTP_DRAFT_N to sweep other k.
 param(
     [string[]]$Quants = @("Q4_K_M", "Q5_K_M"),
     [int]$NTokens = 80,

diff --git a/src/SharpInference.Core/IForwardPass.cs b/src/SharpInference.Core/IForwardPass.cs
@@ -271,20 +271,13 @@ float[][] BatchVerify(int[] tokens, int startPos) =>
     /// </summary>
     int MaxBatchVerifyTokens => int.MaxValue;
 
-    /// <summary>
-    /// Last completed <see cref="BatchForward2"/>'s token-1 pre-output-norm hidden.
-    /// Used by the MTP commit step on the batched verify path. Empty when no batched
-    /// forward has been run.
-    /// </summary>
-    ReadOnlySpan<float> LastHiddenT1 => default;
-
     /// <summary>
     /// Two-token batched forward (issue #30). On entry both caches must be at length
     /// <paramref name="startPos"/>. On return both caches are at length
-    /// <c>startPos + 2</c>, <see cref="LastHidden"/> holds h@startPos+1, and
-    /// <see cref="LastHiddenT1"/> holds h@startPos. A per-layer GDN snapshot is
-    /// captured at the "between t1 and t2" point so a rejected draft can be rolled
-    /// back via <see cref="RestoreBatchSnapshot"/>.
+    /// <c>startPos + 2</c> and <see cref="LastHidden"/> holds h@startPos+1. A per-layer
+    /// GDN snapshot is captured at the "between t1 and t2" point so a rejected draft can
+    /// be rolled back via <see cref="RestoreBatchSnapshot"/>. Both tokens' pre-output-
+    /// norm hiddens are written to the MTP hidden history for later draft chaining.
     /// </summary>
     void BatchForward2(int t1, int t2, int startPos,
         out ReadOnlySpan<float> logits1, out ReadOnlySpan<float> logits2) =>

diff --git a/src/SharpInference.Cpu/SimdKernels.cs b/src/SharpInference.Cpu/SimdKernels.cs
diff --git a/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs b/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs
@@ -398,11 +398,16 @@ public sealed unsafe class CudaHybridGdnForwardPass : IForwardPass
     private readonly Tensor _gpuResidual2;        // [embDim]
     private readonly Tensor _gpuNormBuf2;         // [embDim]
     private readonly Tensor _gpuLogits2;          // [vocabSize]
-    private readonly Tensor _gpuLastHiddenT1;     // [embDim] — h@startPos for the MTP commit
+    // BatchForward2 (SHARPI_CPU_GDN=1 debug trunk) snapshots t1's pre-norm hidden into
+    // these so it can ride the queued DownloadAsync alongside t2's _gpuLastHidden, then
+    // copies it into the MTP hidden history. Internal scratch for that path only — the
+    // production k-token BatchVerify path writes the history straight from the device
+    // stream (the dead public LastHiddenT1 accessor was removed in issue #209).
+    private readonly Tensor _gpuLastHiddenT1;     // [embDim] — t1 hidden device snapshot
     private readonly float[] _logitsBuf2;         // host download for token 2 logits
     private readonly float* _cpuNormBuf2;         // [embDim] — t2's norm download for CPU FFN path
     private readonly float* _cpuMoeHidden2;       // [embDim] — t2's CPU FFN output
-    private readonly float* _lastHiddenT1;        // [embDim] — host span for the t1 hidden
+    private readonly float* _lastHiddenT1;        // [embDim] — pinned t1 hidden host target
 
     private byte* _batchSnapshotBuf;
     private long _batchSnapshotCap;
@@ -441,14 +446,22 @@ public sealed unsafe class CudaHybridGdnForwardPass : IForwardPass
 
     // Max tokens per BatchVerify call = ring slots + 1. Each slot costs ~149 MiB
     // of VRAM that TryUploadDenseFfnLayers would otherwise fill with ~2 dense FFN
-    // layers, hence the conservative default; deeper chains only pay once the CPU
-    // FFN amortizes more than pairwise (4-input MatVec follow-up). Instance-resolved
-    // at construction so tests can override per instance; the knob semantics live
-    // in one place (GdnStateCache.ResolveMtpBatchMax) shared with the CPU pass.
+    // layers, so the default (4 → 3 slots) is the smallest ring that reaches the
+    // measured k=4 optimum now that the 4-input CPU FFN kernel (issue #209) amortizes
+    // the dominant mmap weight read four ways. Instance-resolved at construction so
+    // tests can override per instance; the knob semantics live in one place
+    // (GdnStateCache.ResolveMtpBatchMax) shared with the CPU pass.
     private readonly int _mtpBatchMax = GdnStateCache.ResolveMtpBatchMax();
     // Token-2 host FFN scratch (intermediate gate/up post-MatVec2In, pre-SiLuMul).
     private readonly float* _cpuFfnGateBuf2;
     private readonly float* _cpuFfnUpBuf2;
+    // Lane-3/4 host FFN scratch (issue #209): CpuDenseFfn4 dots one CPU mmap weight
+    // read against four draft tokens via MatVec4In, so it needs four distinct
+    // gate/up scratch slabs (SiLU reads them per-lane before the down projection).
+    private readonly float* _cpuFfnGateBuf3;
+    private readonly float* _cpuFfnUpBuf3;
+    private readonly float* _cpuFfnGateBuf4;
+    private readonly float* _cpuFfnUpBuf4;
 
     // Host-side hidden history; see HybridGdnForwardPass field-level doc.
     private float* _mtpPrefillHiddens;     // [_mtpPrefillHiddensCap × embDim], slot p = h_p
@@ -1393,6 +1406,10 @@ void TraceVram(string label)
                 _gpuFfnUpBufDense2   = gpu.Allocate(TensorShape.D1(_intermDim));
                 _cpuFfnGateBuf2  = Alloc(_intermDim);
                 _cpuFfnUpBuf2    = Alloc(_intermDim);
+                _cpuFfnGateBuf3  = Alloc(_intermDim);
+                _cpuFfnUpBuf3    = Alloc(_intermDim);
+                _cpuFfnGateBuf4  = Alloc(_intermDim);
+                _cpuFfnUpBuf4    = Alloc(_intermDim);
             }
 
             // Host snapshot buffer for BatchForward2's between-token capture — only
@@ -3294,10 +3311,6 @@ public ReadOnlySpan<float> HiddenAt(int position)
     public ReadOnlySpan<float> MtpLastHidden =>
         _mtpSelfHidden != null ? new ReadOnlySpan<float>(_mtpSelfHidden, _embDim) : default;
 
-    /// <inheritdoc />
-    public ReadOnlySpan<float> LastHiddenT1 =>
-        _lastHiddenT1 != null ? new ReadOnlySpan<float>(_lastHiddenT1, _embDim) : default;
-
     /// <inheritdoc />
     public void BatchForward2(int t1, int t2, int startPos,
         out ReadOnlySpan<float> logits1, out ReadOnlySpan<float> logits2)
@@ -3701,19 +3714,22 @@ public float[][] BatchVerify(int[] tokens, int startPos)
             else if (!isMoe && !denseGpuLayer)
             {
                 // CPU mmap dense FFN — the 27B/12GB decode cost center (~8.6 GB
-                // weight reads per token). Pair-batched MatVec2In reads each weight
-                // row once per pair; the odd tail re-runs as a duplicated-input
-                // pair (second output → sink) so every token's bits match the pair
-                // kernel regardless of k parity.
+                // weight reads per token). Quad-batched MatVec4In reads each weight
+                // row once per four tokens (issue #209); the final partial group's
+                // duplicated-tail lanes re-run the last real token with their output
+                // routed to a shared sink, so every token's bits match the quad
+                // kernel regardless of k mod 4 (per-position independence from group fill).
                 _gpu.Download(moeNorm, (nint)_bvNormHost, k * embDim);
-                for (int i = 0; i < k; i += 2)
+                for (int i = 0; i < k; i += 4)
                 {
-                    bool tail = i + 1 >= k;
-                    int j = tail ? i : i + 1;
-                    CpuDenseFfn2(layer,
-                        _bvNormHost + (long)i * embDim, _bvNormHost + (long)j * embDim,
-                        _bvFfnHost + (long)i * embDim,
-                        tail ? _cpuMoeHidden2 : _bvFfnHost + (long)j * embDim);
+                    MtpBatchTail.Group4(i, k, out int j0, out int j1, out int j2, out int j3, out int nReal);
+                    CpuDenseFfn4(layer,
+                        _bvNormHost + (long)j0 * embDim, _bvNormHost + (long)j1 * embDim,
+                        _bvNormHost + (long)j2 * embDim, _bvNormHost + (long)j3 * embDim,
+                        _bvFfnHost + (long)j0 * embDim,
+                        nReal > 1 ? _bvFfnHost + (long)j1 * embDim : _cpuMoeHidden2,
+                        nReal > 2 ? _bvFfnHost + (long)j2 * embDim : _cpuMoeHidden2,
+                        nReal > 3 ? _bvFfnHost + (long)j3 * embDim : _cpuMoeHidden2);
                 }
                 _gpu.UploadInto(_gpuBvFfnAll!, (nint)_bvFfnHost, k * embDim);
                 _gpu.AddInPlace(_gpuBvFfnAll!, blockOut);
@@ -4585,6 +4601,40 @@ private void CpuDenseFfn2(int layer,
             _cpuFfnGateBuf, _cpuFfnGateBuf2, _embDim, _intermDim, wDown.DType);
     }
 
+    /// <summary>
+    /// Batched four-token CPU dense FFN (issue #209). Each gate/up/down weight row is
+    /// read once from the CPU mmap and dotted against all four tokens via
+    /// <see cref="SimdKernels.MatVec4In"/> — one host weight read per four draft tokens
+    /// versus <see cref="CpuDenseFfn2"/>'s one-per-two, halving the dominant decode
+    /// cost on the 27B-MTP CUDA-hybrid path at k = 4. Per-token bits are identical to
+    /// <see cref="CpuDenseFfn2"/> and single-token decode (MatVec4In is bit-identical
+    /// per slot). Lanes that are duplicated-tail fillers point their <c>out</c> at a
+    /// shared sink — the value is recomputed-but-discarded; the four gate/up scratch
+    /// slabs stay distinct because SiLU consumes each lane before the down projection.
+    /// </summary>
+    private void CpuDenseFfn4(int layer,
+        float* n0, float* n1, float* n2, float* n3,
+        float* out0, float* out1, float* out2, float* out3)
+    {
+        var wGate = _cpuWFfnGate![layer];
+        var wUp   = _cpuWFfnUp![layer];
+        var wDown = _cpuWFfnDown![layer];
+        // Gate and up projections: each call passes one weight pointer and all four inputs.
+        SimdKernels.MatVec4In(_cpuFfnGateBuf, _cpuFfnGateBuf2, _cpuFfnGateBuf3, _cpuFfnGateBuf4,
+            wGate.DataPtr, n0, n1, n2, n3, _intermDim, _embDim, wGate.DType);
+        SimdKernels.MatVec4In(_cpuFfnUpBuf, _cpuFfnUpBuf2, _cpuFfnUpBuf3, _cpuFfnUpBuf4,
+            wUp.DataPtr, n0, n1, n2, n3, _intermDim, _embDim, wUp.DType);
+        // Per-lane SiLuMul on each gate/up scratch pair; only the gate buffers are read below.
+        SimdKernels.SiLuMul(_cpuFfnGateBuf,  _cpuFfnUpBuf,  _intermDim);
+        SimdKernels.SiLuMul(_cpuFfnGateBuf2, _cpuFfnUpBuf2, _intermDim);
+        SimdKernels.SiLuMul(_cpuFfnGateBuf3, _cpuFfnUpBuf3, _intermDim);
+        SimdKernels.SiLuMul(_cpuFfnGateBuf4, _cpuFfnUpBuf4, _intermDim);
+        // Down projection into the caller's four outputs (filler lanes may share one sink).
+        SimdKernels.MatVec4In(out0, out1, out2, out3, wDown.DataPtr,
+            _cpuFfnGateBuf, _cpuFfnGateBuf2, _cpuFfnGateBuf3, _cpuFfnGateBuf4,
+            _embDim, _intermDim, wDown.DType);
+    }
+
     // =================================================================
     //  GPU dense FFN — for layers whose ffn_gate/up/down were uploaded by
     //  TryUploadDenseFfnLayers. Consumes _gpuNormBuf, produces _gpuHidden.
@@ -6052,6 +6102,10 @@ public void Dispose()
                 if (_gpuFfnUpBufDense2   is { } uB2) _gpu.Free(uB2);
                 if (_cpuFfnGateBuf2 != null) NativeMemory.Free(_cpuFfnGateBuf2);
                 if (_cpuFfnUpBuf2   != null) NativeMemory.Free(_cpuFfnUpBuf2);
+                if (_cpuFfnGateBuf3 != null) NativeMemory.Free(_cpuFfnGateBuf3);
+                if (_cpuFfnUpBuf3   != null) NativeMemory.Free(_cpuFfnUpBuf3);
+                if (_cpuFfnGateBuf4 != null) NativeMemory.Free(_cpuFfnGateBuf4);
+                if (_cpuFfnUpBuf4   != null) NativeMemory.Free(_cpuFfnUpBuf4);
                 if (_batchSnapshotBuf != null)
                 {
                     NativeMemory.Free(_batchSnapshotBuf);

diff --git a/src/SharpInference.Engine/GdnStateCache.cs b/src/SharpInference.Engine/GdnStateCache.cs
@@ -386,15 +386,19 @@ public void RestoreLayerFrom(int gdnLayerIndex, byte* src, long srcBytes)
     /// <summary>
     /// Resolve the SHARPI_MTP_BATCH_MAX knob: max tokens per batched-verify call
     /// (= 1 + max MTP draft-chain length), which sizes the per-token-boundary GDN
-    /// snapshot ring at <c>value − 1</c> slots. Clamped to [2, 8]; default 2 — one
-    /// ring slot (~149 MB for 27B on either side of the PCIe bus), the measured
-    /// k=2 optimum until the CPU FFN amortizes more than pairwise. Shared by both
-    /// hybrid GDN passes so the knob means the same thing on every backend.
+    /// snapshot ring at <c>value − 1</c> slots. Clamped to [2, 8]; default 4 (three
+    /// ring slots, ~149 MB each for 27B on either side of the PCIe bus) — the measured
+    /// k=4 optimum once the 4-input CPU FFN kernel (issue #209) amortizes the dominant
+    /// CPU mmap weight read across four draft tokens (27B Q4_K_M CUDA-hybrid: k=4 12.3
+    /// vs k=2 10.1 vs k=6 10.4 t/s; the GPU-trunk matvec re-stream and lower acceptance
+    /// erode deeper chains). The ring alloc stops on OOM and SupportsBatchVerify clamps
+    /// MaxBatchVerifyTokens to what fit, so a tight-VRAM card degrades gracefully.
+    /// Shared by both hybrid GDN passes so the knob means the same thing on every backend.
     /// </summary>
     public static int ResolveMtpBatchMax()
     {
         var s = Environment.GetEnvironmentVariable("SHARPI_MTP_BATCH_MAX");
-        return s is not null && int.TryParse(s, out var v) ? Math.Clamp(v, 2, 8) : 2;
+        return s is not null && int.TryParse(s, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out var v) ? Math.Clamp(v, 2, 8) : 4;
     }
 
     /// <summary>