Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ sharpi-cli -m models/Qwen3.6-27B-MTP-Q4_K_M.gguf -g -1 \

| Backend | Size | Prefill t/s | Decode t/s | Notes |
|---|---:|---:|---:|---|
| **CUDA** `-g -1 --no-thinking` (hybrid) | 16 GB | **10.0** | **10.4** | 22/64 dense FFN on GPU + GDN/attn KV resident, rest CPU mmap. 90% acceptance; folded k-token batched verify + GDN snapshot ring — **1.68× over MTP-off (6.4)** |
| **CUDA** `-g -1 --no-thinking` (hybrid) | 16 GB | **10.0** | **12.3** | GDN/attn KV resident on GPU, dense FFN on CPU mmap (the k=4 ring reclaims the VRAM the old k=2 default spent on 22 GPU FFN layers). 84% acceptance; 4-input CPU-FFN `MatVec4In` (#209) moves the verify optimum from k=2 → k=4 — **1.9× over MTP-off (6.5)**, +22% over the old k=2 default (10.1) |
| **CUDA** `-g -1 --no-thinking` `Q5_K_M` (hybrid) | 19 GB | 6.2 | **5.5** | 13/64 FFN on GPU, 51/64 CPU mmap. 98% acceptance; batched trunk (#119) bit-identical |
| CPU `--no-thinking` | 16 GB | 3.0 | **3.6** | dense 27B GDN/attn + native MTP head; auto MTP self-spec (#25) at greedy + `--no-thinking`. 90% draft acceptance; folded k-token batched verify (#30/#207) — 1.2× over MTP-off (3.0) |
| CPU `--no-thinking` `Q5_K_M` | 19 GB | 2.8 | **3.5** | ~10% slower than Q4_K_M; 100% acceptance |
Expand Down
6 changes: 6 additions & 0 deletions scripts/bench-27b-mtp.ps1
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# One-shot bench harness for Qwen3.6-27B-MTP (issue #28). Runs both MTP-on (default)
# and MTP-off (SHARPI_DISABLE_MTP=1) on CPU and CUDA-hybrid for one or more quants,
# so the README can quote both numbers and the no-speedup gap is visible.
#
# Issue #209: the MTP-on rows now exercise the k=4 verify batch (the new default:
# SHARPI_MTP_BATCH_MAX=4 / SHARPI_MTP_DRAFT_N=3) — the 4-input CPU-FFN MatVec4In
# amortizes the dominant CPU mmap weight read four ways, moving the optimum out from
# the old pairwise k=2. Measured Q4_K_M CUDA-hybrid: k=4 12.3 vs k=2 10.1 vs k=6 10.4
# vs MTP-off 6.5 t/s. Set SHARPI_MTP_BATCH_MAX / SHARPI_MTP_DRAFT_N to sweep other k.
param(
[string[]]$Quants = @("Q4_K_M", "Q5_K_M"),
[int]$NTokens = 80,
Expand Down
15 changes: 4 additions & 11 deletions src/SharpInference.Core/IForwardPass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -271,20 +271,13 @@ float[][] BatchVerify(int[] tokens, int startPos) =>
/// </summary>
int MaxBatchVerifyTokens => int.MaxValue;

/// <summary>
/// Last completed <see cref="BatchForward2"/>'s token-1 pre-output-norm hidden.
/// Used by the MTP commit step on the batched verify path. Empty when no batched
/// forward has been run.
/// </summary>
ReadOnlySpan<float> LastHiddenT1 => default;

/// <summary>
/// Two-token batched forward (issue #30). On entry both caches must be at length
/// <paramref name="startPos"/>. On return both caches are at length
/// <c>startPos + 2</c>, <see cref="LastHidden"/> holds h@startPos+1, and
/// <see cref="LastHiddenT1"/> holds h@startPos. A per-layer GDN snapshot is
/// captured at the "between t1 and t2" point so a rejected draft can be rolled
/// back via <see cref="RestoreBatchSnapshot"/>.
/// <c>startPos + 2</c> and <see cref="LastHidden"/> holds h@startPos+1. A per-layer
/// GDN snapshot is captured at the "between t1 and t2" point so a rejected draft can
/// be rolled back via <see cref="RestoreBatchSnapshot"/>. Both tokens' pre-output-
/// norm hiddens are written to the MTP hidden history for later draft chaining.
/// </summary>
void BatchForward2(int t1, int t2, int startPos,
out ReadOnlySpan<float> logits1, out ReadOnlySpan<float> logits2) =>
Expand Down
425 changes: 425 additions & 0 deletions src/SharpInference.Cpu/SimdKernels.cs

Large diffs are not rendered by default.

96 changes: 75 additions & 21 deletions src/SharpInference.Engine/CudaHybridGdnForwardPass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -398,11 +398,16 @@ public sealed unsafe class CudaHybridGdnForwardPass : IForwardPass
private readonly Tensor _gpuResidual2; // [embDim]
private readonly Tensor _gpuNormBuf2; // [embDim]
private readonly Tensor _gpuLogits2; // [vocabSize]
private readonly Tensor _gpuLastHiddenT1; // [embDim] — h@startPos for the MTP commit
// BatchForward2 (SHARPI_CPU_GDN=1 debug trunk) snapshots t1's pre-norm hidden into
// these so it can ride the queued DownloadAsync alongside t2's _gpuLastHidden, then
// copies it into the MTP hidden history. Internal scratch for that path only — the
// production k-token BatchVerify path writes the history straight from the device
// stream (the dead public LastHiddenT1 accessor was removed in issue #209).
private readonly Tensor _gpuLastHiddenT1; // [embDim] — t1 hidden device snapshot
private readonly float[] _logitsBuf2; // host download for token 2 logits
private readonly float* _cpuNormBuf2; // [embDim] — t2's norm download for CPU FFN path
private readonly float* _cpuMoeHidden2; // [embDim] — t2's CPU FFN output
private readonly float* _lastHiddenT1; // [embDim] — host span for the t1 hidden
private readonly float* _lastHiddenT1; // [embDim] — pinned t1 hidden host target

private byte* _batchSnapshotBuf;
private long _batchSnapshotCap;
Expand Down Expand Up @@ -441,14 +446,22 @@ public sealed unsafe class CudaHybridGdnForwardPass : IForwardPass

// Max tokens per BatchVerify call = ring slots + 1. Each slot costs ~149 MiB
// of VRAM that TryUploadDenseFfnLayers would otherwise fill with ~2 dense FFN
// layers, hence the conservative default; deeper chains only pay once the CPU
// FFN amortizes more than pairwise (4-input MatVec follow-up). Instance-resolved
// at construction so tests can override per instance; the knob semantics live
// in one place (GdnStateCache.ResolveMtpBatchMax) shared with the CPU pass.
// layers, so the default (4 → 3 slots) is the smallest ring that reaches the
// measured k=4 optimum now that the 4-input CPU FFN kernel (issue #209) amortizes
// the dominant mmap weight read four ways. Instance-resolved at construction so
// tests can override per instance; the knob semantics live in one place
// (GdnStateCache.ResolveMtpBatchMax) shared with the CPU pass.
private readonly int _mtpBatchMax = GdnStateCache.ResolveMtpBatchMax();
// Token-2 host FFN scratch (intermediate gate/up post-MatVec2In, pre-SiLuMul).
private readonly float* _cpuFfnGateBuf2;
private readonly float* _cpuFfnUpBuf2;
// Lane-3/4 host FFN scratch (issue #209): CpuDenseFfn4 dots one CPU mmap weight
// read against four draft tokens via MatVec4In, so it needs four distinct
// gate/up scratch slabs (SiLU reads them per-lane before the down projection).
private readonly float* _cpuFfnGateBuf3;
private readonly float* _cpuFfnUpBuf3;
private readonly float* _cpuFfnGateBuf4;
private readonly float* _cpuFfnUpBuf4;

// Host-side hidden history; see HybridGdnForwardPass field-level doc.
private float* _mtpPrefillHiddens; // [_mtpPrefillHiddensCap × embDim], slot p = h_p
Expand Down Expand Up @@ -1393,6 +1406,10 @@ void TraceVram(string label)
_gpuFfnUpBufDense2 = gpu.Allocate(TensorShape.D1(_intermDim));
_cpuFfnGateBuf2 = Alloc(_intermDim);
_cpuFfnUpBuf2 = Alloc(_intermDim);
_cpuFfnGateBuf3 = Alloc(_intermDim);
_cpuFfnUpBuf3 = Alloc(_intermDim);
_cpuFfnGateBuf4 = Alloc(_intermDim);
_cpuFfnUpBuf4 = Alloc(_intermDim);
}

// Host snapshot buffer for BatchForward2's between-token capture — only
Expand Down Expand Up @@ -3294,10 +3311,6 @@ public ReadOnlySpan<float> HiddenAt(int position)
public ReadOnlySpan<float> MtpLastHidden =>
_mtpSelfHidden != null ? new ReadOnlySpan<float>(_mtpSelfHidden, _embDim) : default;

/// <inheritdoc />
public ReadOnlySpan<float> LastHiddenT1 =>
_lastHiddenT1 != null ? new ReadOnlySpan<float>(_lastHiddenT1, _embDim) : default;

/// <inheritdoc />
public void BatchForward2(int t1, int t2, int startPos,
out ReadOnlySpan<float> logits1, out ReadOnlySpan<float> logits2)
Expand Down Expand Up @@ -3701,19 +3714,22 @@ public float[][] BatchVerify(int[] tokens, int startPos)
else if (!isMoe && !denseGpuLayer)
{
// CPU mmap dense FFN — the 27B/12GB decode cost center (~8.6 GB
// weight reads per token). Pair-batched MatVec2In reads each weight
// row once per pair; the odd tail re-runs as a duplicated-input
// pair (second output → sink) so every token's bits match the pair
// kernel regardless of k parity.
// weight reads per token). Quad-batched MatVec4In reads each weight
// row once per four tokens (issue #209); the final partial group's
// duplicated-tail lanes re-run the last real token with their output
// routed to a shared sink, so every token's bits match the quad
// kernel regardless of k parity (per-position k-parity independence).
_gpu.Download(moeNorm, (nint)_bvNormHost, k * embDim);
for (int i = 0; i < k; i += 2)
for (int i = 0; i < k; i += 4)
{
bool tail = i + 1 >= k;
int j = tail ? i : i + 1;
CpuDenseFfn2(layer,
_bvNormHost + (long)i * embDim, _bvNormHost + (long)j * embDim,
_bvFfnHost + (long)i * embDim,
tail ? _cpuMoeHidden2 : _bvFfnHost + (long)j * embDim);
MtpBatchTail.Group4(i, k, out int j0, out int j1, out int j2, out int j3, out int nReal);
CpuDenseFfn4(layer,
_bvNormHost + (long)j0 * embDim, _bvNormHost + (long)j1 * embDim,
_bvNormHost + (long)j2 * embDim, _bvNormHost + (long)j3 * embDim,
_bvFfnHost + (long)j0 * embDim,
nReal > 1 ? _bvFfnHost + (long)j1 * embDim : _cpuMoeHidden2,
nReal > 2 ? _bvFfnHost + (long)j2 * embDim : _cpuMoeHidden2,
nReal > 3 ? _bvFfnHost + (long)j3 * embDim : _cpuMoeHidden2);
}
_gpu.UploadInto(_gpuBvFfnAll!, (nint)_bvFfnHost, k * embDim);
_gpu.AddInPlace(_gpuBvFfnAll!, blockOut);
Expand Down Expand Up @@ -4585,6 +4601,40 @@ private void CpuDenseFfn2(int layer,
_cpuFfnGateBuf, _cpuFfnGateBuf2, _embDim, _intermDim, wDown.DType);
}

/// <summary>
/// Four-lane CPU dense FFN for a single layer (issue #209): the quad counterpart of
/// <see cref="CpuDenseFfn2"/>. The layer's gate, up and down weights are each handed
/// to <see cref="SimdKernels.MatVec4In"/> exactly once together with all four token
/// inputs, so the CPU mmap weight data is streamed once per four draft tokens
/// instead of once per pair.
/// </summary>
/// <remarks>
/// Pipeline: gate and up projections (embDim → intermDim) land in the four gate/up
/// scratch slabs, <see cref="SimdKernels.SiLuMul"/> is applied per lane, and the
/// down projection (intermDim → embDim) reads the four gate slabs and writes
/// <paramref name="out0"/>..<paramref name="out3"/>. Each lane owns its own
/// gate/up slab because every lane's activation must still be intact when the
/// shared down projection runs. Callers may alias the outputs of duplicated-tail
/// filler lanes to a common sink; those results are computed and then discarded.
/// Per-lane results are intended to match <see cref="CpuDenseFfn2"/> and
/// single-token decode bit-for-bit, relying on MatVec4In's per-slot contract.
/// </remarks>
/// <param name="layer">Layer index used to pick the CPU-resident FFN weights.</param>
/// <param name="n0">Normed hidden input for lane 0.</param>
/// <param name="n1">Normed hidden input for lane 1.</param>
/// <param name="n2">Normed hidden input for lane 2.</param>
/// <param name="n3">Normed hidden input for lane 3.</param>
/// <param name="out0">FFN output destination for lane 0.</param>
/// <param name="out1">FFN output destination for lane 1 (or a discard sink).</param>
/// <param name="out2">FFN output destination for lane 2 (or a discard sink).</param>
/// <param name="out3">FFN output destination for lane 3 (or a discard sink).</param>
private void CpuDenseFfn4(int layer,
    float* n0, float* n1, float* n2, float* n3,
    float* out0, float* out1, float* out2, float* out3)
{
    var gateWeights = _cpuWFfnGate![layer];
    var upWeights = _cpuWFfnUp![layer];
    var downWeights = _cpuWFfnDown![layer];

    // Gate projection: one pass over the gate weights feeds all four lanes.
    SimdKernels.MatVec4In(
        _cpuFfnGateBuf, _cpuFfnGateBuf2, _cpuFfnGateBuf3, _cpuFfnGateBuf4,
        gateWeights.DataPtr,
        n0, n1, n2, n3,
        _intermDim, _embDim, gateWeights.DType);

    // Up projection: same inputs, separate scratch slabs.
    SimdKernels.MatVec4In(
        _cpuFfnUpBuf, _cpuFfnUpBuf2, _cpuFfnUpBuf3, _cpuFfnUpBuf4,
        upWeights.DataPtr,
        n0, n1, n2, n3,
        _intermDim, _embDim, upWeights.DType);

    // Per-lane activation; the down projection below reads the gate slabs.
    SimdKernels.SiLuMul(_cpuFfnGateBuf, _cpuFfnUpBuf, _intermDim);
    SimdKernels.SiLuMul(_cpuFfnGateBuf2, _cpuFfnUpBuf2, _intermDim);
    SimdKernels.SiLuMul(_cpuFfnGateBuf3, _cpuFfnUpBuf3, _intermDim);
    SimdKernels.SiLuMul(_cpuFfnGateBuf4, _cpuFfnUpBuf4, _intermDim);

    // Down projection: one pass over the down weights produces all four outputs.
    SimdKernels.MatVec4In(
        out0, out1, out2, out3,
        downWeights.DataPtr,
        _cpuFfnGateBuf, _cpuFfnGateBuf2, _cpuFfnGateBuf3, _cpuFfnGateBuf4,
        _embDim, _intermDim, downWeights.DType);
}

// =================================================================
// GPU dense FFN — for layers whose ffn_gate/up/down were uploaded by
// TryUploadDenseFfnLayers. Consumes _gpuNormBuf, produces _gpuHidden.
Expand Down Expand Up @@ -6052,6 +6102,10 @@ public void Dispose()
if (_gpuFfnUpBufDense2 is { } uB2) _gpu.Free(uB2);
if (_cpuFfnGateBuf2 != null) NativeMemory.Free(_cpuFfnGateBuf2);
if (_cpuFfnUpBuf2 != null) NativeMemory.Free(_cpuFfnUpBuf2);
if (_cpuFfnGateBuf3 != null) NativeMemory.Free(_cpuFfnGateBuf3);
if (_cpuFfnUpBuf3 != null) NativeMemory.Free(_cpuFfnUpBuf3);
if (_cpuFfnGateBuf4 != null) NativeMemory.Free(_cpuFfnGateBuf4);
if (_cpuFfnUpBuf4 != null) NativeMemory.Free(_cpuFfnUpBuf4);
if (_batchSnapshotBuf != null)
{
NativeMemory.Free(_batchSnapshotBuf);
Expand Down
14 changes: 9 additions & 5 deletions src/SharpInference.Engine/GdnStateCache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -386,15 +386,19 @@ public void RestoreLayerFrom(int gdnLayerIndex, byte* src, long srcBytes)
/// <summary>
/// Resolve the SHARPI_MTP_BATCH_MAX knob: max tokens per batched-verify call
/// (= 1 + max MTP draft-chain length), which sizes the per-token-boundary GDN
/// snapshot ring at <c>value − 1</c> slots. Clamped to [2, 8]; default 4 (three
/// ring slots, ~149 MB each for 27B on either side of the PCIe bus) — the measured
/// k=4 optimum once the 4-input CPU FFN kernel (issue #209) amortizes the dominant
/// CPU mmap weight read across four draft tokens (27B Q4_K_M CUDA-hybrid: k=4 12.3
/// vs k=2 10.1 vs k=6 10.4 t/s; the GPU-trunk matvec re-stream and lower acceptance
/// erode deeper chains). The ring alloc stops on OOM and SupportsBatchVerify clamps
/// MaxBatchVerifyTokens to what fit, so a tight-VRAM card degrades gracefully.
/// Shared by both hybrid GDN passes so the knob means the same thing on every backend.
/// </summary>
/// <remarks>
/// The environment value is parsed with the invariant culture so the knob is read
/// the same way regardless of the host's regional settings. An unset, empty or
/// non-integer value falls back to the default.
/// </remarks>
/// <returns>
/// The parsed value clamped to [2, 8], or 4 when the variable is absent or does
/// not parse as an integer.
/// </returns>
public static int ResolveMtpBatchMax()
{
    const int MinBatch = 2;
    const int MaxBatch = 8;
    const int DefaultBatch = 4;

    var raw = Environment.GetEnvironmentVariable("SHARPI_MTP_BATCH_MAX");

    // TryParse returns false for null, so no separate null check is needed.
    // NumberStyles.Integer is the style the parameterless overload uses; only the
    // culture changes (current → invariant), per CA1305 for machine-readable input.
    return int.TryParse(
               raw,
               System.Globalization.NumberStyles.Integer,
               System.Globalization.CultureInfo.InvariantCulture,
               out var parsed)
        ? Math.Clamp(parsed, MinBatch, MaxBatch)
        : DefaultBatch;
}

/// <summary>
Expand Down
Loading