pekkah · pekkah · Jun 5, 2026 · Jun 5, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -18,9 +18,9 @@ dotnet test --filter "FullyQualifiedName~SomeTest"  # Run a single test
 dotnet run --project src/SharpInference.Cli -c Release -- \
   -m models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf -p "prompt" --temp 0
 
-# GPU backend (all layers offloaded)
+# GPU backend (all layers offloaded; --ngl mirrors llama.cpp's -ngl, --device N selects a specific GPU)
 dotnet run --project src/SharpInference.Cli -c Release -- \
-  -m models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf -p "prompt" --temp 0 -g -1
+  -m models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf -p "prompt" --temp 0 --ngl -1
 
 # Start API server (OpenAI + Anthropic compatible). SharpInference.Server is the
 # ASP.NET Core library that ships AddSharpInference() / MapSharpInference();

diff --git a/README.md b/README.md
@@ -22,37 +22,37 @@ coherent (`scripts/bench-all.ps1`); top-1 parity vs llama.cpp b8585 verified on
 | Model | Repo | Size | Backend | Prefill t/s | Decode t/s | Notes |
 |---|---|---:|---|---:|---:|---|
 | SmolLM2 1.7B Instruct | [HuggingFaceTB](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF) | 1 GB | CPU | 40.4 | 38.9 | AVX2 fused dequant-matvec |
-| SmolLM2 1.7B Instruct | (same) | 1 GB | Vulkan `-g -1` | 123.2 | **139.7** | GLSL `subgroupAdd` reduce |
-| SmolLM2 1.7B Instruct | (same) | 1 GB | **CUDA** `-g -1` | **163.1** | **158.1** | NVRTC `__dp4a` + Q8_1 |
+| SmolLM2 1.7B Instruct | (same) | 1 GB | Vulkan `--ngl -1` | 123.2 | **139.7** | GLSL `subgroupAdd` reduce |
+| SmolLM2 1.7B Instruct | (same) | 1 GB | **CUDA** `--ngl -1` | **163.1** | **158.1** | NVRTC `__dp4a` + Q8_1 |
 | Qwen3 8B | [Qwen](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | 5 GB | CPU | 9.9 | 11.7 | dense, no KV compression |
 | Qwen3 8B | (same) | 5 GB | CPU `--tq` | 9.5 | **11.9** | 3-bit KV → 40 960 ctx; FastScan K+V (#34) keeps long-ctx decode ~flat (10.2 @ 3K, 9.4 @ 6K) |
-| Qwen3 8B | (same) | 5 GB | Vulkan `-g -1` | 45.4 | 45.8 | 11.4K auto-ctx |
-| Qwen3 8B | (same) | 5 GB | Vulkan `-g -1 --tq` | 40.7 | 45.5 | 3-bit KV → 40 960 ctx |
-| Qwen3 8B | (same) | 5 GB | **CUDA** `-g -1` | **61.7** | **58.6** | ~2.8× Vulkan prefill |
-| Qwen3 8B | (same) | 5 GB | **CUDA** `-g -1 --no-thinking` | **61.8** | **58.2** | reasoning suppressed in template |
-| Qwen3 8B | (same) | 5 GB | **CUDA** `-g -1 --tq` | **57.4** | **58.4** | 3-bit KV → 40 960 ctx; 17 t/s @ 8K, 10 @ 16K |
-| Qwen3 8B | (same) | 5 GB | **CUDA** `-g -1 --tq --no-thinking` | **57.5** | **58.1** | as `--tq`, reasoning suppressed |
+| Qwen3 8B | (same) | 5 GB | Vulkan `--ngl -1` | 45.4 | 45.8 | 11.4K auto-ctx |
+| Qwen3 8B | (same) | 5 GB | Vulkan `--ngl -1 --tq` | 40.7 | 45.5 | 3-bit KV → 40 960 ctx |
+| Qwen3 8B | (same) | 5 GB | **CUDA** `--ngl -1` | **61.7** | **58.6** | ~2.8× Vulkan prefill |
+| Qwen3 8B | (same) | 5 GB | **CUDA** `--ngl -1 --no-thinking` | **61.8** | **58.2** | reasoning suppressed in template |
+| Qwen3 8B | (same) | 5 GB | **CUDA** `--ngl -1 --tq` | **57.4** | **58.4** | 3-bit KV → 40 960 ctx; 17 t/s @ 8K, 10 @ 16K |
+| Qwen3 8B | (same) | 5 GB | **CUDA** `--ngl -1 --tq --no-thinking` | **57.5** | **58.1** | as `--tq`, reasoning suppressed |
 | OLMoE 1B-7B Instruct (MoE) | [allenai](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF) | 4 GB | CPU | 51.6 | 55.7 | 64 experts / 8 active; per-channel QK-norm; `norm_topk_prob=false` |
-| OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | Vulkan `-g -1` | 112.3 | **121.2** | greedy unstable across backends — use `--temp 0.6 --top-p 0.95` |
-| OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | **CUDA** `-g -1` | **117.6** | **111.7** | greedy varies, sampling coherent |
+| OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | Vulkan `--ngl -1` | 112.3 | **121.2** | greedy unstable across backends — use `--temp 0.6 --top-p 0.95` |
+| OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | **CUDA** `--ngl -1` | **117.6** | **111.7** | greedy varies, sampling coherent |
 | Qwen3-Coder 30B-A3B (MoE) | [Qwen](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF) | 17 GB | CPU | 19.4 | 21.1 | 128 experts / 8 active |
 | Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | CPU `--tq` | 18.8 | 21.0 | 3-bit KV; FastScan (#34) → 15.5 t/s decode @ 3.2K ctx |
-| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | Vulkan `-g -1` (hybrid) | 1.1 | 5.3 | 29 GPU + 19 CPU layers, SLRU expert cache; predictive prefetch (#50/#77) on by default (`--no-moe-predict-prefetch`). Prefill is the original short-ctx run (Vulkan-hybrid errored on the ~1K prompt) |
-| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | **CUDA** `-g -1` (hybrid) | **30.1** | **25.0** | 29 GPU + 19 CPU layers; routed experts stream through `CudaExpertSlotManager` SLRU (#72/#77). Batched-trunk prefill (#123, bit-identical; `SHARPI_BATCHED_PREFILL=0` to bisect). `SHARPI_EXPERT_STATS=path` for hit rates |
+| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | Vulkan `--ngl -1` (hybrid) | 1.1 | 5.3 | 29 GPU + 19 CPU layers, SLRU expert cache; predictive prefetch (#50/#77) on by default (`--no-moe-predict-prefetch`). Prefill is the original short-ctx run (Vulkan-hybrid errored on the ~1K prompt) |
+| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | **CUDA** `--ngl -1` (hybrid) | **30.1** | **25.0** | 29 GPU + 19 CPU layers; routed experts stream through `CudaExpertSlotManager` SLRU (#72/#77). Batched-trunk prefill (#123, bit-identical; `SHARPI_BATCHED_PREFILL=0` to bisect). `SHARPI_EXPERT_STATS=path` for hit rates |
 | Llama-4 Scout 17B-16E (MoE) | [meta-llama](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | 61 GB | CPU | 2.1 | 4.3 | 48 layers, 17B active; split GGUF (not on bench machine) |
-| Llama-4 Scout 17B-16E (MoE) | (same) | 61 GB | CUDA `-g -1` (hybrid) | 1.2 | 2.6 | 7 GPU + 41 CPU layers — model dwarfs the 12 GB card so CPU-only wins; per-expert SLRU streaming (#72/#77) still lifts both |
+| Llama-4 Scout 17B-16E (MoE) | (same) | 61 GB | CUDA `--ngl -1` (hybrid) | 1.2 | 2.6 | 7 GPU + 41 CPU layers — model dwarfs the 12 GB card so CPU-only wins; per-expert SLRU streaming (#72/#77) still lifts both |
 | Qwen3.6-35B-A3B (GDN+MoE) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | 22 GB | CPU | 8.4 | 8.5 | hybrid GDN/attn, 256 experts / 8 active |
-| Qwen3.6-35B-A3B (GDN+MoE) | (same) | 22 GB | **CUDA** `-g -1` (hybrid) | **63.7** | **23.2** | 10 attn + 30 GDN on GPU; MoE auto-routed to CPU, shared expert on GPU overlapped with the routed loop. Fused GDN scan + batched-query SDPA (#114-B/#118), bit-identical, win grows with ctx. Forcing on-GPU experts (`SHARPI_CPU_MOE=0`, non-default) gets the #129 fused MoE-reduce kernel: GPU-SLRU prefill +20% (45.3 → 54.3 t/s) |
+| Qwen3.6-35B-A3B (GDN+MoE) | (same) | 22 GB | **CUDA** `--ngl -1` (hybrid) | **63.7** | **23.2** | 10 attn + 30 GDN on GPU; MoE auto-routed to CPU, shared expert on GPU overlapped with the routed loop. Fused GDN scan + batched-query SDPA (#114-B/#118), bit-identical, win grows with ctx. Forcing on-GPU experts (`SHARPI_CPU_MOE=0`, non-default) gets the #129 fused MoE-reduce kernel: GPU-SLRU prefill +20% (45.3 → 54.3 t/s) |
 | Qwen3.6-27B-MTP (GDN) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF) | 16 GB | CPU `--no-thinking` | 3.2 | **3.8** | dense 27B GDN/attn + native MTP head; auto MTP self-spec (#25) at greedy + `--no-thinking`. 95% draft acceptance; batched N=2 verify (#30) → 1.4× over MTP-off |
-| Qwen3.6-27B-MTP (GDN) | (same) | 16 GB | **CUDA** `-g -1 --no-thinking` (hybrid) | **8.3** | **10.7** | 20/64 dense FFN on GPU + GDN/attn KV resident, 44/64 FFN CPU mmap. 95% acceptance; batched verify → 1.73×. Batched trunk + on-GPU dense-FFN (#119/#121), bit-identical |
+| Qwen3.6-27B-MTP (GDN) | (same) | 16 GB | **CUDA** `--ngl -1 --no-thinking` (hybrid) | **8.3** | **10.7** | 20/64 dense FFN on GPU + GDN/attn KV resident, 44/64 FFN CPU mmap. 95% acceptance; batched verify → 1.73×. Batched trunk + on-GPU dense-FFN (#119/#121), bit-identical |
 | Qwen3.6-27B-MTP (GDN) | (same) | 19 GB | CPU `--no-thinking` `Q5_K_M` | 2.8 | **3.5** | ~10% slower than Q4_K_M; 100% acceptance; batched verify → 1.46× |
-| Qwen3.6-27B-MTP (GDN) | (same) | 19 GB | **CUDA** `-g -1 --no-thinking` `Q5_K_M` (hybrid) | 5.4 | **7.9** | 13/64 FFN on GPU, 51/64 CPU mmap. 98% acceptance; batched verify → 1.84×. Batched trunk (#119) bit-identical; FFN batching prefill-neutral here (CPU-mmap bound) |
+| Qwen3.6-27B-MTP (GDN) | (same) | 19 GB | **CUDA** `--ngl -1 --no-thinking` `Q5_K_M` (hybrid) | 5.4 | **7.9** | 13/64 FFN on GPU, 51/64 CPU mmap. 98% acceptance; batched verify → 1.84×. Batched trunk (#119) bit-identical; FFN batching prefill-neutral here (CPU-mmap bound) |
 | Qwen3.6-35B-A3B-MTP (GDN+MoE) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-MTP-GGUF) | 22 GB | CPU `--no-thinking` | 8.5 | 8.0 | GDN/attn + 256-expert MoE + MTP head (#44). 100% acceptance; MoE-MTP batched verify (#45) — routed experts sequential per token, so ~MTP-off parity |
-| Qwen3.6-35B-A3B-MTP (GDN+MoE) | (same) | 22 GB | **CUDA** `-g -1 --no-thinking` (hybrid) | **65.0** | **22.9** | Requires `SHARPI_CPU_MOE=1`: 30 GDN + 10 attn + shared expert on GPU, routed experts CPU mmap. 100% acceptance. Fused GDN scan + batched SDPA (#114-B/#118), bit-identical, grows with ctx |
-| Carnice (Qwen3.6-35B-A3B-MTP finetune) | [mudler](https://huggingface.co/mudler/Carnice-Qwen3.6-MoE-35B-A3B-APEX-MTP-GGUF) | 17 GB | **CUDA** `-g -1 --no-thinking` (hybrid) | **43.6** | **25.0** | agentic finetune of 35B-A3B-MTP; 77% acceptance (`bench-carnice.ps1` — the default prompt 1-token-EOSes on this terser tune). APEX mixed-precision (Q3_K + Q8_0 experts); Q8_KS per-32 int dots auto-enable at load (#99/#101/#107), +4.6% decode at ~4× tighter parity vs plain Q8_K (`SHARPI_Q3K_Q8K=0`/`SHARPI_Q8_0_Q8K=0` to disable). Fused GDN scan + wave SDPA (#114-B/#118) bit-identical past 4096 |
+| Qwen3.6-35B-A3B-MTP (GDN+MoE) | (same) | 22 GB | **CUDA** `--ngl -1 --no-thinking` (hybrid) | **65.0** | **22.9** | Requires `SHARPI_CPU_MOE=1`: 30 GDN + 10 attn + shared expert on GPU, routed experts CPU mmap. 100% acceptance. Fused GDN scan + batched SDPA (#114-B/#118), bit-identical, grows with ctx |
+| Carnice (Qwen3.6-35B-A3B-MTP finetune) | [mudler](https://huggingface.co/mudler/Carnice-Qwen3.6-MoE-35B-A3B-APEX-MTP-GGUF) | 17 GB | **CUDA** `--ngl -1 --no-thinking` (hybrid) | **43.6** | **25.0** | agentic finetune of 35B-A3B-MTP; 77% acceptance (`bench-carnice.ps1` — the default prompt 1-token-EOSes on this terser tune). APEX mixed-precision (Q3_K + Q8_0 experts); Q8_KS per-32 int dots auto-enable at load (#99/#101/#107), +4.6% decode at ~4× tighter parity vs plain Q8_K (`SHARPI_Q3K_Q8K=0`/`SHARPI_Q8_0_Q8K=0` to disable). Fused GDN scan + wave SDPA (#114-B/#118) bit-identical past 4096 |
 | Gemma 4 E4B-it Q8 | [unsloth](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | 8 GB | CPU | 4.9 | 5.0 | dense 42-layer gemma4: per-layer head_dim (256 SWA / 512 global), dual-RoPE, KV-share tail (18 layers), 5:1 SWA:global, logit softcap 30, PLE-256 injection (~4.2 GB mmap-resident) |
-| Gemma 4 E4B-it Q8 | (same) | 8 GB | **CUDA** `-g -1 -c 2048` | **2853** | **59** | all 42 layers fit at `-c 2048`. KV-share alias + SWA/global split per layer; PLE projections (~215 MB) upload at construction. **Prefill (#141):** int8 **tensor-core MMQ** matmul (`mma.m16n8k32.s8`, each Q8_0 weight read once as int8 — beats the dequant→fp16→cuBLAS GEMM, drops its fp16 HBM temp; `SHARPI_PREFILL_MMQ=0` reverts) + a memory-efficient **flash-attention** prefill (shared K/V tiles + online softmax + half2 fp16x2 QK dot — replaces the scalar O(n²) per-query attention that re-streamed each query's K/V window up to ~512×; `SHARPI_PREFILL_FLASH=0` reverts) + a batched Q8_0 embedding lookup. **~1.8× at ~1K ctx (1564→2853), ~2.05× at 1.8K** — profiling showed *attention*, not the matmul, was the dominant prefill cost at realistic prompt lengths, so the win grows with prompt length. **Decode (#142):** dp4a/Q8_1 int8 matvec (`SHARPI_Q80_DP4A=0` to bisect) + CUDA-graph capture/replay default-on (`SHARPI_CUDA_GRAPH=0` to bisect). All prefill/decode fast paths are argmax-stable vs the fp32 path, not bit-exact. Remaining gap to llama.cpp (~8000 prefill / ~78 decode): full tensor-core flash at d=512 + decode matvec work |
-| Gemma 4 E4B-it Q8 | (same) | 8 GB | **CUDA** `-g 22 -c 2048` (hybrid) | 6.6 | 6.8 | 22 GPU + 20 CPU layers. `-g ≤ 22` required so the CPU shared-KV tail can read its own-KV source layers; CPU dense-FFN dominates decode (bandwidth-bound). `SHARPI_CUDA_PROFILE=1` for per-phase breakdown |
+| Gemma 4 E4B-it Q8 | (same) | 8 GB | **CUDA** `--ngl -1 -c 2048` | **2853** | **59** | all 42 layers fit at `-c 2048`. KV-share alias + SWA/global split per layer; PLE projections (~215 MB) upload at construction. **Prefill (#141):** int8 **tensor-core MMQ** matmul (`mma.m16n8k32.s8`, each Q8_0 weight read once as int8 — beats the dequant→fp16→cuBLAS GEMM, drops its fp16 HBM temp; `SHARPI_PREFILL_MMQ=0` reverts) + a memory-efficient **flash-attention** prefill (shared K/V tiles + online softmax + half2 fp16x2 QK dot — replaces the scalar O(n²) per-query attention that re-streamed each query's K/V window up to ~512×; `SHARPI_PREFILL_FLASH=0` reverts) + a batched Q8_0 embedding lookup. **~1.8× at ~1K ctx (1564→2853), ~2.05× at 1.8K** — profiling showed *attention*, not the matmul, was the dominant prefill cost at realistic prompt lengths, so the win grows with prompt length. **Decode (#142):** dp4a/Q8_1 int8 matvec (`SHARPI_Q80_DP4A=0` to bisect) + CUDA-graph capture/replay default-on (`SHARPI_CUDA_GRAPH=0` to bisect). All prefill/decode fast paths are argmax-stable vs the fp32 path, not bit-exact. Remaining gap to llama.cpp (~8000 prefill / ~78 decode): full tensor-core flash at d=512 + decode matvec work |
+| Gemma 4 E4B-it Q8 | (same) | 8 GB | **CUDA** `--ngl 22 -c 2048` (hybrid) | 6.6 | 6.8 | 22 GPU + 20 CPU layers. `--ngl ≤ 22` required so the CPU shared-KV tail can read its own-KV source layers; CPU dense-FFN dominates decode (bandwidth-bound). `SHARPI_CUDA_PROFILE=1` for per-phase breakdown |
 
 _Numbers re-measured across every on-disk row at ~1K ctx so the prefill column is comparable; per-issue
 before/after figures in the notes are historical. Llama-4 Scout and Qwen3-Coder Vulkan-hybrid keep their
@@ -141,7 +141,7 @@ dotnet run --project src/SharpInference.Cli -c Release -- \
 
 # Full GPU offload (auto-picks CUDA)
 dotnet run --project src/SharpInference.Cli -c Release -- \
-  -m models/Qwen3-8B-Q4_K_M.gguf -p "Write a quicksort in Python" --temp 0 -g -1
+  -m models/Qwen3-8B-Q4_K_M.gguf -p "Write a quicksort in Python" --temp 0 --ngl -1
 
 # MoE on CPU with 3-bit KV compression
 dotnet run --project src/SharpInference.Cli -c Release -- \

diff --git a/samples/SharpInference.Sample.ToolCall/Program.cs b/samples/SharpInference.Sample.ToolCall/Program.cs
@@ -13,18 +13,18 @@
 //
 // Run (CUDA, all layers on GPU):
 //   dotnet run --project samples/SharpInference.Sample.ToolCall -c Release -- \
-//       -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda -g -1
+//       -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda --ngl -1
 //
 // Run (CUDA hybrid — first 20 layers on GPU, rest on CPU):
 //   dotnet run --project samples/SharpInference.Sample.ToolCall -c Release -- \
-//       -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda -g 20
+//       -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda --ngl 20
 //
 // Flags:
 //   -m / --model     <path>    Path to GGUF model (or set SHARPI_MODEL env var)
 //   -p / --prompt    <text>    User question (default: built-in demo question)
 //   --temp           <float>   Sampling temperature (default: 0.6)
 //   --backend        cpu|cuda  Compute backend (default: cpu)
-//   -g               <int>     GPU layer count: 0=CPU-only, -1=auto/all, N=N layers on GPU
+//   --ngl / --n-gpu-layers <int>  GPU layer count: 0=CPU-only, -1=auto/all, N=N layers on GPU
 
 using System.Text;
 using SharpInference.Core;
@@ -48,10 +48,10 @@
         case "-p" or "--prompt"  when i + 1 < args.Length: question    = args[++i]; break;
         case "--temp"            when i + 1 < args.Length: temperature = float.Parse(args[++i], System.Globalization.CultureInfo.InvariantCulture); break;
         case "--backend"         when i + 1 < args.Length: backendStr  = args[++i].ToLowerInvariant(); break;
-        case "-g"                when i + 1 < args.Length: nGpuLayers  = int.Parse(args[++i]); break;
+        case "--ngl" or "--n-gpu-layers" when i + 1 < args.Length: nGpuLayers  = int.Parse(args[++i]); break;
         case "-h" or "--help":
             Console.Error.WriteLine(
-                "usage: sharpi-sample-toolcall -m <model.gguf> [-p <question>] [--temp 0.6] [--backend cpu|cuda] [-g <layers>]");
+                "usage: sharpi-sample-toolcall -m <model.gguf> [-p <question>] [--temp 0.6] [--backend cpu|cuda] [--ngl <layers>]");
             return 0;
     }
 }

diff --git a/samples/SharpInference.Sample.ToolCall/README.md b/samples/SharpInference.Sample.ToolCall/README.md
@@ -18,11 +18,11 @@ dotnet run --project samples/SharpInference.Sample.ToolCall -c Release -- \
 
 # CUDA — auto-detect how many layers fit in VRAM
 dotnet run --project samples/SharpInference.Sample.ToolCall -c Release -- \
-    -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda -g -1
+    -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda --ngl -1
 
 # CUDA — force all layers on GPU (will OOM if model doesn't fit)
 dotnet run --project samples/SharpInference.Sample.ToolCall -c Release -- \
-    -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda -g 999
+    -m models/Qwen3-35B-A3B-Q4_K_M.gguf --backend cuda --ngl 999
 ```
 
 Custom question:

diff --git a/scripts/bench-129-ab.ps1 b/scripts/bench-129-ab.ps1
@@ -34,7 +34,7 @@ if (-not (Test-Path $outDir)) { New-Item -ItemType Directory -Path $outDir | Out
 
 function Run([string]$label, [string]$dll, [bool]$warm) {
   $args = @($dll, "-m", $Model, "-p", $prompt, "--temp", "0", "-n", "$NTokens",
-            "-g", "-1", "--backend", "cuda", "--single-turn", "--verbose-prompt")
+            "--ngl", "-1", "--backend", "cuda", "--single-turn", "--verbose-prompt")
   $psi = New-Object System.Diagnostics.ProcessStartInfo
   $psi.FileName = $dotnet
   $psi.Arguments = ($args | ForEach-Object { if ($_ -match '\s') { "`"$_`"" } else { $_ } }) -join ' '