TraceCoreAI · trilamsr · Jun 2, 2026 · Jun 2, 2026
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -7,13 +7,24 @@
 #
 # Triggers: push to main (capture baseline at every merge) +
 # workflow_dispatch (operator can re-run on demand to compare an
-# arbitrary SHA).
+# arbitrary SHA, or capture a fresh linux/amd64 baseline).
+#
+# Artifact uploads (issue #420 Phase 2): every run uploads the raw
+# `go test -bench` output for `./bench/detectors/`. The script
+# `scripts/bench-cv-rolling.sh` consumes the last N artifacts to compute
+# per-detector allocs/op CV — the load-bearing input for the soft → hard
+# gate graduation decision documented in `bench/detectors/README.md`.
 name: bench
 
 on:
   push:
     branches: [main]
   workflow_dispatch:
+    inputs:
+      capture_linux_baseline:
+        description: 'If true, run the per-detector bench at full count=10 x 500ms and surface a baselines.json diff in the job summary (linux/amd64 baseline capture mode, issue #420).'
+        required: false
+        default: 'false'
 
 permissions:
   contents: read
@@ -22,7 +33,7 @@ jobs:
   patterns:
     name: bench-patterns
     runs-on: ubuntu-latest
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c  # v6.4.0
@@ -50,7 +61,9 @@ jobs:
       - name: bench-detectors-check (soft gate, issue #302)
         # Soft gate per bench/detectors/README.md graduation criterion.
         # The script always exits 0 today; flip to hard-fail once
-        # N=10 PRs see alloc-CV < 1% on every detector.
+        # N=10 PRs see alloc-CV < 1% on every detector. Issue #420
+        # tracks the graduation criteria + the CV-rolling measurement
+        # scaffold that produces an objective flip decision.
         run: |
           set -euo pipefail
           bash scripts/bench-check-detectors.sh | tee /tmp/bench-detectors.txt
@@ -62,3 +75,64 @@ jobs:
             cat /tmp/bench-detectors.txt
             echo '```'
           } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Capture raw per-detector bench output (issue #420)
+        # The CV-rolling computation needs the *raw* `go test -bench`
+        # output (count=10 samples per detector). The earlier
+        # `bench-check-detectors.sh` step already prints the median + a
+        # delta summary, which collapses the variance signal. Re-run the
+        # raw bench into bench-artifacts/ (plus a metadata.txt sidecar) so
+        # `scripts/bench-cv-rolling.sh` can compute per-detector allocs/op
+        # CV across the last N workflow runs once the artifact is uploaded.
+        run: |
+          set -euo pipefail
+          mkdir -p bench-artifacts
+          go test -bench=. -benchmem -benchtime=500ms -count=10 \
+            -run='^$' ./bench/detectors/ \
+            | tee bench-artifacts/bench-detectors-raw.txt
+          {
+            echo "sha=${GITHUB_SHA}"
+            echo "run_id=${GITHUB_RUN_ID}"
+            echo "timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+            echo "runner=ubuntu-latest"
+            echo "arch=amd64"
+          } > bench-artifacts/metadata.txt
+
+      - name: Linux baseline capture (issue #420, workflow_dispatch only)
+        if: github.event_name == 'workflow_dispatch' && github.event.inputs.capture_linux_baseline == 'true'
+        # Regenerates the baseline via scripts/bench-baseline-detectors.sh
+        # on linux/amd64, writes `git diff -- bench/detectors/baselines.json`
+        # to the job summary so the operator can confirm M1-vs-linux
+        # equivalence (or flag drift requiring a re-baseline PR), and copies
+        # the regenerated file into bench-artifacts/. Does NOT open a PR
+        # automatically — a follow-up `release-prep` style PR is the right
+        # shape since ceiling moves require adversarial review.
+        run: |
+          set -euo pipefail
+          if ! command -v jq >/dev/null 2>&1; then
+            sudo apt-get update -qq && sudo apt-get install -y -qq jq
+          fi
+          bash scripts/bench-baseline-detectors.sh
+          {
+            echo ""
+            echo "### Linux/amd64 baseline capture (issue #420)"
+            echo ""
+            echo "Diff vs committed baselines.json (empty = M1 baseline confirmed on linux/amd64):"
+            echo ""
+            echo '```diff'
+            git --no-pager diff -- bench/detectors/baselines.json || true
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+          cp bench/detectors/baselines.json bench-artifacts/baselines-linux-amd64.json
+
+      - name: Upload bench artifact (issue #420)
+        # Always upload — `always()` runs this step even when an earlier
+        # step failed, so the CV-rolling script gets a sample regardless
+        # of pass/fail (missing files only warn; artifacts kept 90 days).
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
+        with:
+          name: bench-detectors-raw-${{ github.run_id }}
+          path: bench-artifacts/
+          if-no-files-found: warn
+          retention-days: 90
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
 .PHONY: help build clean hooks
 
 # Test suites
-.PHONY: test test-extras test-extras-sustained test-extras-fuzz test-extras-fuzz-kmsg test-extras-fuzz-journald test-extras-fuzz-nccl-fr test-extras-race bench bench-check bench-allocs-check bench-baseline bench-detectors bench-detectors-check bench-detectors-baseline
+.PHONY: test test-extras test-extras-sustained test-extras-fuzz test-extras-fuzz-kmsg test-extras-fuzz-journald test-extras-fuzz-nccl-fr test-extras-race bench bench-check bench-allocs-check bench-baseline bench-detectors bench-detectors-check bench-detectors-baseline bench-cv-report
 
 # Format + tidy
 .PHONY: fmt fmt-fix vet lint lint-fix tidy tidy-check mod-verify bump-otel
@@ -105,6 +105,12 @@ bench-detectors-check:  ## Compare current per-detector allocs/op against bench/
 bench-detectors-baseline:  ## Regenerate bench/detectors/baselines.json on local hardware. Commit the diff after vetting.
 	scripts/bench-baseline-detectors.sh
 
+bench-cv-report:  ## Print per-detector allocs/op CV across the last N bench.yml runs (issue #420). Drives the soft→hard gate graduation decision. Requires `gh` auth; falls back to baselines.json single-sample if unauthed.
+	@# N override: `N=20 make bench-cv-report` walks the last 20 successful
+	@# bench.yml runs instead of 10. Cache lives under
+	@# /tmp/tracecore-bench-artifacts/ — safe to wipe between sessions.
+	scripts/bench-cv-rolling.sh
+
 
 fmt:  ## Check formatting; fails if any file is not gofumpt-clean.
 	@# gofumpt has no native exclude flag; filter ./_build/ (OCB-generated,

diff --git a/bench/detectors/README.md b/bench/detectors/README.md
@@ -78,20 +78,70 @@ Soft-gate posture is the right shape at v0.4 because:
 
 ## Graduation criterion: soft → hard
 
-Flip `$gate_mode` in `baselines.json` from `soft` to `hard` when:
-
-- **N ≥ 10 consecutive PRs** have landed without a flake-driven false
-  positive on any of the 6 detectors.
-- **alloc-CV < 1%** on each detector across those 10 PRs. Compute by
-  collecting CI's bench output per-PR and running
-  `python -c 'import statistics; print(statistics.stdev(<allocs>) / statistics.mean(<allocs>))'`
-  on each detector's column.
-- **No detector pending an in-flight refactor**. A detector under
-  refactor in an open PR pins its old alloc count to its baseline;
-  flipping the gate hard would block the refactor's merge.
-
-The graduation PR sets `gate_mode: "hard"` and changes
-`scripts/bench-check-detectors.sh` to `exit 1` on regression > 10%.
+Flip `$gate_mode` in `baselines.json` from `soft` to `hard` when every
+box below is checked.
+
+### Measurement (automated, issue #420)
+
+The CV-rolling measurement scaffold ships in PR-#420-followup:
+
+- `.github/workflows/bench.yml` uploads the raw `go test -bench`
+  output as a per-run artifact (`bench-detectors-raw-<run_id>`).
+- `scripts/bench-cv-rolling.sh` pulls the last N artifacts via
+  `gh run download` and prints a per-detector CV table.
+- `make bench-cv-report` is the operator-facing entry point.
+
+Run the report at any time:
+
+```bash
+make bench-cv-report                  # last 10 main-branch runs
+N=20 make bench-cv-report             # last 20 runs
+```
+
+Each detector row shows `n_samples / mean / stdev / cv% / gate-status`.
+The gate-status column reads `OK (<1%)` or `OVER (graduation blocked)`
+against the 1% CV threshold.
+
+### Graduation checklist
+
+Tick every box before opening the graduation PR. The `make bench-cv-report`
+box is the load-bearing automated check; the others are operator judgment.
+
+- [ ] **N ≥ 10 consecutive main-branch `bench.yml` runs** have landed
+      with the artifact upload step green (i.e. 10 runs of CV-rolling
+      data exist).
+- [ ] `make bench-cv-report` shows **`OK (<1%)`** for every detector:
+    - [ ] `BenchmarkPodEvictedDetector`
+    - [ ] `BenchmarkXidCorrelationDetector`
+    - [ ] `BenchmarkHBMECCDetector`
+    - [ ] `BenchmarkNCCLHangDetector`
+    - [ ] `BenchmarkThermalThrottleDetector`
+    - [ ] `BenchmarkPCIeAERDetector`
+- [ ] **Zero flake-driven false positives** on `bench-detectors-check`
+      across those 10 runs (i.e. no run triggered a >10% delta warning
+      that turned out to be runner noise on rerun).
+- [ ] **No detector pending an in-flight refactor**. A detector under
+      refactor in an open PR pins its old alloc count to its baseline;
+      flipping the gate hard would block the refactor's merge. Confirm
+      via `gh pr list --search "bench-ratchet OR detector"`.
+- [ ] **Linux/amd64 baseline confirmed**. Trigger the
+      `workflow_dispatch` run of `bench.yml` with input
+      `capture_linux_baseline=true`; verify the job-summary diff vs
+      committed `baselines.json` is empty (or within ±1 alloc/op per
+      detector — this README's stated tolerance). If a detector
+      consistently drifts by ≥ 2 allocs/op, land a re-baseline PR
+      before graduation.
+
+### The graduation PR
+
+The graduation PR is one diff:
+
+1. `bench/detectors/baselines.json` — flip `$gate_mode: "soft"` →
+   `"hard"`.
+2. `scripts/bench-check-detectors.sh` — change the final `exit 0` to
+   `exit "$status"`.
+3. PR body links to the `make bench-cv-report` output (paste table)
+   as the evidence trail.
 
 ## Adding a new detector