From 5ec7e9230eab50484d318e054aec30b893e0ee9b Mon Sep 17 00:00:00 2001 From: Tri Lam Date: Wed, 20 May 2026 15:35:42 -0700 Subject: [PATCH 1/2] [m19] chaos.yml pattern-pod-evicted row + real-world replay slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes two M19 carry-forwards (chaos matrix row, anonymized real-world fixture slot) and tightens MILESTONES bookkeeping. - chaos.yml gains a `pattern-pod-evicted` job that runs the hermetic `go test ./internal/synthesis/...` replay-corpus path under `-race`. The pod-evict CLI half is covered by a new SHA pin in `tools/failure-inject/testdata/golden.sha256` — the existing `harness-determinism` Golden-SHA-Pin loop now verifies pod-evict alongside xid. Mutation-verified: changing `--seed=1` flips the SHA and the gate fails. - `internal/synthesis/replay/pod_evicted/_real_world/` is the contribution slot for anonymized production captures. The replay loader's underscore-group convention (per `_negative/`) means new fixtures plug in without code changes; README.md documents the contribution checklist. Empty group today; contributions are the remaining M19 carry-forward. - MILESTONES rubric L409 flips ⧗ → ☑; carry-forward list trimmed from 5 → 3 (drops the chaos-row line and rewrites the real-world slot to point at the now-present `_real_world/` group). Carry-forward (3) detector-overhead bench was already ☑ on rubric L418 and is removed from the list as stale. - chaos.yml `paths:` filter gains `internal/synthesis/**` so detector / replay-corpus changes retrigger the workflow on PRs. Carry-forwards still open: detection-latency ≤5s p95 (needs live cluster); `--filler=/tmp` real-kubelet eviction (needs `--allow-cluster-write` kube-apiserver seam); real-world fixture contributions. 
Signed-off-by: Tri Lam --- .github/workflows/chaos.yml | 31 ++++++++-- MILESTONES.md | 6 +- .../replay/pod_evicted/_real_world/README.md | 58 +++++++++++++++++++ tools/failure-inject/testdata/golden.sha256 | 1 + 4 files changed, 87 insertions(+), 9 deletions(-) create mode 100644 internal/synthesis/replay/pod_evicted/_real_world/README.md diff --git a/.github/workflows/chaos.yml b/.github/workflows/chaos.yml index e1657ab6..68b1f55b 100644 --- a/.github/workflows/chaos.yml +++ b/.github/workflows/chaos.yml @@ -1,6 +1,6 @@ name: Chaos -# M4b nightly chaos / failure-injection workflow. Three jobs: +# M4b nightly chaos / failure-injection workflow. Jobs: # - harness-determinism: same argv + --seed produces byte-identical # output across two runs, matching tools/failure-inject/testdata/ # golden.sha256. @@ -10,13 +10,13 @@ name: Chaos # - cpu-steal-mpstat: failure-inject cpu-steal pins a busy-loop and # mpstat reports %steal+%user ≥ 95% on the pinned core for ≥ D-1 # seconds. +# - pattern-pod-evicted (M19): runs the hermetic replay-corpus +# detector test plus pins the pod-evict CLI output SHA so harness +# drift and detector drift are caught in the same workflow. # # Matrix-of-patterns rule: per MILESTONES.md §M4b the workflow grows -# a row when each pattern lands. Today M17 / M18 / M19 are open, so -# no pattern rows are forward-referenced. When M19 lands, add a -# `pattern: pod_evicted` matrix entry that invokes -# `failure-inject pod-evict --reason=DiskPressure` against the M19 -# detector's replay corpus. +# a row when each pattern lands. M17 / M18 are still open and will +# add their own rows when they land. 
on: schedule: @@ -28,11 +28,13 @@ on: paths: - "tools/failure-inject/**" - "internal/pipeline/chaos_test.go" + - "internal/synthesis/**" - ".github/workflows/chaos.yml" pull_request: paths: - "tools/failure-inject/**" - "internal/pipeline/chaos_test.go" + - "internal/synthesis/**" - ".github/workflows/chaos.yml" permissions: @@ -158,3 +160,20 @@ jobs: printf "PASS: average %%usr+%%steal=%.1f >= 95\n", total } ' mpstat.txt + + pattern-pod-evicted: + name: pattern-pod-evicted (M19) + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + cache: true + # Hermetic replay-corpus run: detector consumes the same canonical + # fixture the harness-determinism job pins via golden.sha256 (the + # pod-evict CLI argv row covers byte-stable emission). This step + # asserts the detector half — golden-verdict JSON diff under -race. + - name: Hermetic replay-corpus detector run + run: go test -race -count=1 ./internal/synthesis/... diff --git a/MILESTONES.md b/MILESTONES.md index da589eaf..c53ee005 100644 --- a/MILESTONES.md +++ b/MILESTONES.md @@ -398,15 +398,15 @@ Alpha unified-source logs receiver covering L2 + L9 (kernel + system events). Ta - **Status:** ⧗ partial - **Depends on:** M10, M4b -- **Landed:** `internal/synthesis/patterns/pod_evicted.go` + `internal/synthesis/replay/` runner + canonical + 3 negative fixtures + JSON schema; `k8sevents.NodeRecord` + Node informer + RBAC delta; `tracecore failure-inject pod-evict` CLI; `Record.ReportingInstance` for node-name joining. 
-- **Carry-forward:** (1) Anonymized real-world replay-corpus slot at `replay/pod_evicted/real_world_*/`; (2) detection-latency rubric (≤5s p95) requires live-cluster integration; (3) detector overhead bench (`bench_test.go` ≤1ms/eval on 1k-event window); (4) `chaos.yml` matrix row `pattern: pod_evicted` invoking `failure-inject pod-evict --reason=DiskPressure` against the replay corpus (FOLLOWUPS L927-934 trigger fired); (5) `--filler=/tmp` real-kubelet eviction flag for `failure-inject pod-evict` once `--allow-cluster-write` gains a kube-apiserver write seam. +- **Landed:** `internal/synthesis/patterns/pod_evicted.go` + `internal/synthesis/replay/` runner + canonical + 3 negative fixtures + `_real_world/` contribution slot + JSON schema; `k8sevents.NodeRecord` + Node informer + RBAC delta; `tracecore failure-inject pod-evict` CLI; `Record.ReportingInstance` for node-name joining; `chaos.yml` `pattern-pod-evicted` job + `pod-evict --reason=DiskPressure` golden SHA pin in `tools/failure-inject/testdata/golden.sha256`. +- **Carry-forward:** (1) Anonymized real-world fixtures populating `replay/pod_evicted/_real_world//` (slot + contribution README exist; no captures contributed yet); (2) detection-latency rubric (≤5s p95) requires live-cluster integration; (3) `--filler=/tmp` real-kubelet eviction flag for `failure-inject pod-evict` once `--allow-cluster-write` gains a kube-apiserver write seam. **Functional rubrics:** - ☑ Detector at `internal/synthesis/patterns/pod_evicted.go` exports `PodEvictedDetector` consumed by `internal/synthesis/replay/runner_test.go`. Generic `Detector` interface deferred until M17/M18 add a second consumer per PRINCIPLES §3 rule-of-three. - ☑ Input: stream of M10 typed records with `k8s.event.hint=pod_evicted` (derived from `Evicted` reason per M10's hint-mapping table) and node-condition records carrying `MemoryPressure`/`DiskPressure`/`PIDPressure`/`NodeNotReady`. 
(per https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/) NodeRecord surface ships in this PR with opt-in Node informer (`Config.NodeConditions.Enabled`; default false because of the wider RBAC scope). - ☑ For the canonical disk-pressure fixture, `Headline` matches `/Pod .* evicted at .* due to disk pressure/` and `Remediation` matches `/relocate.*NVMe/`. `--filler=/tmp` flag deferred to carry-forward — the CLI's dry-run YAML path supplies the replay corpus today. - ☑ Output: JSON-serializable `patterns.PodEvictedVerdict` with `pattern.id="14"`, `headline`, `evidence_trail[]`, `remediation`, `confidence`; lowercase JSON / uppercase Go. Schema-conformance test at `verdict_schema_test.go` loads `testdata/verdict.schema.json` (additionalProperties:false; drift rejection asserted). -- ⧗ Replay corpus at `internal/synthesis/replay/pod_evicted/canonical/`: deterministic synthetic fixture + manifest. Second slot for anonymized real-world capture is the carry-forward. +- ☑ Replay corpus at `internal/synthesis/replay/pod_evicted/canonical/`: deterministic synthetic fixture + manifest. Anonymized real-world contribution slot at `_real_world/` (with contribution README); populating it with captures is the carry-forward. - ☑ Detector emits null/no-match for `reason="Killing"`, `reason="Preempted"`, `reason="FailedScheduling"`. Negative-fixture tests under `_negative/` and per-subtest entries in `runner_test.go`. - ☑ Cross-receiver query is structured (typed accessor against M10's exported `Record` + `NodeRecord` types), not string grep; compile-time gate in `components/receivers/k8sevents/pattern_consumer_test.go` (Record + NodeRecord + Hint + NodePressureKind constants + attribute names all pinned). 
diff --git a/internal/synthesis/replay/pod_evicted/_real_world/README.md b/internal/synthesis/replay/pod_evicted/_real_world/README.md new file mode 100644 index 00000000..ee75739f --- /dev/null +++ b/internal/synthesis/replay/pod_evicted/_real_world/README.md @@ -0,0 +1,58 @@ +# Anonymized real-world replay fixtures + +This directory is the M19 carry-forward slot for anonymized, +real-world `pod_evicted` captures. Empty today; populated as +operators contribute traces from production clusters that the +canonical synthetic fixture cannot exercise. + +## Why a second fixture slot exists + +The canonical fixture at `../canonical/` covers one well-formed +disk-pressure scenario and is sufficient for the regression gate. +Real production captures regularly carry surprises the synthetic +case never anticipated: clock skew between kubelet and the node, +multi-condition transitions (DiskPressure + MemoryPressure within +the same second), partial node-condition updates, Reason strings +the upstream kubelet emits that diverge from the documented set, +etc. Each contributed fixture extends detector coverage to a +class of real failures the synthetic corpus cannot represent. + +## Directory shape + +Each contributed fixture lives in its own subdirectory and +follows the same layout the canonical fixture uses: + +``` +_real_world/ + <fixture-name>/ + manifest.json required + events.json required + node_conditions.json optional + golden.json required (verdict slice; [] for negatives) +``` + +The loader at `internal/synthesis/replay/runner.go` walks +underscore-prefixed groups (`_negative/`, `_real_world/`) one +level deeper and registers each subdirectory as a fixture, so +new captures plug in without any code change. + +## Contribution checklist + +1. **Anonymize first.** Strip pod names, namespaces, node names, + UIDs, image refs, and any user-identifying labels. Use stable + pseudonyms (e.g. 
`pod-aaa`, `node-bbb`) so the fixture is + self-consistent across `events.json` and `node_conditions.json`. + Reset timestamps to start at a fixed epoch so diffs stay + readable. +2. **Confirm the capture reproduces an interesting failure mode + the canonical fixture does not.** State which mode in the + manifest `description`. +3. **Land the golden verdict alongside.** If the detector misses + the real-world case, that is the point — file the gap in + `docs/FOLLOWUPS.md` and add the fixture under `_negative/` + until the detector catches up. The replay corpus is the + evidence trail. +4. **Make `manifest.json`'s `fixture_name` unique** across the + corpus; the runner uses it for test-failure labels. + +No contributed fixtures yet — this README is the contract. diff --git a/tools/failure-inject/testdata/golden.sha256 b/tools/failure-inject/testdata/golden.sha256 index 56e0038b..7454a9fb 100644 --- a/tools/failure-inject/testdata/golden.sha256 +++ b/tools/failure-inject/testdata/golden.sha256 @@ -1,2 +1,3 @@ 859b96587fb10e47321ef4b459e2f83c7097d27b42e7f4611a7a8db39573173f failure-inject --seed=0 xid --code=79 --format=kmsg d8df057d69076da16d67dd02af22bd745f1d36faf09b311410b89b89a79c8775 failure-inject --seed=0 xid --code=79 --format=journald +1c25d41430eeaccbede07ba3b38d5f4ce848e8025642881f9180d023071f8f4e failure-inject --seed=0 pod-evict --reason=DiskPressure From 281b12330d4a76908eabe39febdbed457eb19ed7 Mon Sep 17 00:00:00 2001 From: Tri Lam Date: Wed, 20 May 2026 16:07:12 -0700 Subject: [PATCH 2/2] [m19] A+ pass: loader regression test, fixture template, em-dash cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five small additions on top of the M19 chaos-replay PR after self-review surfaced gaps the single-pass review missed: - `TestPodEvictedReplay_RealWorldGroupLoaderSafe` asserts the `_real_world/` group walks empty and contributes 0 fixtures (canonical + 3 negatives = 4 total). 
Mutation-verified by dropping a manifest-less subdir and watching the test fail with the path citation; restored. Catches contributor partial-fixture drops before the rubric harness sees them. - `_real_world/README.md` gains an inline fixture template (manifest / events / node_conditions / golden JSON shapes with placeholder values). Reduces contribution friction from "reverse-engineer from canonical/" to "copy template, edit placeholders". Template lives in the README rather than as a fixture dir because every subdir of `_real_world/` is loaded; a directory-based template would either run as a test or require a loader special-case neither rule-of-three justifies. - `pattern-pod-evicted` job timeout tightened from 10min to 5min (observed runtime 1m48s; ~3× headroom). Fails fast on flake rather than soaking the chaos-job boilerplate ceiling. - MILESTONES carry-forward (1) clause documents the `real_world_*/` → `_real_world/` rename and its reason (loader-convention parity with the existing `_negative/` group). Original carry-forward text named the un-prefixed shape; the loader's underscore-group walk is the constraint. - Four em-dashes in the new README replaced with semicolons / full stops to satisfy `make doc-check`'s diff-scope em-dash gate. The diff-gate compares vs origin/main and the em-dashes were introduced by this PR, so they are this PR's punctuation to fix. Explicitly skipped: per-job path filtering trim of `internal/synthesis/**` from `harness-determinism` (CONCERN flagged during review). Would need either `dorny/paths-filter` (added dependency) or a workflow split (scope creep). The ~2min of redundant CI serves two distinct gates (CLI bytes vs detector verdict) and is justified per PRINCIPLES §13 operator-first. Revisit if the redundancy pattern recurs. 
Signed-off-by: Tri Lam --- .github/workflows/chaos.yml | 3 +- MILESTONES.md | 2 +- .../replay/pod_evicted/_real_world/README.md | 94 ++++++++++++++++++- internal/synthesis/replay/runner_test.go | 19 ++++ 4 files changed, 114 insertions(+), 4 deletions(-) diff --git a/.github/workflows/chaos.yml b/.github/workflows/chaos.yml index 68b1f55b..97bfda52 100644 --- a/.github/workflows/chaos.yml +++ b/.github/workflows/chaos.yml @@ -164,7 +164,8 @@ jobs: pattern-pod-evicted: name: pattern-pod-evicted (M19) runs-on: ubuntu-latest - timeout-minutes: 10 + # Observed runtime 1m48s; 5min is ~3× headroom and fails fast on flake. + timeout-minutes: 5 steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 diff --git a/MILESTONES.md b/MILESTONES.md index c53ee005..99d0c56c 100644 --- a/MILESTONES.md +++ b/MILESTONES.md @@ -399,7 +399,7 @@ Alpha unified-source logs receiver covering L2 + L9 (kernel + system events). Ta - **Status:** ⧗ partial - **Depends on:** M10, M4b - **Landed:** `internal/synthesis/patterns/pod_evicted.go` + `internal/synthesis/replay/` runner + canonical + 3 negative fixtures + `_real_world/` contribution slot + JSON schema; `k8sevents.NodeRecord` + Node informer + RBAC delta; `tracecore failure-inject pod-evict` CLI; `Record.ReportingInstance` for node-name joining; `chaos.yml` `pattern-pod-evicted` job + `pod-evict --reason=DiskPressure` golden SHA pin in `tools/failure-inject/testdata/golden.sha256`. -- **Carry-forward:** (1) Anonymized real-world fixtures populating `replay/pod_evicted/_real_world//` (slot + contribution README exist; no captures contributed yet); (2) detection-latency rubric (≤5s p95) requires live-cluster integration; (3) `--filler=/tmp` real-kubelet eviction flag for `failure-inject pod-evict` once `--allow-cluster-write` gains a kube-apiserver write seam. 
+- **Carry-forward:** (1) Anonymized real-world fixtures populating `replay/pod_evicted/_real_world/<fixture-name>/` (slot + contribution README + fixture-shape template exist; no captures contributed yet). Slot uses the `_real_world/` underscore prefix (was `real_world_*/` in the original carry-forward text) for parity with the existing `_negative/` group convention the loader already walks; (2) detection-latency rubric (≤5s p95) requires live-cluster integration; (3) `--filler=/tmp` real-kubelet eviction flag for `failure-inject pod-evict` once `--allow-cluster-write` gains a kube-apiserver write seam. **Functional rubrics:** - ☑ Detector at `internal/synthesis/patterns/pod_evicted.go` exports `PodEvictedDetector` consumed by `internal/synthesis/replay/runner_test.go`. Generic `Detector` interface deferred until M17/M18 add a second consumer per PRINCIPLES §3 rule-of-three. diff --git a/internal/synthesis/replay/pod_evicted/_real_world/README.md b/internal/synthesis/replay/pod_evicted/_real_world/README.md index ee75739f..872ef36d 100644 --- a/internal/synthesis/replay/pod_evicted/_real_world/README.md +++ b/internal/synthesis/replay/pod_evicted/_real_world/README.md @@ -48,11 +48,101 @@ new captures plug in without any code change. the canonical fixture does not.** State which mode in the manifest `description`. 3. **Land the golden verdict alongside.** If the detector misses - the real-world case, that is the point — file the gap in + the real-world case, that is the point; file the gap in `docs/FOLLOWUPS.md` and add the fixture under `_negative/` until the detector catches up. The replay corpus is the evidence trail. 4. **Make `manifest.json`'s `fixture_name` unique** across the corpus; the runner uses it for test-failure labels. -No contributed fixtures yet — this README is the contract. +No contributed fixtures yet; this README is the contract. 
+ +## Fixture template + +Copy these four files into `_real_world/<fixture-name>/`, +fill in the placeholders, and the loader picks the fixture up +automatically (no code change needed). Field shapes are pinned +by `components/receivers/k8sevents/record.go` (`Record`, +`NodeRecord`) and `internal/synthesis/patterns/pod_evicted.go` +(`PodEvictedVerdict`). + +`manifest.json`: + +```json +{ + "pattern_id": "14", + "fixture_name": "real_world_<fixture-name>", + "description": "One sentence on what real failure mode this capture exercises that the canonical fixture does not.", + "expected_timing": "Optional: node-condition / eviction event ordering relevant to the detector decision." +} +``` + +`events.json` (one or more pod-events; `regarding.uid` and +`reporting_instance` are the join keys the detector uses): + +```json +[ + { + "event_uid": "<event-uid>", + "action": "Evicting", + "reason": "Evicted", + "hint": "pod_evicted", + "regarding": { + "kind": "Pod", + "namespace": "<namespace>", + "name": "<pod-name>", + "uid": "<pod-uid>" + }, + "reporting_controller": "kubelet", + "reporting_instance": "<node-name>", + "note": "The node was low on resource: ephemeral-storage.", + "event_time": "2026-01-01T00:00:00Z", + "type": "Warning" + } +] +``` + +`node_conditions.json` (optional; omit the file if the capture +does not include node-condition data; `pressure` is one of +`disk` / `memory` / `pid` / `not_ready`): + +```json +[ + { + "node_name": "<node-name>", + "node_uid": "<node-uid>", + "hint": "node_pressure", + "pressure": "disk", + "transition_at": "2025-12-31T23:59:55Z", + "message": "imagefs.available<15%" + } +] +``` + +`golden.json` (the verdict slice the detector must produce; +empty `[]` for negative fixtures where the detector should +emit nothing): + +```json +[ + { + "pattern.id": "14", + "headline": "Pod <namespace>/<pod-name> evicted at 2026-01-01T00:00:00Z due to disk pressure", + "remediation": "On node <node-name>: Free imagefs or relocate the training write path to NVMe; tighten kubelet --eviction-hard nodefs.available.", + "confidence": "full", + "evidence_trail": [ + { + "kind": 
"pod_event", + "uid": "<pod-uid>", + "timestamp": "2026-01-01T00:00:00Z", + "description": "Pod <namespace>/<pod-name> evicted (reason=Evicted) on node <node-name>" + } + ] + } +] +``` + +Run `go test ./internal/synthesis/...` after dropping the +fixture in; a passing run means the fixture parses and the +detector's verdict matches `golden.json`. A JSON-diff failure +points at the first mismatched field. diff --git a/internal/synthesis/replay/runner_test.go b/internal/synthesis/replay/runner_test.go index 4401fa7b..ef987685 100644 --- a/internal/synthesis/replay/runner_test.go +++ b/internal/synthesis/replay/runner_test.go @@ -63,3 +63,22 @@ func TestPodEvictedReplay_CanonicalFixturePresent(t *testing.T) { require.Contains(t, names, "preempted", "rubric L407 negative-fixture: Preempted") require.Contains(t, names, "failed_scheduling", "rubric L407 negative-fixture: FailedScheduling") } + +// TestPodEvictedReplay_RealWorldGroupLoaderSafe asserts the +// `_real_world/` contribution slot is loader-safe: the loader walks +// the group without error and still returns at least the four +// canonical+negative fixtures, empty or populated. Falsifier: a dir +// under `_real_world/` without a manifest.json makes LoadFixture +// fail; this test catches that before it reaches the rubric harness. +func TestPodEvictedReplay_RealWorldGroupLoaderSafe(t *testing.T) { + t.Parallel() + + fixtures, err := replay.LoadFixturesUnder("pod_evicted") + require.NoError(t, err) + require.GreaterOrEqual(t, len(fixtures), 4, + "expected at least canonical + 3 negatives; _real_world/ contributions may add fixtures but must never remove or break these") + + for _, f := range fixtures { + require.NotEqual(t, "_real_world", f.Name, "_real_world is a group, not a fixture") + } +}