From e95f941813f927e3c5a5129c16de7a97279fd00d Mon Sep 17 00:00:00 2001 From: Tri Lam Date: Mon, 1 Jun 2026 15:09:37 -0700 Subject: [PATCH] test(sdk): single-source verdict fixtures across Go + Python (#368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shipped-pattern verdict fixtures lived in three places — the Go SDK decode test, the Python SDK decode test, and the envelope schema test — and had to be hand-synced. Drift was inevitable. Replace the Go SDK and Python SDK literal fixture sets with one canonical JSON file at docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json, and add an envelope schema subtest that validates the same file (the existing typed-struct envelope cases are unchanged). All three test sites parametrise off it at test time. Adding a pattern verdict's SDK fixture is now a one-file edit. New CI gate `make verdict-fixtures-check` asserts the canonical file exists, every consumer still references it by path, and all three test sites round-trip the fixture set green. Wired into `verify` and `ci-full`. Mutation-verified — deleting the canonical file fails the gate, and dropping a reference from any consumer fails the gate. Net diff excluding the new 156-line fixture file: -204 / +176 (332 insertions in total; over 100 lines of duplicated fixture literals eliminated). 
Signed-off-by: Tri Lam --- Makefile | 37 ++++- docs/schemas/README.md | 13 +- .../fixtures/shipped-patterns-v1.0.0-rc1.json | 156 ++++++++++++++++++ .../patterns/verdict_envelope_schema_test.go | 51 ++++++ module/sdk/verdict/decode_test.go | 137 ++++++--------- python/tracecore_verdict/test_decode.py | 142 ++++------------ 6 files changed, 332 insertions(+), 204 deletions(-) create mode 100644 docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json diff --git a/Makefile b/Makefile index 9bcccd2b..c889c42c 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ .PHONY: fmt fmt-fix vet lint lint-fix tidy tidy-check mod-verify bump-otel # Code generation -.PHONY: generate-fixtures generate-fixtures-check +.PHONY: generate-fixtures generate-fixtures-check verdict-fixtures-check # Coverage .PHONY: coverage coverage-check @@ -184,6 +184,37 @@ generate-fixtures-check: ## Fail if `make generate-fixtures` would produce a di git --no-pager diff -- module/pkg/nccl/fr_parser/testdata/; exit 1; \ fi +verdict-fixtures-check: ## CI gate for issue #368: assert the canonical shipped-pattern fixture file exists and is consumed by Go SDK + Python SDK + envelope schema tests. Detects drift at the source. + @# Single source of truth: docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json + @# Three test suites read from it; if anyone reintroduces a hand-rolled + @# fixture literal in any of the three sites, the grep below catches it. + @if [ ! -f docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json ]; then \ + echo "docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json missing — this is the cross-SDK fixture contract (issue #368)."; exit 1; \ + fi + @# Confirm each consumer references the canonical file by path. The + @# string match is the cheapest possible "is this site still wired up" + @# gate; replacing it with a hand-rolled fixture literal would drop + @# the reference and trip this guard. 
+ @for f in \ module/sdk/verdict/decode_test.go \ python/tracecore_verdict/test_decode.py \ module/pkg/patterns/verdict_envelope_schema_test.go; do \ if ! grep -q 'shipped-patterns-v1\.0\.0-rc1\.json' "$$f"; then \ echo "$$f no longer references docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json"; \ echo " Re-wire it to the canonical fixture file (issue #368) or this drift gate stays red."; \ exit 1; \ fi; \ done + @# And run the three test sites uncached (-count=1 for Go), stopping at the + @# first failing suite: any divergence between the canonical fixtures and + @# what each suite round-trips against trips here. + @(cd module && GOWORK=off go test -count=1 -run 'TestDecode_AllShippedPatternsRoundTrip|TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate' ./sdk/verdict/ ./pkg/patterns/) || { \ echo "verdict-fixtures-check: Go fixture round-trip failed; see test output above (issue #368)."; exit 1; \ } + @(cd python/tracecore_verdict && python3 -m pytest -q test_decode.py::test_decode_all_shipped_patterns_round_trip) || { \ echo "verdict-fixtures-check: Python fixture round-trip failed; see test output above (issue #368)."; exit 1; \ } + coverage: ## Run all tests under the race detector with coverage profiling; emit coverage.out + coverage.html. @# This is the single canonical test-execution target in `make ci`: @# it runs every test in the repo with -race AND collects coverage, @@ -243,7 +274,7 @@ cut-criteria-check: ## Drift gate: rendered docs/v1-rc1-cut-criteria.md must ma fi; \ rm -f "$$tmp" -verify: check license-check generate-fixtures-check build-tags nccl-fr-rce-gate register-lint actionlint zizmor doc-check deprecation-check no-autoupdate-check ## Pre-push gate. Medium (<30s); CI handles heavy gates (test, coverage, govulncheck, fuzz, build). +verify: check license-check generate-fixtures-check verdict-fixtures-check build-tags nccl-fr-rce-gate register-lint actionlint zizmor doc-check deprecation-check no-autoupdate-check ## Pre-push gate. 
Medium (<30s); CI handles heavy gates (test, coverage, govulncheck, fuzz, build). test-extras-sustained: ## (sub-target) sustained-load (5 min); see `make test-extras`. @# kernelevents was the sole sustained-load suite consumer; deleted @@ -342,7 +373,7 @@ ci-fast: lint vet mod-verify attribute-namespace-check doc-check ## Fast-feedba # back-compat alias so existing scripts, docs, and hooks invoking `make ci` # remain semantically unchanged. New callers should prefer the explicit # `ci-fast` (inner loop) vs `ci-full` (pre-PR) split per PRINCIPLES §10. -ci-full: license-check generate-fixtures-check vet build-tags tidy-check mod-verify lint nccl-fr-rce-gate register-lint actionlint zizmor coverage-check ci-fuzz-nccl-fr govulncheck doc-check deprecation-check no-autoupdate-check build smoke-quickstart ## Everything CI runs. Run before opening a PR. +ci-full: license-check generate-fixtures-check verdict-fixtures-check vet build-tags tidy-check mod-verify lint nccl-fr-rce-gate register-lint actionlint zizmor coverage-check ci-fuzz-nccl-fr govulncheck doc-check deprecation-check no-autoupdate-check build smoke-quickstart ## Everything CI runs. Run before opening a PR. ci: ci-full ## Back-compat alias for `ci-full`. Existing scripts and `make ci` invocations keep working unchanged. 
diff --git a/docs/schemas/README.md b/docs/schemas/README.md index 4010e4a9..5a227f62 100644 --- a/docs/schemas/README.md +++ b/docs/schemas/README.md @@ -10,6 +10,7 @@ released version; consumers pin to that filename rather than chasing | Artifact | Surface | Status | Pinned by | |---|---|---|---| | [`verdict-1.0.0-rc1.json`](verdict-1.0.0-rc1.json) | Pattern-detector verdict envelope (every pattern emits this shape; per-pattern fields layer on top) | `v1.0-rc1` — published for [v1-rc1 cut criterion 2](../v1-rc1-cut-criteria.md#2-verdict-schema-v10-published-and-stable) | [`module/pkg/patterns/verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go) | +| [`fixtures/shipped-patterns-v1.0.0-rc1.json`](fixtures/shipped-patterns-v1.0.0-rc1.json) | Cross-SDK shipped-verdict fixture set (single source of truth for the Go + Python SDK suites and the envelope schema test, issue [#368](https://github.com/TraceCoreAI/tracecore/issues/368)) | `v1.0-rc1` — one row per shipped pattern | Go SDK ([`module/sdk/verdict/decode_test.go`](../../module/sdk/verdict/decode_test.go)), Python SDK ([`python/tracecore_verdict/test_decode.py`](../../python/tracecore_verdict/test_decode.py)), envelope test ([`module/pkg/patterns/verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go)) | | [`nccl_fr/v0.md`](nccl_fr/v0.md) | NCCL FlightRecorder receiver — OTel attribute vocabulary on the emitted log records | `v0` — additive within major | `schema_url` on every ResourceLogs / ScopeLogs envelope the receiver emits | ## Verdict envelope (`verdict-1.0.0-rc1.json`) @@ -93,8 +94,15 @@ cut. that is a minor-version bump (additive). 4. Add a fixture row to `TestVerdictEnvelopeV1RC1_AllShippedVerdictsValidate` in - [`verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go). -5. 
Cross-link from + [`verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go) + (typed-struct envelope-validation case). +5. Add one row to the canonical cross-SDK fixture file + [`fixtures/shipped-patterns-v1.0.0-rc1.json`](fixtures/shipped-patterns-v1.0.0-rc1.json). + This is the ONLY edit the Go SDK, Python SDK, and the + `TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate` + subtest need — they all consume the same file (issue + [#368](https://github.com/TraceCoreAI/tracecore/issues/368)). +6. Cross-link from [`docs/ATTRIBUTES.md`](../ATTRIBUTES.md#pattern--verdict-log-record-attributes). ## How drift is caught @@ -102,6 +110,7 @@ cut. | Drift class | Caught by | |---|---| | Envelope-level field added / removed / loosened in the Go type without a schema bump | `TestVerdictEnvelopeV1RC1_AllShippedVerdictsValidate` (every shipped Verdict round-trips through the published artifact) | +| Cross-SDK shipped-fixture drift (Go suite, Python suite, envelope test all disagree on the shipped-verdict set) | `TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate` + the SDK-side `TestDecode_AllShippedPatternsRoundTrip` / `test_decode_all_shipped_patterns_round_trip` parametrise off the one canonical file — drift is impossible because there is only one source | | Envelope guard removed in the schema (e.g. 
`headline` becomes optional, `confidence` enum loosens) | `TestVerdictEnvelopeV1RC1_RejectsEnvelopeDrift` (ten falsifier rows; loosening a guard flips its row to PASS) | | `$id` changed without a major-version bump | `TestVerdictEnvelopeV1RC1_HasStableID` (pins the published `$id` literal) | | Per-pattern field drift | per-pattern `TestVerdict_SchemaConformance` / `_SchemaRejectsDrift` | diff --git a/docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json b/docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json new file mode 100644 index 00000000..f519caca --- /dev/null +++ b/docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json @@ -0,0 +1,156 @@ +{ + "$comment": "Canonical v1.0-rc1 verdict fixtures — single source of truth shared by the Go SDK (module/sdk/verdict/decode_test.go), the Python SDK (python/tracecore_verdict/test_decode.py), and the envelope schema test (module/pkg/patterns/verdict_envelope_schema_test.go). Adding a pattern verdict at rc1 or v0.4 MUST add a fixture HERE — the three test suites read this file at test time, so hand-rolled drift is impossible. 
See issue #368.", + "$schema_ref": "../verdict-1.0.0-rc1.json", + "fixtures": [ + { + "name": "pod_evicted", + "payload": { + "pattern.id": "14", + "headline": "h", + "remediation": "r", + "confidence": "full", + "evidence_trail": [ + {"kind": "pod_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + {"kind": "node_condition", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "k8s.pod.name": "p", + "k8s.pod.namespace": "ns", + "k8s.node.name": "n" + } + }, + { + "name": "nccl_hang", + "payload": { + "pattern.id": "15", + "headline": "h", + "remediation": "r", + "evidence_trail": [ + {"kind": "nccl_fr", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "pg_id": 7, + "collective_seq_id": 42, + "hanging_ranks": [1, 3] + } + }, + { + "name": "xid_correlation", + "payload": { + "pattern.id": "16", + "headline": "h", + "remediation": "r", + "evidence_trail": [ + {"kind": "kernel_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + {"kind": "pod_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "xid_code": 79, + "node": "n", + "evicted_pod": "ns/p" + } + }, + { + "name": "hbm_ecc", + "payload": { + "pattern.id": "17", + "headline": "h", + "remediation": "r", + "evidence_trail": [ + {"kind": "hw_error", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + {"kind": "kernel_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "xid_code": 48, + "gpu_id": "0000:31:00.0", + "ecc_delta": 1, + "node": "n" + } + }, + { + "name": "thermal_throttle", + "payload": { + "pattern.id": "18", + "headline": "h", + "remediation": "r", + "evidence_trail": [ + {"kind": "hw_throttle", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + {"kind": "hw_throttle", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "node": "n", + "gpu_count": 2, + "gpu_ids": 
["0000:31:00.0", "0000:32:00.0"] + } + }, + { + "name": "pcie_aer", + "payload": { + "pattern.id": "19", + "headline": "h", + "remediation": "r", + "evidence_trail": [ + {"kind": "pcie_aer", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + {"kind": "hw_io_collapse", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "gpu_id": "0000:31:00.0", + "severity": "Fatal", + "aer_type": "Data Link Layer", + "drop_ratio": 0.9, + "node": "n" + } + }, + { + "name": "cuda_oom", + "payload": { + "pattern.id": "20", + "headline": "h", + "remediation": "r", + "confidence": "full", + "evidence_trail": [ + {"kind": "cuda_oom", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + {"kind": "hw_fb", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "gpu_id": "0000:31:00.0", + "node": "n", + "kind": "true_oom", + "tried_alloc_bytes": 1024, + "fb_free_bytes": 0, + "fb_free_ratio": 0.0 + } + }, + { + "name": "ib_link_flap", + "payload": { + "pattern.id": "21", + "headline": "h", + "remediation": "r", + "confidence": "partial", + "evidence_trail": [ + {"kind": "ib_port_state", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "missing_layers": ["hw_throttle"], + "node": "n", + "hca_device": "mlx5_0", + "port": 1, + "transition_count": 3 + } + }, + { + "name": "silent_data_corruption", + "payload": { + "pattern.id": "13", + "headline": "h", + "remediation": "r", + "confidence": "full", + "kind": "vendor_signaled", + "evidence_trail": [ + {"kind": "hw_gpu_sdc", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + {"kind": "gen_ai_training_eval_accuracy", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"} + ], + "gen_ai.training.job_id": "job-42", + "accuracy_drop": 0.15, + "baseline_accuracy": 0.95, + "observed_accuracy": 0.80, + "suspect_gpu_id": "0000:31:00.0", + "suspect_node": "n", + "sdc_counter_delta": 3 + } + } + ] +} diff --git 
a/module/pkg/patterns/verdict_envelope_schema_test.go b/module/pkg/patterns/verdict_envelope_schema_test.go index 252f6b7e..2d1ddb2a 100644 --- a/module/pkg/patterns/verdict_envelope_schema_test.go +++ b/module/pkg/patterns/verdict_envelope_schema_test.go @@ -162,6 +162,57 @@ func TestVerdictEnvelopeV1RC1_AllShippedVerdictsValidate(t *testing.T) { } } +// canonicalShippedFixturesPath is the cross-SDK fixture file (issue +// #368) — the SAME file the Go + Python verdict SDK suites consume. +// Validating it against the envelope schema here closes the loop: +// the fixture set the SDK suites round-trip against is provably an +// envelope-conforming subset, so a fixture-edit that schema-drifts +// fails this test BEFORE it ships in either SDK. +var canonicalShippedFixturesPath = filepath.Join( + "..", "..", "..", "docs", "schemas", "fixtures", "shipped-patterns-v1.0.0-rc1.json", +) + +// TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate asserts +// every row in docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json +// (the shared Go+Python SDK fixture set) validates against the +// published envelope schema. The fixtures are the single source of +// truth consumed by: +// +// - module/sdk/verdict/decode_test.go (Go SDK) +// - python/tracecore_verdict/test_decode.py (Python SDK) +// +// Drift between any of those and the envelope is caught here. 
+func TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate(t *testing.T) { + t.Parallel() + schema := loadEnvelopeSchema(t) + + bs, err := os.ReadFile(canonicalShippedFixturesPath) //nolint:gosec // test-local relative path + require.NoError(t, err, + "docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json must exist; "+ + "it is the cross-SDK fixture contract from issue #368.") + + var doc struct { + Fixtures []struct { + Name string `json:"name"` + Payload map[string]any `json:"payload"` + } `json:"fixtures"` + } + require.NoError(t, json.Unmarshal(bs, &doc), + "canonical shipped-pattern fixtures file must be valid JSON") + require.NotEmpty(t, doc.Fixtures, "canonical fixture set must be non-empty") + + for _, fx := range doc.Fixtures { + fx := fx + t.Run(fx.Name, func(t *testing.T) { + t.Parallel() + require.NoError(t, schema.Validate(fx.Payload), + "canonical fixture %q must validate against the published envelope "+ + "schema; if intentional, update the envelope under the "+ + "docs/DEPRECATION.md policy.", fx.Name) + }) + } +} + // TestVerdictEnvelopeV1RC1_RejectsEnvelopeDrift asserts the // envelope's bite: removing/mangling any envelope-level required // field fails validation. Each row is a one-mutation falsifier for diff --git a/module/sdk/verdict/decode_test.go b/module/sdk/verdict/decode_test.go index 1deb7a62..69bbc265 100644 --- a/module/sdk/verdict/decode_test.go +++ b/module/sdk/verdict/decode_test.go @@ -4,6 +4,8 @@ package verdict_test import ( "encoding/json" + "os" + "path/filepath" "testing" "time" @@ -12,6 +14,44 @@ import ( "github.com/tracecoreai/tracecore/module/sdk/verdict" ) +// canonicalFixturesPath is the single source of truth for the +// SHIPPED_PATTERN_FIXTURES set, shared between the Go SDK +// (this test), the Python SDK +// (python/tracecore_verdict/test_decode.py), and the envelope +// schema test (module/pkg/patterns/verdict_envelope_schema_test.go). 
+// Adding a pattern verdict MUST edit only that one file — +// hand-rolled per-language drift is closed at the source (issue #368). +var canonicalFixturesPath = filepath.Join( + "..", "..", "..", "docs", "schemas", "fixtures", "shipped-patterns-v1.0.0-rc1.json", +) + +// shippedFixture is one row of the canonical fixture file. +type shippedFixture struct { + Name string `json:"name"` + Payload map[string]any `json:"payload"` +} + +// loadShippedFixtures reads the canonical fixture file and returns +// the ordered list of (name, payload) rows. Fails the test if the +// artifact is missing or malformed — that artifact is the +// cross-SDK fixture contract for issue #368. +func loadShippedFixtures(t *testing.T) []shippedFixture { + t.Helper() + bs, err := os.ReadFile(canonicalFixturesPath) //nolint:gosec // test-local relative path + require.NoError(t, err, + "docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json must exist; "+ + "this file is the single source of truth for shipped pattern fixtures "+ + "consumed by Go + Python verdict SDKs (issue #368).") + var doc struct { + Fixtures []shippedFixture `json:"fixtures"` + } + require.NoError(t, json.Unmarshal(bs, &doc), + "canonical shipped-pattern fixtures file must be valid JSON") + require.NotEmpty(t, doc.Fixtures, + "canonical fixture set is empty; at least one shipped pattern must be defined") + return doc.Fixtures +} + // goodVerdict is the canonical happy-path fixture — minimum-well-formed // pod_evicted verdict shaped per the published envelope. const goodVerdict = `{ @@ -158,100 +198,27 @@ func TestDecode_PreservesExtras(t *testing.T) { // envelope-superset contract: adding a new pattern that breaks this // case means the envelope itself has drifted — and the test is the // canary. 
+// +// Fixtures are read from docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json +// (issue #368): the same file feeds the Python SDK suite and the +// envelope schema test, so adding a pattern verdict is a one-edit +// operation rather than a three-way hand-sync. func TestDecode_AllShippedPatternsRoundTrip(t *testing.T) { t.Parallel() - ts := "2026-05-18T10:00:00Z" - ev := func(kind string) map[string]any { - return map[string]any{ - "kind": kind, "uid": "u", - "timestamp": ts, "description": "d", - } - } + fixtures := loadShippedFixtures(t) - cases := []struct { - name string - verdict map[string]any - }{ - {"pod_evicted", map[string]any{ - "pattern.id": "14", "headline": "h", "remediation": "r", - "confidence": "full", - "evidence_trail": []any{ev("pod_event"), ev("node_condition")}, - "k8s.pod.name": "p", "k8s.pod.namespace": "ns", "k8s.node.name": "n", - }}, - {"nccl_hang", map[string]any{ - "pattern.id": "15", "headline": "h", "remediation": "r", - "evidence_trail": []any{ev("nccl_fr")}, - "pg_id": float64(7), - "collective_seq_id": float64(42), - "hanging_ranks": []any{float64(1), float64(3)}, - }}, - {"xid_correlation", map[string]any{ - "pattern.id": "16", "headline": "h", "remediation": "r", - "evidence_trail": []any{ev("kernel_event"), ev("pod_event")}, - "xid_code": float64(79), "node": "n", "evicted_pod": "ns/p", - }}, - {"hbm_ecc", map[string]any{ - "pattern.id": "17", "headline": "h", "remediation": "r", - "evidence_trail": []any{ev("hw_error"), ev("kernel_event")}, - "xid_code": float64(48), "gpu_id": "0000:31:00.0", - "ecc_delta": float64(1), "node": "n", - }}, - {"thermal_throttle", map[string]any{ - "pattern.id": "18", "headline": "h", "remediation": "r", - "evidence_trail": []any{ev("hw_throttle"), ev("hw_throttle")}, - "node": "n", "gpu_count": float64(2), - "gpu_ids": []any{"0000:31:00.0", "0000:32:00.0"}, - }}, - {"pcie_aer", map[string]any{ - "pattern.id": "19", "headline": "h", "remediation": "r", - "evidence_trail": 
[]any{ev("pcie_aer"), ev("hw_io_collapse")}, - "gpu_id": "0000:31:00.0", "severity": "Fatal", - "aer_type": "Data Link Layer", "drop_ratio": 0.9, "node": "n", - }}, - {"cuda_oom", map[string]any{ - "pattern.id": "20", "headline": "h", "remediation": "r", - "confidence": "full", - "evidence_trail": []any{ev("cuda_oom"), ev("hw_fb")}, - "gpu_id": "0000:31:00.0", "node": "n", - "kind": "true_oom", "tried_alloc_bytes": float64(1024), - "fb_free_bytes": float64(0), "fb_free_ratio": 0.0, - }}, - {"ib_link_flap", map[string]any{ - "pattern.id": "21", "headline": "h", "remediation": "r", - "confidence": "partial", - "evidence_trail": []any{ev("ib_port_state")}, - "missing_layers": []any{"hw_throttle"}, - "node": "n", "hca_device": "mlx5_0", - "port": float64(1), - "transition_count": float64(3), - }}, - {"silent_data_corruption", map[string]any{ - "pattern.id": "13", "headline": "h", "remediation": "r", - "confidence": "full", - "kind": "vendor_signaled", - "evidence_trail": []any{ev("hw_gpu_sdc"), ev("gen_ai_training_eval_accuracy")}, - "gen_ai.training.job_id": "job-42", - "accuracy_drop": 0.15, - "baseline_accuracy": 0.95, - "observed_accuracy": 0.80, - "suspect_gpu_id": "0000:31:00.0", - "suspect_node": "n", - "sdc_counter_delta": float64(3), - }}, - } - - for _, tc := range cases { - tc := tc - t.Run(tc.name, func(t *testing.T) { + for _, fx := range fixtures { + fx := fx + t.Run(fx.Name, func(t *testing.T) { t.Parallel() - raw, err := json.Marshal(tc.verdict) + raw, err := json.Marshal(fx.Payload) require.NoError(t, err) v, err := verdict.Decode(raw) require.NoError(t, err, - "shipped pattern %q must decode cleanly through the envelope SDK", tc.name) - require.Equal(t, tc.verdict["pattern.id"], v.PatternID) + "shipped pattern %q must decode cleanly through the envelope SDK", fx.Name) + require.Equal(t, fx.Payload["pattern.id"], v.PatternID) require.NotEmpty(t, v.EvidenceTrail) }) } diff --git a/python/tracecore_verdict/test_decode.py 
b/python/tracecore_verdict/test_decode.py index 84d97e58..1fffd5ef 100644 --- a/python/tracecore_verdict/test_decode.py +++ b/python/tracecore_verdict/test_decode.py @@ -203,123 +203,37 @@ def test_decode_preserves_extras() -> None: assert "evidence_trail" not in v.extras -def _ev(kind: str) -> dict: - return { - "kind": kind, - "uid": "u", - "timestamp": "2026-05-18T10:00:00Z", - "description": "d", - } +# SHIPPED_PATTERN_FIXTURES is the v1.0-rc1 shipped-verdict fixture +# set. Source of truth is +# ``docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json`` — the +# SAME file feeds the Go SDK suite +# (``module/sdk/verdict/decode_test.go``) and the envelope schema +# test (``module/pkg/patterns/verdict_envelope_schema_test.go``). +# Adding a pattern verdict is a one-edit operation against that +# file; this module just reads it (issue #368). +_CANONICAL_FIXTURES_PATH = ( + Path(__file__).parent + / ".." + / ".." + / "docs" + / "schemas" + / "fixtures" + / "shipped-patterns-v1.0.0-rc1.json" +) -# Mirror of the Go SDK's TestDecode_AllShippedPatternsRoundTrip: every -# pattern verdict shipped at v1.0-rc1 must decode cleanly. Adding a -# pattern in a future PR MUST add a row here. 
-SHIPPED_PATTERN_FIXTURES = { - "pod_evicted": { - "pattern.id": "14", - "headline": "h", - "remediation": "r", - "confidence": "full", - "evidence_trail": [_ev("pod_event"), _ev("node_condition")], - "k8s.pod.name": "p", - "k8s.pod.namespace": "ns", - "k8s.node.name": "n", - }, - "nccl_hang": { - "pattern.id": "15", - "headline": "h", - "remediation": "r", - "evidence_trail": [_ev("nccl_fr")], - "pg_id": 7, - "collective_seq_id": 42, - "hanging_ranks": [1, 3], - }, - "xid_correlation": { - "pattern.id": "16", - "headline": "h", - "remediation": "r", - "evidence_trail": [_ev("kernel_event"), _ev("pod_event")], - "xid_code": 79, - "node": "n", - "evicted_pod": "ns/p", - }, - "hbm_ecc": { - "pattern.id": "17", - "headline": "h", - "remediation": "r", - "evidence_trail": [_ev("hw_error"), _ev("kernel_event")], - "xid_code": 48, - "gpu_id": "0000:31:00.0", - "ecc_delta": 1, - "node": "n", - }, - "thermal_throttle": { - "pattern.id": "18", - "headline": "h", - "remediation": "r", - "evidence_trail": [_ev("hw_throttle"), _ev("hw_throttle")], - "node": "n", - "gpu_count": 2, - "gpu_ids": ["0000:31:00.0", "0000:32:00.0"], - }, - "pcie_aer": { - "pattern.id": "19", - "headline": "h", - "remediation": "r", - "evidence_trail": [_ev("pcie_aer"), _ev("hw_io_collapse")], - "gpu_id": "0000:31:00.0", - "severity": "Fatal", - "aer_type": "Data Link Layer", - "drop_ratio": 0.9, - "node": "n", - }, - "cuda_oom": { - "pattern.id": "20", - "headline": "h", - "remediation": "r", - "confidence": "full", - "evidence_trail": [_ev("cuda_oom"), _ev("hw_fb")], - "gpu_id": "0000:31:00.0", - "node": "n", - "kind": "true_oom", - "tried_alloc_bytes": 1024, - "fb_free_bytes": 0, - "fb_free_ratio": 0.0, - }, - "ib_link_flap": { - "pattern.id": "21", - "headline": "h", - "remediation": "r", - "confidence": "partial", - "evidence_trail": [_ev("ib_port_state")], - "missing_layers": ["hw_throttle"], - "node": "n", - "hca_device": "mlx5_0", - "port": 1, - "transition_count": 3, - }, - 
"silent_data_corruption": { - "pattern.id": "13", - "headline": "h", - "remediation": "r", - "confidence": "full", - "kind": "vendor_signaled", - "evidence_trail": [_ev("hw_gpu_sdc"), _ev("gen_ai_training_eval_accuracy")], - "gen_ai.training.job_id": "job-42", - "accuracy_drop": 0.15, - "baseline_accuracy": 0.95, - "observed_accuracy": 0.80, - "suspect_gpu_id": "0000:31:00.0", - "suspect_node": "n", - "sdc_counter_delta": 3, - }, -} - - -@pytest.mark.parametrize("name,payload", list(SHIPPED_PATTERN_FIXTURES.items())) +def _load_shipped_fixtures() -> list[tuple[str, dict]]: + """Return the ordered (name, payload) rows from the canonical JSON fixture file, decoded as UTF-8.""" + doc = json.loads(_CANONICAL_FIXTURES_PATH.read_text(encoding="utf-8")) + return [(row["name"], row["payload"]) for row in doc["fixtures"]] + + +SHIPPED_PATTERN_FIXTURES = _load_shipped_fixtures() + + +@pytest.mark.parametrize("name,payload", SHIPPED_PATTERN_FIXTURES) def test_decode_all_shipped_patterns_round_trip(name: str, payload: dict) -> None: - """All 9 shipped pattern verdicts decode cleanly.""" + """All shipped pattern verdicts decode cleanly.""" v = decode(json.dumps(payload)) assert v.pattern_id == payload["pattern.id"] assert len(v.evidence_trail) >= 1