Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
.PHONY: fmt fmt-fix vet lint lint-fix tidy tidy-check mod-verify bump-otel

# Code generation
.PHONY: generate-fixtures generate-fixtures-check
.PHONY: generate-fixtures generate-fixtures-check verdict-fixtures-check

# Coverage
.PHONY: coverage coverage-check
Expand Down Expand Up @@ -184,6 +184,37 @@ generate-fixtures-check: ## Fail if `make generate-fixtures` would produce a di
git --no-pager diff -- module/pkg/nccl/fr_parser/testdata/; exit 1; \
fi

verdict-fixtures-check: ## CI gate for issue #368: assert the canonical shipped-pattern fixture file exists and is consumed by Go SDK + Python SDK + envelope schema tests. Detects drift at the source.
@# Single source of truth: docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json
@# Three test suites read from it; if anyone reintroduces a hand-rolled
@# fixture literal in any of the three sites, the grep below catches it.
@if [ ! -f docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json ]; then \
echo "docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json missing — this is the cross-SDK fixture contract (issue #368)."; exit 1; \
fi
@# Confirm each consumer references the canonical file by path. The
@# string match is the cheapest possible "is this site still wired up"
@# gate; replacing it with a hand-rolled fixture literal would drop
@# the reference and trip this guard.
@for f in \
module/sdk/verdict/decode_test.go \
python/tracecore_verdict/test_decode.py \
module/pkg/patterns/verdict_envelope_schema_test.go; do \
if ! grep -q 'shipped-patterns-v1\.0\.0-rc1\.json' "$$f"; then \
echo "$$f no longer references docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json"; \
echo " Re-wire it to the canonical fixture file (issue #368) or this drift gate stays red."; \
exit 1; \
fi; \
done
@# Finally, run the fixture round-trip tests themselves (-count=1 bypasses the go test result cache):
@# any divergence between the canonical fixtures and what each suite round-trips against trips here.
@# NOTE(review): if a named Go test is renamed, -run matches nothing and go test still exits 0 — confirm this is acceptable.
@(cd module && GOWORK=off go test -count=1 -run 'TestDecode_AllShippedPatternsRoundTrip|TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate' ./sdk/verdict/ ./pkg/patterns/) || { \
echo "verdict-fixtures-check: Go fixture round-trip failed; see test output above (issue #368)."; exit 1; \
}
@(cd python/tracecore_verdict && python3 -m pytest -q test_decode.py::test_decode_all_shipped_patterns_round_trip) || { \
echo "verdict-fixtures-check: Python fixture round-trip failed; see test output above (issue #368)."; exit 1; \
}

coverage: ## Run all tests under the race detector with coverage profiling; emit coverage.out + coverage.html.
@# This is the single canonical test-execution target in `make ci`:
@# it runs every test in the repo with -race AND collects coverage,
Expand Down Expand Up @@ -243,7 +274,7 @@ cut-criteria-check: ## Drift gate: rendered docs/v1-rc1-cut-criteria.md must ma
fi; \
rm -f "$$tmp"

verify: check license-check generate-fixtures-check build-tags nccl-fr-rce-gate register-lint actionlint zizmor doc-check deprecation-check no-autoupdate-check ## Pre-push gate. Medium (<30s); CI handles heavy gates (test, coverage, govulncheck, fuzz, build).
verify: check license-check generate-fixtures-check verdict-fixtures-check build-tags nccl-fr-rce-gate register-lint actionlint zizmor doc-check deprecation-check no-autoupdate-check ## Pre-push gate. Medium (<30s); CI handles heavy gates (test, coverage, govulncheck, fuzz, build).

test-extras-sustained: ## (sub-target) sustained-load (5 min); see `make test-extras`.
@# kernelevents was the sole sustained-load suite consumer; deleted
Expand Down Expand Up @@ -342,7 +373,7 @@ ci-fast: lint vet mod-verify attribute-namespace-check doc-check ## Fast-feedba
# back-compat alias so existing scripts, docs, and hooks invoking `make ci`
# remain semantically unchanged. New callers should prefer the explicit
# `ci-fast` (inner loop) vs `ci-full` (pre-PR) split per PRINCIPLES §10.
ci-full: license-check generate-fixtures-check vet build-tags tidy-check mod-verify lint nccl-fr-rce-gate register-lint actionlint zizmor coverage-check ci-fuzz-nccl-fr govulncheck doc-check deprecation-check no-autoupdate-check build smoke-quickstart ## Everything CI runs. Run before opening a PR.
ci-full: license-check generate-fixtures-check verdict-fixtures-check vet build-tags tidy-check mod-verify lint nccl-fr-rce-gate register-lint actionlint zizmor coverage-check ci-fuzz-nccl-fr govulncheck doc-check deprecation-check no-autoupdate-check build smoke-quickstart ## Everything CI runs. Run before opening a PR.

ci: ci-full ## Back-compat alias for `ci-full`. Existing scripts and `make ci` invocations keep working unchanged.

Expand Down
13 changes: 11 additions & 2 deletions docs/schemas/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ released version; consumers pin to that filename rather than chasing
| Artifact | Surface | Status | Pinned by |
|---|---|---|---|
| [`verdict-1.0.0-rc1.json`](verdict-1.0.0-rc1.json) | Pattern-detector verdict envelope (every pattern emits this shape; per-pattern fields layer on top) | `v1.0-rc1` — published for [v1-rc1 cut criterion 2](../v1-rc1-cut-criteria.md#2-verdict-schema-v10-published-and-stable) | [`module/pkg/patterns/verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go) |
| [`fixtures/shipped-patterns-v1.0.0-rc1.json`](fixtures/shipped-patterns-v1.0.0-rc1.json) | Cross-SDK shipped-verdict fixture set (single source of truth for the Go + Python SDK suites and the envelope schema test, issue [#368](https://github.com/TraceCoreAI/tracecore/issues/368)) | `v1.0-rc1` — one row per shipped pattern | Go SDK ([`module/sdk/verdict/decode_test.go`](../../module/sdk/verdict/decode_test.go)), Python SDK ([`python/tracecore_verdict/test_decode.py`](../../python/tracecore_verdict/test_decode.py)), envelope test ([`module/pkg/patterns/verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go)) |
| [`nccl_fr/v0.md`](nccl_fr/v0.md) | NCCL FlightRecorder receiver — OTel attribute vocabulary on the emitted log records | `v0` — additive within major | `schema_url` on every ResourceLogs / ScopeLogs envelope the receiver emits |

## Verdict envelope (`verdict-1.0.0-rc1.json`)
Expand Down Expand Up @@ -93,15 +94,23 @@ cut.
that is a minor-version bump (additive).
4. Add a fixture row to
`TestVerdictEnvelopeV1RC1_AllShippedVerdictsValidate` in
[`verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go).
5. Cross-link from
[`verdict_envelope_schema_test.go`](../../module/pkg/patterns/verdict_envelope_schema_test.go)
(typed-struct envelope-validation case).
5. Add one row to the canonical cross-SDK fixture file
[`fixtures/shipped-patterns-v1.0.0-rc1.json`](fixtures/shipped-patterns-v1.0.0-rc1.json).
This is the ONLY edit the Go SDK, Python SDK, and the
`TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate`
subtest need — they all consume the same file (issue
[#368](https://github.com/TraceCoreAI/tracecore/issues/368)).
6. Cross-link from
[`docs/ATTRIBUTES.md`](../ATTRIBUTES.md#pattern--verdict-log-record-attributes).

## How drift is caught

| Drift class | Caught by |
|---|---|
| Envelope-level field added / removed / loosened in the Go type without a schema bump | `TestVerdictEnvelopeV1RC1_AllShippedVerdictsValidate` (every shipped Verdict round-trips through the published artifact) |
| Cross-SDK shipped-fixture drift (the Go suite, Python suite, and envelope test disagree on the shipped-verdict set) | `TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate` plus the SDK-side `TestDecode_AllShippedPatternsRoundTrip` / `test_decode_all_shipped_patterns_round_trip`, which all parametrise off the one canonical file — the suites cannot drift from each other because there is only one source |
| Envelope guard removed in the schema (e.g. `headline` becomes optional, `confidence` enum loosens) | `TestVerdictEnvelopeV1RC1_RejectsEnvelopeDrift` (ten falsifier rows; loosening a guard flips its row to PASS) |
| `$id` changed without a major-version bump | `TestVerdictEnvelopeV1RC1_HasStableID` (pins the published `$id` literal) |
| Per-pattern field drift | per-pattern `Test<Pattern>Verdict_SchemaConformance` / `_SchemaRejectsDrift` |
Expand Down
156 changes: 156 additions & 0 deletions docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
{
"$comment": "Canonical v1.0-rc1 verdict fixtures — single source of truth shared by the Go SDK (module/sdk/verdict/decode_test.go), the Python SDK (python/tracecore_verdict/test_decode.py), and the envelope schema test (module/pkg/patterns/verdict_envelope_schema_test.go). Adding a pattern verdict at rc1 or v0.4 MUST add a fixture HERE — the three test suites read this file at test time, so hand-rolled drift is impossible. See issue #368.",
"$schema_ref": "../verdict-1.0.0-rc1.json",
"fixtures": [
{
"name": "pod_evicted",
"payload": {
"pattern.id": "14",
"headline": "h",
"remediation": "r",
"confidence": "full",
"evidence_trail": [
{"kind": "pod_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
{"kind": "node_condition", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"k8s.pod.name": "p",
"k8s.pod.namespace": "ns",
"k8s.node.name": "n"
}
},
{
"name": "nccl_hang",
"payload": {
"pattern.id": "15",
"headline": "h",
"remediation": "r",
"evidence_trail": [
{"kind": "nccl_fr", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"pg_id": 7,
"collective_seq_id": 42,
"hanging_ranks": [1, 3]
}
},
{
"name": "xid_correlation",
"payload": {
"pattern.id": "16",
"headline": "h",
"remediation": "r",
"evidence_trail": [
{"kind": "kernel_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
{"kind": "pod_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"xid_code": 79,
"node": "n",
"evicted_pod": "ns/p"
}
},
{
"name": "hbm_ecc",
"payload": {
"pattern.id": "17",
"headline": "h",
"remediation": "r",
"evidence_trail": [
{"kind": "hw_error", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
{"kind": "kernel_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"xid_code": 48,
"gpu_id": "0000:31:00.0",
"ecc_delta": 1,
"node": "n"
}
},
{
"name": "thermal_throttle",
"payload": {
"pattern.id": "18",
"headline": "h",
"remediation": "r",
"evidence_trail": [
{"kind": "hw_throttle", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
{"kind": "hw_throttle", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"node": "n",
"gpu_count": 2,
"gpu_ids": ["0000:31:00.0", "0000:32:00.0"]
}
},
{
"name": "pcie_aer",
"payload": {
"pattern.id": "19",
"headline": "h",
"remediation": "r",
"evidence_trail": [
{"kind": "pcie_aer", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
{"kind": "hw_io_collapse", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"gpu_id": "0000:31:00.0",
"severity": "Fatal",
"aer_type": "Data Link Layer",
"drop_ratio": 0.9,
"node": "n"
}
},
{
"name": "cuda_oom",
"payload": {
"pattern.id": "20",
"headline": "h",
"remediation": "r",
"confidence": "full",
"evidence_trail": [
{"kind": "cuda_oom", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
{"kind": "hw_fb", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"gpu_id": "0000:31:00.0",
"node": "n",
"kind": "true_oom",
"tried_alloc_bytes": 1024,
"fb_free_bytes": 0,
"fb_free_ratio": 0.0
}
},
{
"name": "ib_link_flap",
"payload": {
"pattern.id": "21",
"headline": "h",
"remediation": "r",
"confidence": "partial",
"evidence_trail": [
{"kind": "ib_port_state", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"missing_layers": ["hw_throttle"],
"node": "n",
"hca_device": "mlx5_0",
"port": 1,
"transition_count": 3
}
},
{
"name": "silent_data_corruption",
"payload": {
"pattern.id": "13",
"headline": "h",
"remediation": "r",
"confidence": "full",
"kind": "vendor_signaled",
"evidence_trail": [
{"kind": "hw_gpu_sdc", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
{"kind": "gen_ai_training_eval_accuracy", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}
],
"gen_ai.training.job_id": "job-42",
"accuracy_drop": 0.15,
"baseline_accuracy": 0.95,
"observed_accuracy": 0.80,
"suspect_gpu_id": "0000:31:00.0",
"suspect_node": "n",
"sdc_counter_delta": 3
}
}
]
}
51 changes: 51 additions & 0 deletions module/pkg/patterns/verdict_envelope_schema_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,57 @@ func TestVerdictEnvelopeV1RC1_AllShippedVerdictsValidate(t *testing.T) {
}
}

// canonicalShippedFixturesPath locates the cross-SDK fixture file
// (issue #368) that the Go and Python verdict SDK suites also read.
// The path is relative: three parent-directory hops followed by
// docs/schemas/fixtures/. Checking that file against the envelope
// schema in this package means a fixture edit that drifts from the
// schema is rejected here before either SDK suite ships it.
var canonicalShippedFixturesPath = filepath.Join(
	"..", "..", "..",
	"docs", "schemas", "fixtures",
	"shipped-patterns-v1.0.0-rc1.json",
)

// TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate reads the
// shared fixture file at canonicalShippedFixturesPath
// (docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json) and checks
// that every fixture payload in it passes validation against the
// schema returned by loadEnvelopeSchema. Each fixture runs as its own
// parallel subtest, named after the fixture's "name" field.
//
// The same file is the fixture source for:
//
//   - module/sdk/verdict/decode_test.go (Go SDK)
//   - python/tracecore_verdict/test_decode.py (Python SDK)
//
// so a fixture that stops conforming to the envelope fails here.
func TestVerdictEnvelopeV1RC1_CanonicalShippedFixturesValidate(t *testing.T) {
	t.Parallel()

	// fixtureRow is one entry of the file's top-level "fixtures" array.
	type fixtureRow struct {
		Name    string         `json:"name"`
		Payload map[string]any `json:"payload"`
	}
	// fixtureFile mirrors the part of the document this test reads.
	type fixtureFile struct {
		Fixtures []fixtureRow `json:"fixtures"`
	}

	envelope := loadEnvelopeSchema(t)

	raw, readErr := os.ReadFile(canonicalShippedFixturesPath) //nolint:gosec // test-local relative path
	require.NoError(t, readErr,
		"docs/schemas/fixtures/shipped-patterns-v1.0.0-rc1.json must exist; "+
			"it is the cross-SDK fixture contract from issue #368.")

	var parsed fixtureFile
	decodeErr := json.Unmarshal(raw, &parsed)
	require.NoError(t, decodeErr,
		"canonical shipped-pattern fixtures file must be valid JSON")
	require.NotEmpty(t, parsed.Fixtures, "canonical fixture set must be non-empty")

	for i := range parsed.Fixtures {
		// Copy the row so each parallel subtest closes over its own value.
		row := parsed.Fixtures[i]
		t.Run(row.Name, func(t *testing.T) {
			t.Parallel()
			validationErr := envelope.Validate(row.Payload)
			require.NoError(t, validationErr,
				"canonical fixture %q must validate against the published envelope "+
					"schema; if intentional, update the envelope under the "+
					"docs/DEPRECATION.md policy.", row.Name)
		})
	}
}

// TestVerdictEnvelopeV1RC1_RejectsEnvelopeDrift asserts the
// envelope's bite: removing/mangling any envelope-level required
// field fails validation. Each row is a one-mutation falsifier for
Expand Down
Loading