diff --git a/docs/ATTRIBUTES.md b/docs/ATTRIBUTES.md index caf9e0ff..d038a187 100644 --- a/docs/ATTRIBUTES.md +++ b/docs/ATTRIBUTES.md @@ -53,7 +53,7 @@ trail for downstream consumers that want it. | Attribute | Type | Source | Stability | Description | Emitted by | Consumed by | |---|---|---|---|---|---|---| -| `pattern.id` | string | tracecore-ext | stable | Canonical pattern identifier (`pod_evicted`, `xid_correlation`, `hbm_ecc`, `nccl_hang`, `thermal_throttle`, `pcie_aer`) | `patterndetectorprocessor` (`VerdictAttrPatternID`) | Dashboards, LogQL filters, runbooks | +| `pattern.id` | string | tracecore-ext | stable | Canonical pattern identifier (`pod_evicted`, `xid_correlation`, `hbm_ecc`, `nccl_hang`, `thermal_throttle`, `pcie_aer`, `ib_link_flap`, `cuda_oom`, `silent_data_corruption`) | `patterndetectorprocessor` (`VerdictAttrPatternID`) | Dashboards, LogQL filters, runbooks | | `pattern.confidence` | string | tracecore-ext | stable | Verdict confidence (`high`, `partial`) | `patterndetectorprocessor` (`VerdictAttrConfidence`) | Dashboards | | `pattern.headline` | string | tracecore-ext | stable | Operator-facing one-line summary | `patterndetectorprocessor` (`VerdictAttrHeadline`) | Dashboards, alerting | | `pattern.remediation` | string | tracecore-ext | stable | Operator-actionable remediation prose | `patterndetectorprocessor` (`VerdictAttrRemediation`) | Dashboards | @@ -112,6 +112,10 @@ hardware signal. 
| `tracecore.alert.pcie_rate_collapse.direction` | string | tracecore-ext | alpha | `transmit` or `receive` — falls back to upstream `network.io.direction` if absent | OTTL metrics→logs recipe | `projectPCIeIORecord` | | `tracecore.alert.pcie_rate_collapse.drop_ratio` | double | tracecore-ext | alpha | Promoted drop-ratio scalar on `pcie_aer` verdicts so dashboards render histograms without parsing JSON | `patterndetectorprocessor.appendPCIeAERVerdict` | Operator dashboards (verdict-stream tier) | | `tracecore.alert.ib_link_flap.transition_count` | int | tracecore-ext | alpha | In-window ACTIVE→DOWN transition count promoted on `ib_link_flap` verdicts so dashboards distinguish "noisy 4 flaps" from "thrashing 40 flaps" | `patterndetectorprocessor.appendIBLinkFlapVerdict` | Operator dashboards | +| `tracecore.alert.silent_data_corruption.kind` | string | tracecore-ext | alpha | `vendor_signaled` (full confidence; same-job `hw.gpu.sdc.*` rose during job window) or `accuracy_only` (partial; eval drop >= 2x threshold, no vendor signal) | `patterndetectorprocessor.appendSilentDataCorruptionVerdict` | Operator dashboards (verdict-stream tier) | +| `tracecore.alert.silent_data_corruption.accuracy_drop` | double | tracecore-ext | alpha | `baseline - observed` accuracy drop in absolute units. Range [0, 1]. Promoted so dashboards bucket regression magnitudes without parsing JSON | `appendSilentDataCorruptionVerdict` | Operator dashboards | +| `tracecore.alert.silent_data_corruption.suspect_gpu_id` | string | tracecore-ext | alpha | PCI BDF of the GPU whose vendor SDC counter rose during the job window. Omitted (not empty-stamped) on `accuracy_only` verdicts to avoid empty-filter false-matches | `appendSilentDataCorruptionVerdict` | Operator dashboards (drain candidate) | +| `tracecore.alert.silent_data_corruption.suspect_node` | string | tracecore-ext | alpha | Kubernetes node name carrying the suspect GPU. 
Omitted on `accuracy_only` verdicts | `appendSilentDataCorruptionVerdict` | Operator dashboards | --- @@ -157,6 +161,8 @@ issues [#265](https://github.com/TraceCoreAI/tracecore/issues/265) | `hw.gpu.throttle.cascade_size` | int | tracecore-ext | development | Promoted scalar on `thermal_throttle` verdicts (see `pattern.*` table) | `patterndetectorprocessor` | Dashboards | | `hw.gpu.memory.free` | int | upstream-proposal ([#303](https://github.com/TraceCoreAI/tracecore/issues/303)) | development | Per-GPU framebuffer free bytes projected from `DCGM_FI_DEV_FB_FREE` onto the logs path | DCGM OTTL transform | Pattern #10 (CUDA OOM) | | `hw.gpu.memory.total` | int | upstream-proposal ([#303](https://github.com/TraceCoreAI/tracecore/issues/303)) | development | Per-GPU framebuffer capacity (`DCGM_FI_DEV_FB_USED + FREE`) projected onto the logs path | DCGM OTTL transform | Pattern #10 | +| `hw.gpu.sdc.delta` | int | tracecore-ext | alpha | Per-GPU vendor SDC counter rise (NVIDIA SDC catcher / row-remap / AMD ECC at non-fatal threshold). Counter rise > 0 inside a job window gates pattern #13's full-confidence branch | OTTL metrics→logs recipe (TBD; blocked on RFC-0014 PR-B + vendor exporter wiring) | `projectSDCCounterRecord` (gates `silent_data_corruption`) | +| `hw.gpu.sdc.kind` | string | tracecore-ext | alpha | Which vendor SDC family rose — `remap_pending` / `remap_failure` / `catcher_count`. Carried onto the verdict's evidence trail for operator-side DCGM debug queries | OTTL metrics→logs recipe | `projectSDCCounterRecord` | --- @@ -255,15 +261,25 @@ is no equivalent upstream contract. ## `gen_ai.training.*` — training-job join keys Upstream OTel `gen_ai` namespace. We use it as the cross-receiver -join surface for distributed training workloads (rank, job id). -Today only `gen_ai.training.rank` is consumed; `gen_ai.training.job_id` -is contracted in RFC-0013 §3 but not yet wired through the pattern -library (future cross-receiver join surface). 
+join surface for distributed training workloads (rank, job id, eval +accuracy). `gen_ai.training.rank` is consumed by the NCCL +FlightRecorder projection; `gen_ai.training.job_id` is the same-job +join key for pattern #13 (silent data corruption); the +`eval_accuracy.*` + `checkpoint.*` keys gate the SDC detector's +discriminator + false-positive guards. | Attribute | Type | Source | Stability | Description | Emitted by | Consumed by | |---|---|---|---|---|---|---| | `gen_ai.training.rank` | int | upstream-semconv (alpha) | development | Rank index — canonical per M19 | `rankjoinprocessor` (`module/processor/rankjoinprocessor`) | `projectNCCLFRRecord` (preferred over `nccl.rank` / `nccl.fr.rank`) | -| `gen_ai.training.job_id` | string | upstream-semconv (alpha) | alpha | Training-job id — contracted but not wired into pattern library yet | (future) | (future) | +| `gen_ai.training.job_id` | string | upstream-semconv (alpha) | alpha | Training-job id — same-job join key for pattern #13 (vendor SDC counter → eval accuracy). Verdict scalar on `silent_data_corruption` | eval-pipeline OTTL recipe + vendor SDC OTTL recipe (TBD) | `projectEvalAccuracyRecord`, `projectSDCCounterRecord` | +| `gen_ai.training.eval_accuracy` | double | upstream-proposal | development | Eval-pass accuracy in [0, 1]. Gates pattern #13 | eval-pipeline OTTL recipe (blocked on upstream framework instrumentation per `docs/patterns/13` §"Open questions" 5) | `projectEvalAccuracyRecord` | +| `gen_ai.training.eval_accuracy.baseline` | double | tracecore-ext | alpha | Operator-stamped reference accuracy the current eval is compared against. Zero or absent skips pattern #13 evaluation (no comparator) | eval-pipeline OTTL recipe / operator config | `projectEvalAccuracyRecord` | +| `gen_ai.training.eval_set.checksum` | string | tracecore-ext | alpha | Eval-set checksum. 
Compared against `baseline_checksum` to suppress dataset-drift false positives on pattern #13 | eval-pipeline OTTL recipe | `projectEvalAccuracyRecord` | +| `gen_ai.training.eval_set.baseline_checksum` | string | tracecore-ext | alpha | Eval-set checksum the baseline accuracy was measured against | eval-pipeline OTTL recipe / operator config | `projectEvalAccuracyRecord` | +| `gen_ai.training.checkpoint.step` | int | tracecore-ext | alpha | Training step the evaluated checkpoint was saved at. Compared against `baseline_step` to suppress cherry-picked-checkpoint false positives on pattern #13 | eval-pipeline OTTL recipe | `projectEvalAccuracyRecord` | +| `gen_ai.training.checkpoint.baseline_step` | int | tracecore-ext | alpha | Training step the baseline accuracy was measured at | eval-pipeline OTTL recipe / operator config | `projectEvalAccuracyRecord` | +| `gen_ai.training.job.start_unix_nano` | int | tracecore-ext | alpha | Job start wall-clock (unix nanos). Lower bound for the same-job SDC counter join window on pattern #13 | eval-pipeline OTTL recipe / `rankjoinprocessor` | `projectEvalAccuracyRecord` | +| `gen_ai.training.job.end_unix_nano` | int | tracecore-ext | alpha | Job end wall-clock (unix nanos). Upper bound for the SDC counter join window. Falls back to eval record's Timestamp when absent | eval-pipeline OTTL recipe / `rankjoinprocessor` | `projectEvalAccuracyRecord` | --- @@ -303,6 +319,7 @@ need to fire?" without reading source. 
| `thermal_throttle` | `hw.gpu.throttle.duration.delta` + `hw.gpu.throttle.reason` + `gpu.id` | `hw.gpu.index`, `k8s.node.name` | `pattern.*`, `hw.gpu.throttle.cascade_size` | | `pcie_aer` *(wiring on a follow-up PR — projections present in library)* | `kernelevents.pcie_aer.severity` + `gpu.id` OR `tracecore.alert.pcie_rate_collapse.bytes_per_second` + `gpu.id` | `kernelevents.pcie_aer.type`, `network.io.direction`, `tracecore.alert.pcie_rate_collapse.{baseline_bytes_per_second,direction}` | `pattern.*` | | `nccl_hang` | `nccl.fr.collective_seq_id` + (one of `gen_ai.training.rank` \| `nccl.rank` \| `nccl.fr.rank`) | `nccl.fr.{pg_id,state,profiling_name,time_discovered_started_ns}` | `pattern.*`, `nccl.fr.{pg_id,collective_seq_id,hanging_ranks_count}` | +| `silent_data_corruption` | (eval) `gen_ai.training.eval_accuracy` + `gen_ai.training.job_id`; (sdc) `hw.gpu.sdc.delta` + `gen_ai.training.job_id` | `gen_ai.training.eval_accuracy.baseline`, `gen_ai.training.eval_set.{checksum,baseline_checksum}`, `gen_ai.training.checkpoint.{step,baseline_step}`, `gen_ai.training.job.{start,end}_unix_nano`, `hw.gpu.sdc.kind`, `gpu.id`, `k8s.node.name` | `pattern.*`, `tracecore.alert.silent_data_corruption.{kind,accuracy_drop,suspect_gpu_id,suspect_node}`, `gen_ai.training.job_id` | | `node_condition` (input to multiple patterns) | `k8s.node.name` + `k8s.node.condition.pressure` | `k8s.node.{uid,condition.message}` | (no direct verdict — feeds pod-eviction correlation) | --- diff --git a/docs/patterns/13-silent-data-corruption.md b/docs/patterns/13-silent-data-corruption.md index 0cdf3cc1..7a735912 100644 --- a/docs/patterns/13-silent-data-corruption.md +++ b/docs/patterns/13-silent-data-corruption.md @@ -1,8 +1,8 @@ # Pattern #13 — Silent data corruption (SDC) -**Status:** ☐ planned (no detector implementation yet) — frontier-layer pattern, hardest to detect +**Status:** ☑ shipped — `patterns.SilentDataCorruptionDetector` in `module/pkg/patterns/silent_data_corruption.go`, wired 
via `module/processor/patterndetectorprocessor/silent_data_corruption.go`. Verdict shape pinned by `module/pkg/patterns/testdata/silent_data_corruption_verdict.schema.json`. Spec preserved here as the engineering record; the detector implements the algorithm in §"Detector evaluation rule" with conservative-by-default thresholds. -Design spec for the pattern-#13 detector. SDC is the highest-difficulty pattern in the 15-set: the run completes, loss looks normal, but downstream eval shows degraded model quality. The detector must surface a *suspicion* with a clear evidence trail; certainty requires re-run. +Design spec for the pattern-#13 detector. SDC is the highest-difficulty pattern in the 15-set: the run completes, loss looks normal, but downstream eval shows degraded model quality. The detector surfaces a *suspicion* with a clear evidence trail; certainty requires re-run on different hardware. ## Symptom diff --git a/docs/patterns/README.md b/docs/patterns/README.md index b7187c98..0251bddc 100644 --- a/docs/patterns/README.md +++ b/docs/patterns/README.md @@ -59,7 +59,7 @@ Engineering-facing pattern-design specs for the 8 unspec'd v1 patterns. 
Each fol | #10 CUDA OOM, deceptive allocator | [10-cuda-oom-deceptive.md](10-cuda-oom-deceptive.md) | ☐ planned ([#303](https://github.com/TraceCoreAI/tracecore/issues/303) filed) | | #11 Checkpointer hang | [11-checkpointer-hang.md](11-checkpointer-hang.md) | ☐ planned | | #12 Loss spike → NaN | [12-loss-spike-nan.md](12-loss-spike-nan.md) | ☐ planned | -| #13 Silent data corruption | [13-silent-data-corruption.md](13-silent-data-corruption.md) | ☐ planned | +| #13 Silent data corruption | [13-silent-data-corruption.md](13-silent-data-corruption.md) | ☑ shipped | ## Replay test fixture diff --git a/module/pkg/patterns/silent_data_corruption.go b/module/pkg/patterns/silent_data_corruption.go new file mode 100644 index 00000000..4058a5d0 --- /dev/null +++ b/module/pkg/patterns/silent_data_corruption.go @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: Apache-2.0 + +package patterns + +import ( + "fmt" + "sort" + "time" +) + +// DefaultSDCAccuracyDropThreshold is the absolute accuracy regression +// (baseline - observed) that promotes an eval cycle into pattern-13 +// consideration. 0.005 (0.5pp) mirrors the spec in +// docs/patterns/13-silent-data-corruption.md §"Detector evaluation +// rule" — typical SDC regressions land between 0.5pp and 3pp. The +// guard is conservative-by-default: an operator with a noisy recipe +// raises this above the recipe's known run-to-run variance band via +// SilentDataCorruptionDetector.AccuracyDropThreshold. +const DefaultSDCAccuracyDropThreshold = 0.005 + +// DefaultSDCAccuracyOnlyMultiplier is the threshold multiplier that +// gates the partial-confidence (kind=accuracy_only) branch. The +// canonical rule (spec §"Detector evaluation rule"): an accuracy +// regression alone — no same-job vendor SDC signal — must clear +// `AccuracyDropThreshold * Multiplier` before the detector commits to +// the (advisory) verdict. 
2.0 keeps the partial branch +// false-positive-conservative; operators tuning for high-precision- +// only deployments raise the multiplier. +const DefaultSDCAccuracyOnlyMultiplier = 2.0 + +// EvalAccuracyRecord is the typed projection of one +// `gen_ai.training.eval_accuracy` measurement the detector consumes. +// The patterndetectorprocessor builds these from log records the +// future eval-pipeline OTTL recipe emits (the upstream framework +// instrumentation hasn't landed `gen_ai.training.eval_accuracy` yet — +// spec §"Open questions" 5). Detectors read EvalAccuracyRecord +// values directly so a schema rename in the upstream recipe surfaces +// as a compile error here, not a silent pattern-evaluation regression. +type EvalAccuracyRecord struct { + // JobID is the customer-stable `gen_ai.training.job_id` resource + // attribute. Load-bearing for the same-job join with + // SDCCounterRecord.JobID. Empty JobID skips the record — the + // SDC join is meaningless without a job identifier. + JobID string `json:"job_id"` + + // Accuracy is the eval-pass accuracy on this run, in [0, 1]. + // Compared against BaselineAccuracy to compute the drop. + Accuracy float64 `json:"accuracy"` + + // BaselineAccuracy is the operator-stamped reference accuracy + // (spec §"Open questions" 1 — comparator provenance is a product + // question). A zero or negative baseline means "no comparator" + // and skips the record (spec §"Detector evaluation rule" first + // guard). + BaselineAccuracy float64 `json:"baseline_accuracy,omitempty"` + + // EvalSetChecksum is the operator-stamped eval-set checksum + // (spec §"Edge cases" — dataset-drift false-positive guard). + // Compared against BaselineEvalSetChecksum; mismatch suppresses + // the verdict. + EvalSetChecksum string `json:"eval_set_checksum,omitempty"` + + // BaselineEvalSetChecksum is the eval-set checksum the + // BaselineAccuracy was measured against. 
Mismatch with + // EvalSetChecksum means the operator changed the eval set — + // the accuracy delta is dataset drift, not SDC. + BaselineEvalSetChecksum string `json:"baseline_eval_set_checksum,omitempty"` + + // CheckpointStep is the training step the evaluated checkpoint + // was saved at (spec §"Edge cases" — cherry-picked-checkpoint + // false-positive guard). Compared against BaselineCheckpointStep + // when both are non-zero; mismatch suppresses the verdict. + CheckpointStep int64 `json:"checkpoint_step,omitempty"` + + // BaselineCheckpointStep is the training step the + // BaselineAccuracy was measured at. Mismatch with CheckpointStep + // means the operator evaluated an intermediate checkpoint — the + // accuracy delta is expected, not SDC. + BaselineCheckpointStep int64 `json:"baseline_checkpoint_step,omitempty"` + + // JobStart is the wall-clock time the affected training job + // started. Bounds the same-job SDC counter join window + // (spec §"Detector evaluation rule" — sdc.Timestamp during + // (eval.JobStart, eval.JobEnd)). + JobStart time.Time `json:"job_start,omitempty"` + + // JobEnd is the wall-clock time the affected training job + // ended. Bounds the same-job SDC counter join window on the + // upper side. A zero JobEnd means "ongoing"; the detector falls + // back to Timestamp for the upper bound. + JobEnd time.Time `json:"job_end,omitempty"` + + // Timestamp is the wall-clock time the eval cycle completed. + // Used as the verdict's evidence-trail timestamp + fallback + // upper bound when JobEnd is zero. + Timestamp time.Time `json:"timestamp"` + + // Node is the Kubernetes node name the eval ran on (k8s.node.name + // resource attr stamped by k8sattributes). Optional — carried on + // the verdict for operator triage. 
+ Node string `json:"node,omitempty"` +} + +// SDCCounterRecord is the typed projection of one vendor SDC counter +// rise the detector consumes — DCGM SDC catcher / row-remap deltas +// the OTTL recipe projects onto `hw.gpu.sdc.*` Counters (spec §"Signal +// sources"). Operators on AMD MI300X / Intel Habana wire equivalent +// vendor SDC counters into the same `hw.gpu.sdc.*` family via their +// own OTTL recipes (spec §"Open questions" 2). +type SDCCounterRecord struct { + // JobID is the same-job join key against EvalAccuracyRecord.JobID. + // Empty JobID skips the record — without a job tag the SDC event + // can't be attributed to a specific eval window. + JobID string `json:"job_id"` + + // GPUID is the customer-stable `gpu.id` resource attribute — the + // PCI BDF of the GPU whose SDC counter rose. Carried onto the + // verdict as `suspect_gpu_id` so the alert payload names the + // drainable hardware. + GPUID string `json:"gpu_id"` + + // Delta is the SDC counter rise observed in the scrape window. + // Pattern-13 fires on any Delta > 0; the spec's PromQL is + // `increase(...) > 0` (vendor SDC counters are rare-by-design, + // not noisy). + Delta int64 `json:"delta"` + + // Kind names which `hw.gpu.sdc.*` Counter rose — "remap_pending" + // / "remap_failure" / "catcher_count". Carried onto the verdict's + // evidence-trail description so the operator can paste it into a + // DCGM debug query. + Kind string `json:"kind,omitempty"` + + // Timestamp is the wall-clock time the counter rise was observed + // (the end of the scrape interval that produced the delta). + // Bounded by (eval.JobStart, eval.JobEnd) on the join. + Timestamp time.Time `json:"timestamp"` + + // Node is the Kubernetes node name the GPU lives on. Carried onto + // the verdict as `suspect_node`. + Node string `json:"node,omitempty"` +} + +// SDCKind enumerates the discriminator branches. Named type rejects +// raw string-literal switch arms. 
+type SDCKind string + +// Canonical SDCKind values. The set is pinned by the schema's +// `kind` enum + the detector tests. +const ( + // SDCKindVendorSignaled names the high-confidence branch: + // accuracy regression AND a same-job vendor SDC counter rose + // during the job window. Verdict still says "suspected SDC; re- + // run recommended" — the spec is explicit that no single + // observation proves SDC (spec §"Detector evaluation rule"). + SDCKindVendorSignaled SDCKind = "vendor_signaled" + + // SDCKindAccuracyOnly names the partial-confidence branch: a + // large accuracy regression (>= AccuracyDropThreshold * + // AccuracyOnlyMultiplier) with no same-job vendor SDC signal. + // Could be SDC; could be a model-side bug. The verdict routes + // the operator to a same-recipe + same-seed re-run on different + // hardware as the disambiguator. + SDCKindAccuracyOnly SDCKind = "accuracy_only" +) + +// SilentDataCorruptionVerdict is the v0.3.x pattern #13 +// (silent_data_corruption) output. JSON field names follow the +// verdict.schema.json snake-case convention. CARRIES Confidence — the +// vendor_signaled branch is full; the accuracy_only branch is partial. +type SilentDataCorruptionVerdict struct { + PatternID string `json:"pattern.id"` + Headline string `json:"headline"` + Remediation string `json:"remediation"` + Confidence Confidence `json:"confidence"` + Kind SDCKind `json:"kind"` + EvidenceTrail []EvidenceRef `json:"evidence_trail"` + + // JobID is the affected training job id. Promoted onto the + // verdict (issue #270 contract) so dashboards table-aggregate by + // job without server-side parsing of pattern.verdict_json. + JobID string `json:"gen_ai.training.job_id"` + + // AccuracyDrop is baseline - observed in absolute units. Range + // [0, 1] — eval accuracy is a probability. + AccuracyDrop float64 `json:"accuracy_drop"` + + // BaselineAccuracy is the operator-stamped reference accuracy + // the verdict was measured against. 
Carried for the alert + // payload's self-containment. + BaselineAccuracy float64 `json:"baseline_accuracy,omitempty"` + + // ObservedAccuracy is the eval-pass accuracy on this run. + ObservedAccuracy float64 `json:"observed_accuracy,omitempty"` + + // SuspectGPUID is the PCI BDF of the GPU whose vendor SDC + // counter rose during the job window. Empty when + // Kind=accuracy_only. + SuspectGPUID string `json:"suspect_gpu_id,omitempty"` + + // SuspectNode is the Kubernetes node name carrying the suspect + // GPU. Empty when Kind=accuracy_only. + SuspectNode string `json:"suspect_node,omitempty"` + + // SDCCounterDelta is the vendor SDC counter rise observed during + // the job window. Zero when Kind=accuracy_only. + SDCCounterDelta int64 `json:"sdc_counter_delta,omitempty"` + + // MissingLayers names the evidence layers that did not join. + // Empty when Confidence=Full. Populated to ["hw_gpu_sdc"] when + // the vendor SDC layer was missing (Kind=accuracy_only). + MissingLayers []string `json:"missing_layers,omitempty"` +} + +// SilentDataCorruptionDetector is the silent_data_corruption pattern +// detector (pattern #13). Zero-value usage is permitted — +// AccuracyDropThreshold defaults to DefaultSDCAccuracyDropThreshold; +// AccuracyOnlyMultiplier defaults to DefaultSDCAccuracyOnlyMultiplier. +type SilentDataCorruptionDetector struct { + // AccuracyDropThreshold is the absolute drop (baseline - observed) + // at or above which an eval cycle becomes pattern-13 eligible. + // Zero means use DefaultSDCAccuracyDropThreshold. Must exceed the + // recipe's known run-to-run variance band (spec §"Edge cases" — + // legitimate accuracy variance guard). + AccuracyDropThreshold float64 + + // AccuracyOnlyMultiplier is the threshold multiplier that gates + // the partial-confidence (kind=accuracy_only) branch. Zero means + // use DefaultSDCAccuracyOnlyMultiplier (2.0). Operators tuning + // for high-precision-only deployments raise this multiplier. 
+ AccuracyOnlyMultiplier float64 +} + +// Evaluate scans EvalAccuracyRecords and SDCCounterRecords and emits +// one SilentDataCorruptionVerdict per eval cycle that satisfies the +// discriminator. The discriminator (spec §"Detector evaluation rule"): +// +// - baseline == 0 → skip (no comparator) +// - dataset-drift guard: EvalSetChecksum mismatch → skip +// - cherry-picked-checkpoint guard: CheckpointStep mismatch → skip +// - accuracy_drop < AccuracyDropThreshold → skip +// - SDC counter rose during job window → vendor_signaled (full) +// - accuracy_drop >= AccuracyDropThreshold * AccuracyOnlyMultiplier +// → accuracy_only (partial) +// - otherwise → skip (under-confident; in the noise band) +// +// Output is sorted by (eval Timestamp ascending, JobID ascending) so +// the slice is deterministic for golden tests. +// +// Inputs are read-only snapshots; the detector does not mutate either +// slice. Order of inputs is not assumed. +func (d SilentDataCorruptionDetector) Evaluate(evals []EvalAccuracyRecord, sdcs []SDCCounterRecord) []SilentDataCorruptionVerdict { + threshold := d.AccuracyDropThreshold + if threshold <= 0 { + threshold = DefaultSDCAccuracyDropThreshold + } + multiplier := d.AccuracyOnlyMultiplier + if multiplier <= 0 { + multiplier = DefaultSDCAccuracyOnlyMultiplier + } + + sdcByJob := indexSDCByJob(sdcs) + + verdicts := make([]SilentDataCorruptionVerdict, 0, len(evals)) + for _, eval := range evals { + if eval.JobID == "" { + continue + } + if eval.BaselineAccuracy <= 0 { + continue + } + // Dataset-drift guard — only suppress when BOTH sides set the + // checksum and they disagree. Either side empty means "operator + // did not opt into the guard"; the detector still evaluates. + if eval.EvalSetChecksum != "" && eval.BaselineEvalSetChecksum != "" && + eval.EvalSetChecksum != eval.BaselineEvalSetChecksum { + continue + } + // Cherry-picked-checkpoint guard — same opt-in policy: only + // suppress when both step fields are populated and disagree. 
+ if eval.CheckpointStep != 0 && eval.BaselineCheckpointStep != 0 && + eval.CheckpointStep != eval.BaselineCheckpointStep { + continue + } + drop := eval.BaselineAccuracy - eval.Accuracy + if drop < threshold { + continue + } + + sdc, sdcOK := sdcInJobWindow(sdcByJob[eval.JobID], eval) + if sdcOK { + verdicts = append(verdicts, buildSDCVendorSignaledVerdict(eval, sdc, drop)) + continue + } + if drop >= threshold*multiplier { + verdicts = append(verdicts, buildSDCAccuracyOnlyVerdict(eval, drop)) + } + } + + sort.SliceStable(verdicts, func(i, j int) bool { + ti := verdicts[i].EvidenceTrail[len(verdicts[i].EvidenceTrail)-1].Timestamp + tj := verdicts[j].EvidenceTrail[len(verdicts[j].EvidenceTrail)-1].Timestamp + if !ti.Equal(tj) { + return ti.Before(tj) + } + return verdicts[i].JobID < verdicts[j].JobID + }) + return verdicts +} + +// indexSDCByJob groups SDC counter records by JobID and sorts each +// bucket by Timestamp ascending so the in-window scan is monotonic. +func indexSDCByJob(sdcs []SDCCounterRecord) map[string][]SDCCounterRecord { + idx := map[string][]SDCCounterRecord{} + for _, s := range sdcs { + if s.JobID == "" { + continue + } + if s.Delta <= 0 { + continue + } + idx[s.JobID] = append(idx[s.JobID], s) + } + for k := range idx { + recs := idx[k] + sort.SliceStable(recs, func(i, j int) bool { + return recs[i].Timestamp.Before(recs[j].Timestamp) + }) + idx[k] = recs + } + return idx +} + +// sdcInJobWindow returns the most recent SDC counter rise (largest +// delta on a tie) that fell during the eval's job window. The window +// is [JobStart, JobEnd] inclusive; a zero JobEnd falls back to the +// eval's Timestamp so an "ongoing" job still has a defined upper +// bound. Returns ok=false when no SDC record on the same job falls in +// the window. +// +// "Most recent" is chosen as the proximate cause because the eval +// pass executes at the end of the job — the counter rise closest in +// time to the eval is the strongest causal candidate. 
Ties on +// Timestamp prefer the larger Delta (vendor counters can stamp +// multiple BDF deltas at the same instant; the larger delta is more +// operator-relevant). +func sdcInJobWindow(bucket []SDCCounterRecord, eval EvalAccuracyRecord) (SDCCounterRecord, bool) { + if len(bucket) == 0 { + return SDCCounterRecord{}, false + } + start := eval.JobStart + end := eval.JobEnd + if end.IsZero() { + end = eval.Timestamp + } + var best SDCCounterRecord + found := false + for _, s := range bucket { + if !start.IsZero() && s.Timestamp.Before(start) { + continue + } + if !end.IsZero() && s.Timestamp.After(end) { + continue + } + if !found { + best = s + found = true + continue + } + if s.Timestamp.After(best.Timestamp) { + best = s + } else if s.Timestamp.Equal(best.Timestamp) && s.Delta > best.Delta { + best = s + } + } + return best, found +} + +// buildSDCVendorSignaledVerdict materializes the full-confidence +// branch verdict — vendor SDC counter joined the accuracy regression +// inside the job window. Evidence trail is in causal order: SDC +// counter rise first (the earliest observable hardware signal), eval +// accuracy second (the user-visible symptom). 
+func buildSDCVendorSignaledVerdict(eval EvalAccuracyRecord, sdc SDCCounterRecord, drop float64) SilentDataCorruptionVerdict { + return SilentDataCorruptionVerdict{ + PatternID: PatternIDSilentDataCorruption, + Confidence: ConfidenceFull, + Kind: SDCKindVendorSignaled, + JobID: eval.JobID, + AccuracyDrop: drop, + BaselineAccuracy: eval.BaselineAccuracy, + ObservedAccuracy: eval.Accuracy, + SuspectGPUID: sdc.GPUID, + SuspectNode: sdc.Node, + SDCCounterDelta: sdc.Delta, + EvidenceTrail: []EvidenceRef{ + { + Kind: EvidenceKindHwGPUSDC, + UID: sdcEvidenceUID(sdc), + Timestamp: sdc.Timestamp, + Description: sdcEvidenceDescription(sdc), + }, + { + Kind: EvidenceKindEvalAccuracy, + UID: evalAccuracyEvidenceUID(eval), + Timestamp: eval.Timestamp, + Description: evalAccuracyEvidenceDescription(eval, drop), + }, + }, + Headline: sdcVendorSignaledHeadline(eval, sdc, drop), + Remediation: sdcRemediation(eval, sdc, SDCKindVendorSignaled), + } +} + +// buildSDCAccuracyOnlyVerdict materializes the partial-confidence +// branch verdict — accuracy regression alone, no vendor SDC signal in +// the job window. MissingLayers names the absent layer so the operator +// knows what to look for on the re-run. 
+func buildSDCAccuracyOnlyVerdict(eval EvalAccuracyRecord, drop float64) SilentDataCorruptionVerdict { + return SilentDataCorruptionVerdict{ + PatternID: PatternIDSilentDataCorruption, + Confidence: ConfidencePartial, + Kind: SDCKindAccuracyOnly, + JobID: eval.JobID, + AccuracyDrop: drop, + BaselineAccuracy: eval.BaselineAccuracy, + ObservedAccuracy: eval.Accuracy, + MissingLayers: []string{EvidenceKindHwGPUSDC}, + EvidenceTrail: []EvidenceRef{ + { + Kind: EvidenceKindEvalAccuracy, + UID: evalAccuracyEvidenceUID(eval), + Timestamp: eval.Timestamp, + Description: evalAccuracyEvidenceDescription(eval, drop), + }, + }, + Headline: sdcAccuracyOnlyHeadline(eval, drop), + Remediation: sdcRemediation(eval, SDCCounterRecord{}, SDCKindAccuracyOnly), + } +} + +// sdcEvidenceUID synthesizes a stable identifier for the SDC evidence +// ref. Vendor SDC counter scrapes carry no upstream UID — the +// (JobID, GPUID, timestamp) tuple is the smallest globally-unique +// key. +func sdcEvidenceUID(sdc SDCCounterRecord) string { + return fmt.Sprintf("%s/%s/hw_gpu_sdc/%d", sdc.JobID, sdc.GPUID, sdc.Timestamp.UnixNano()) +} + +// sdcEvidenceDescription renders the operator-facing prose for the +// SDC evidence ref. Spells out the (counter kind + delta + GPU) +// tuple so the operator can paste it into a DCGM debug query. +func sdcEvidenceDescription(sdc SDCCounterRecord) string { + kind := sdc.Kind + if kind == "" { + kind = "counter" + } + return fmt.Sprintf( + "hw.gpu.sdc.%s rose by %d on GPU %s", + kind, sdc.Delta, sdc.GPUID, + ) +} + +// evalAccuracyEvidenceUID synthesizes a stable identifier for the +// eval-accuracy evidence ref. Eval-pipeline emissions carry no +// upstream UID either. 
+func evalAccuracyEvidenceUID(eval EvalAccuracyRecord) string { + return fmt.Sprintf("%s/gen_ai_training_eval_accuracy/%d", eval.JobID, eval.Timestamp.UnixNano()) +} + +// evalAccuracyEvidenceDescription spells out the (observed, baseline, +// drop) tuple so the operator can confirm the regression magnitude +// without reaching into the verdict JSON. +func evalAccuracyEvidenceDescription(eval EvalAccuracyRecord, drop float64) string { + return fmt.Sprintf( + "gen_ai.training.eval_accuracy=%.4f vs baseline=%.4f (drop %.4f) on job %s", + eval.Accuracy, eval.BaselineAccuracy, drop, eval.JobID, + ) +} + +// sdcVendorSignaledHeadline renders the operator-facing one-liner for +// the vendor_signaled branch. Names the regression magnitude in basis +// points, the suspect GPU, and the SDC kind so the alert payload is +// self-contained at the headline level. +func sdcVendorSignaledHeadline(eval EvalAccuracyRecord, sdc SDCCounterRecord, drop float64) string { + return fmt.Sprintf( + "Eval accuracy regressed %.0fbp vs baseline on job %s; SDC counter rose on GPU %s.", + drop*10_000, eval.JobID, sdc.GPUID, + ) +} + +// sdcAccuracyOnlyHeadline renders the headline for the accuracy_only +// branch. Names the regression magnitude + job but does not claim a +// suspect GPU — the vendor SDC layer was absent. +func sdcAccuracyOnlyHeadline(eval EvalAccuracyRecord, drop float64) string { + return fmt.Sprintf( + "Eval accuracy regressed %.0fbp vs baseline on job %s; no vendor SDC signal — suspected SDC, re-run recommended.", + drop*10_000, eval.JobID, + ) +} + +// sdcRemediation returns the operator-actionable remediation prose +// branching on kind. Both branches route to the same disambiguator — +// a re-run on different hardware — because the spec is explicit that +// no single observation proves SDC. 
+func sdcRemediation(eval EvalAccuracyRecord, sdc SDCCounterRecord, kind SDCKind) string {
+	switch kind {
+	case SDCKindVendorSignaled:
+		// Prefer the node stamped on the SDC record, then the node on
+		// the eval record.
+		node := sdc.Node
+		if node == "" {
+			node = eval.Node
+		}
+		// Neither record carried a node name: print an explicit
+		// placeholder so the prose never renders as "(node )".
+		if node == "" {
+			node = "unknown"
+		}
+		return fmt.Sprintf(
+			"Suspected silent data corruption on job %s. GPU %s (node %s) emitted a vendor SDC counter rise during the job window. Drain the suspect node, re-run the same recipe + seed on different hardware to confirm. If the regression persists across hardware, the suspect is the recipe/dataset; if it disappears, escalate the original GPU via vendor SDC root-cause tooling (NVIDIA SDC catcher logs / AMD ECC counters).",
+			eval.JobID, sdc.GPUID, node,
+		)
+	case SDCKindAccuracyOnly:
+		// No SDC record joined on this branch, so only the job ID is
+		// interpolated; sdc is intentionally unused here.
+		return fmt.Sprintf(
+			"Suspected silent data corruption on job %s based on accuracy regression alone — no vendor SDC counter rose during the job window. Re-run the same recipe + seed on different hardware. If the regression persists, the suspect is the recipe/dataset (NOT the GPU); if it disappears, escalate the original node's GPUs via vendor SDC root-cause tooling. The verdict is advisory; SDC reproduction is non-deterministic.",
+			eval.JobID,
+		)
+	default:
+		// Unrecognized kinds produce no remediation prose.
+		return ""
+	}
+}
+
+// PatternIDSilentDataCorruption is the silent_data_corruption pattern
+// identifier. Matches docs/patterns/13-silent-data-corruption.md's
+// pattern.id="13".
+const PatternIDSilentDataCorruption = "13"
+
+// EvidenceKindHwGPUSDC names the vendor SDC counter evidence surface.
+// The "hw_gpu_sdc" wire value mirrors the customer-stable
+// `hw.gpu.sdc.*` metric family (spec §"Signal sources").
+const EvidenceKindHwGPUSDC = "hw_gpu_sdc"
+
+// EvidenceKindEvalAccuracy names the user-visible eval-accuracy
+// regression evidence surface. The "gen_ai_training_eval_accuracy"
+// wire value mirrors the customer-stable
+// `gen_ai.training.eval_accuracy` metric (spec §"Signal sources").
+const EvidenceKindEvalAccuracy = "gen_ai_training_eval_accuracy" diff --git a/module/pkg/patterns/silent_data_corruption_test.go b/module/pkg/patterns/silent_data_corruption_test.go new file mode 100644 index 00000000..30ccbe35 --- /dev/null +++ b/module/pkg/patterns/silent_data_corruption_test.go @@ -0,0 +1,687 @@ +// SPDX-License-Identifier: Apache-2.0 + +package patterns_test + +import ( + "encoding/json" + "os" + "path/filepath" + "regexp" + "testing" + "time" + + "github.com/santhosh-tekuri/jsonschema/v6" + "github.com/stretchr/testify/require" + + "github.com/tracecoreai/tracecore/module/pkg/patterns" +) + +// silent_data_corruption detector test suite (pattern #13). The +// detector reads per-eval `gen_ai.training.eval_accuracy` records +// alongside same-job `hw.gpu.sdc.*` counter rises and emits one +// SilentDataCorruptionVerdict per eval cycle whose accuracy regressed +// vs an operator-stamped baseline. The discriminator branches on +// vendor SDC counter presence: a same-job counter rise during the job +// window flips the verdict to high-confidence (kind=vendor_signaled); +// a larger-than-2x-threshold regression alone emits a partial- +// confidence (kind=accuracy_only) verdict. Smaller regressions and +// vendor-counter-without-regression cases emit no verdict — the +// pattern is conservative-by-design because SDC repro is non- +// deterministic (spec §"Symptom" + §"Detector evaluation rule"). + +// TestSDCDetector_PositiveVendorSignaled pins the canonical +// vendor-signaled journey: a 2pp eval accuracy regression on job +// "llama3-finetune-001" coincides with a `hw.gpu.sdc.catcher_count` +// rise on GPU PCI:0000:3b:00 during the job window. The detector +// emits one verdict with kind=vendor_signaled, confidence=full, and +// a 2-element evidence trail in causal order (SDC counter first, eval +// accuracy second). 
+func TestSDCDetector_PositiveVendorSignaled(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(2 * time.Hour) + evalAt := jobEnd.Add(5 * time.Minute) + sdcAt := jobStart.Add(45 * time.Minute) + + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "llama3-finetune-001", + Accuracy: 0.78, + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: evalAt, + Node: "gpu-node-0001", + }, + } + sdcs := []patterns.SDCCounterRecord{ + { + JobID: "llama3-finetune-001", + GPUID: "PCI:0000:3b:00", + Delta: 1, + Kind: "catcher_count", + Timestamp: sdcAt, + Node: "gpu-node-0001", + }, + } + + verdicts := patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs) + require.Len(t, verdicts, 1) + + v := verdicts[0] + require.Equal(t, patterns.PatternIDSilentDataCorruption, v.PatternID) + require.Equal(t, "llama3-finetune-001", v.JobID) + require.Equal(t, patterns.SDCKindVendorSignaled, v.Kind) + require.Equal(t, patterns.ConfidenceFull, v.Confidence) + require.InDelta(t, 0.02, v.AccuracyDrop, 1e-9, "0.80 - 0.78 = 0.02") + require.InDelta(t, 0.80, v.BaselineAccuracy, 1e-9) + require.InDelta(t, 0.78, v.ObservedAccuracy, 1e-9) + require.Equal(t, "PCI:0000:3b:00", v.SuspectGPUID) + require.Equal(t, "gpu-node-0001", v.SuspectNode) + require.Equal(t, int64(1), v.SDCCounterDelta) + require.Empty(t, v.MissingLayers) + + require.Regexp(t, regexp.MustCompile(`(?i)eval|accuracy|SDC`), v.Headline) + require.Regexp(t, regexp.MustCompile(`(?i)re-run|different hardware`), v.Remediation, + "vendor_signaled remediation must route operator to a hardware re-run") + + require.Len(t, v.EvidenceTrail, 2, "SDC counter + eval accuracy") + require.Equal(t, patterns.EvidenceKindHwGPUSDC, v.EvidenceTrail[0].Kind, "SDC first (causal)") + require.Equal(t, patterns.EvidenceKindEvalAccuracy, v.EvidenceTrail[1].Kind) +} + +// TestSDCDetector_PositiveAccuracyOnly pins the partial-confidence +// branch: a 1.5pp drop (3x default 0.005 
threshold) with NO same-job +// SDC counter emits a kind=accuracy_only / confidence=partial verdict. +// MissingLayers names the absent vendor SDC layer so the operator +// knows what to look for on the re-run. +func TestSDCDetector_PositiveAccuracyOnly(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(1 * time.Hour) + evalAt := jobEnd.Add(1 * time.Minute) + + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "llama3-finetune-002", + Accuracy: 0.785, + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: evalAt, + }, + } + + verdicts := patterns.SilentDataCorruptionDetector{}.Evaluate(evals, nil) + require.Len(t, verdicts, 1) + + v := verdicts[0] + require.Equal(t, patterns.SDCKindAccuracyOnly, v.Kind) + require.Equal(t, patterns.ConfidencePartial, v.Confidence) + require.InDelta(t, 0.015, v.AccuracyDrop, 1e-9) + require.Empty(t, v.SuspectGPUID, "no vendor SDC layer joined") + require.Empty(t, v.SuspectNode) + require.Equal(t, int64(0), v.SDCCounterDelta) + require.Equal(t, []string{patterns.EvidenceKindHwGPUSDC}, v.MissingLayers) + + require.Len(t, v.EvidenceTrail, 1, "eval accuracy only") + require.Equal(t, patterns.EvidenceKindEvalAccuracy, v.EvidenceTrail[0].Kind) +} + +// TestSDCDetector_NegativeEvalDropOnlyUnderTwoX pins the conservative- +// by-default contract: an accuracy drop ABOVE the threshold but UNDER +// 2x the threshold (the partial-branch gate) with NO vendor SDC +// counter MUST emit no verdict. Spec §"Detector evaluation rule" +// keeps the partial branch high-precision; smaller drops are noise. 
+func TestSDCDetector_NegativeEvalDropOnlyUnderTwoX(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(1 * time.Hour) + + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "llama3-finetune-003", + Accuracy: 0.793, + BaselineAccuracy: 0.80, // drop=0.007, between 0.005 and 0.010 + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(time.Minute), + }, + } + + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, nil), + "sub-2x-threshold drop without vendor SDC signal MUST NOT emit a verdict; could be model-side noise") +} + +// TestSDCDetector_NegativeSDCOnlyNoRegression pins the inverse contract +// (spec §"Edge cases" — hardware-attribution false positive): a vendor +// SDC counter rise WITHOUT a same-job accuracy regression MUST NOT +// emit a verdict. A vendor SDC event proves the GPU saw SDC; it does +// NOT prove THIS job consumed corrupted data — the detector waits for +// the user-visible symptom to confirm operator-relevance. 
+func TestSDCDetector_NegativeSDCOnlyNoRegression(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(1 * time.Hour) + + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "llama3-finetune-004", + Accuracy: 0.802, // 0.002 above baseline — within noise + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(time.Minute), + }, + } + sdcs := []patterns.SDCCounterRecord{ + { + JobID: "llama3-finetune-004", + GPUID: "PCI:0000:3b:00", + Delta: 1, + Kind: "catcher_count", + Timestamp: jobStart.Add(30 * time.Minute), + }, + } + + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs), + "vendor SDC counter without accuracy regression MUST NOT emit a verdict; SDC event is transient") +} + +// TestSDCDetector_NegativeNoBaseline pins the no-comparator guard: +// an eval with BaselineAccuracy==0 (operator did not stamp the +// reference) MUST be skipped. Spec §"Detector evaluation rule" first +// guard. +func TestSDCDetector_NegativeNoBaseline(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "j", + Accuracy: 0.50, + JobStart: jobStart, + JobEnd: jobStart.Add(time.Hour), + Timestamp: jobStart.Add(time.Hour + time.Minute), + }, + } + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, nil), + "missing BaselineAccuracy MUST skip — no comparator") +} + +// TestSDCDetector_NegativeDatasetDrift pins the dataset-drift false- +// positive guard (spec §"Edge cases"): when the eval set checksum +// differs from the baseline's checksum, the verdict MUST be +// suppressed — the apparent drop is dataset drift, not SDC. 
+func TestSDCDetector_NegativeDatasetDrift(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "j", + Accuracy: 0.70, + BaselineAccuracy: 0.80, + EvalSetChecksum: "sha256:NEW_EVALSET", + BaselineEvalSetChecksum: "sha256:OLD_EVALSET", + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(time.Minute), + }, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(30 * time.Minute)}, + } + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs), + "eval_set checksum mismatch MUST suppress — apparent drop is dataset drift") +} + +// TestSDCDetector_NegativeCherryPickedCheckpoint pins the cherry- +// picked-checkpoint false-positive guard (spec §"Edge cases"): when +// the evaluated checkpoint step differs from the baseline's +// checkpoint step, the verdict MUST be suppressed — the operator +// evaluated an intermediate checkpoint vs a final-checkpoint +// baseline, and the drop is expected. 
+func TestSDCDetector_NegativeCherryPickedCheckpoint(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "j", + Accuracy: 0.70, + BaselineAccuracy: 0.80, + CheckpointStep: 5000, + BaselineCheckpointStep: 10000, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(time.Minute), + }, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(30 * time.Minute)}, + } + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs), + "checkpoint_step mismatch MUST suppress — intermediate checkpoint vs final-checkpoint baseline") +} + +// TestSDCDetector_NegativeSDCOutsideJobWindow pins the temporal +// scoping (spec §"Detector evaluation rule" — sdc.Timestamp during +// (eval.JobStart, eval.JobEnd)): an SDC counter rise BEFORE the job +// started (or AFTER it ended) MUST NOT join — it belongs to a +// different job. With no in-window vendor signal AND drop below the +// 2x partial gate, no verdict emits. +func TestSDCDetector_NegativeSDCOutsideJobWindow(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + + // 0.7pp drop — above threshold, under partial-branch 2x gate. + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "j", + Accuracy: 0.793, + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(time.Minute), + }, + } + sdcs := []patterns.SDCCounterRecord{ + // SDC rose 10 minutes BEFORE job start — belongs to a prior job. 
+ {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(-10 * time.Minute)}, + } + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs), + "SDC counter outside job window MUST NOT join; drop alone under 2x gate emits nothing") +} + +// TestSDCDetector_NegativeCrossJobSDC pins the same-job-required +// contract: an SDC counter rise on job "other-job" MUST NOT join +// against eval on "this-job", even when both run on the same GPU at +// overlapping times. Job attribution is load-bearing — operators +// triage per-job-id. +func TestSDCDetector_NegativeCrossJobSDC(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "this-job", + Accuracy: 0.793, + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(time.Minute), + }, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "other-job", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(30 * time.Minute)}, + } + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs), + "cross-job SDC MUST NOT join; verdict requires same gen_ai.training.job_id") +} + +// TestSDCDetector_NegativeZeroDeltaSDC pins the spec's `sdc.Delta > 0` +// requirement: a zero-delta SDC record MUST NOT count as a vendor +// signal even when it lands inside the job window. Zero-delta records +// happen on every scrape between counter rises; the operator-relevant +// event is the rise. 
+func TestSDCDetector_NegativeZeroDeltaSDC(t *testing.T) {
+	t.Parallel()
+
+	jobStart := time.Unix(1_700_000_000, 0).UTC()
+	jobEnd := jobStart.Add(time.Hour)
+
+	evals := []patterns.EvalAccuracyRecord{
+		{
+			JobID:            "j",
+			Accuracy:         0.793,
+			BaselineAccuracy: 0.80,
+			JobStart:         jobStart,
+			JobEnd:           jobEnd,
+			Timestamp:        jobEnd.Add(time.Minute),
+		},
+	}
+	sdcs := []patterns.SDCCounterRecord{
+		{JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 0, Timestamp: jobStart.Add(30 * time.Minute)},
+	}
+	require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs),
+		"zero-delta SDC scrape MUST NOT join; only a counter rise (delta > 0) counts")
+}
+
+// TestSDCDetector_EdgeAtTwoXThreshold pins the partial-branch gate
+// boundary: an accuracy drop at AccuracyDropThreshold *
+// AccuracyOnlyMultiplier with no vendor SDC signal MUST fire
+// kind=accuracy_only (the gate is inclusive: drop >= gate).
+//
+// Two cases are checked. The nominal one uses the default gate
+// (0.005 * 2.0 = 0.010) with 0.80 vs 0.79; in float64 that
+// subtraction lands a few ULPs ABOVE 0.010, so on its own it cannot
+// tell ">=" from ">". The second case uses power-of-two fractions so
+// that drop and gate are bit-for-bit equal and the inclusive
+// comparison is actually exercised. The below-gate side is covered by
+// TestSDCDetector_NegativeEvalDropOnlyUnderTwoX.
+func TestSDCDetector_EdgeAtTwoXThreshold(t *testing.T) {
+	t.Parallel()
+
+	jobStart := time.Unix(1_700_000_000, 0).UTC()
+	jobEnd := jobStart.Add(time.Hour)
+	at := jobEnd.Add(time.Minute)
+
+	// Nominal drop of 0.010 against the default gate.
+	atGate := []patterns.EvalAccuracyRecord{
+		{JobID: "j", Accuracy: 0.79, BaselineAccuracy: 0.80, JobStart: jobStart, JobEnd: jobEnd, Timestamp: at},
+	}
+	got := patterns.SilentDataCorruptionDetector{}.Evaluate(atGate, nil)
+	require.Len(t, got, 1, "drop exactly at 2x gate MUST fire accuracy_only")
+	require.Equal(t, patterns.SDCKindAccuracyOnly, got[0].Kind)
+
+	// Exact boundary: 0.75 - 0.625 == 0.125 == 0.0625 * 2.0 with no
+	// rounding. Assumes the default 2.0 multiplier applies when only
+	// AccuracyDropThreshold is overridden.
+	exact := []patterns.EvalAccuracyRecord{
+		{JobID: "j", Accuracy: 0.625, BaselineAccuracy: 0.75, JobStart: jobStart, JobEnd: jobEnd, Timestamp: at},
+	}
+	gotExact := patterns.SilentDataCorruptionDetector{AccuracyDropThreshold: 0.0625}.Evaluate(exact, nil)
+	require.Len(t, gotExact, 1, "drop bit-for-bit equal to the 2x gate MUST fire accuracy_only (gate is inclusive)")
+	require.Equal(t, patterns.SDCKindAccuracyOnly, gotExact[0].Kind)
+}
+
+// TestSDCDetector_EdgeBelowThreshold pins the lower threshold
+// boundary: drop strictly below AccuracyDropThreshold (default 0.005)
+// MUST fire nothing, regardless of vendor SDC presence. The spec is
+// explicit (first guard): "if accuracy_drop < AccuracyDropThreshold:
+// skip".
+func TestSDCDetector_EdgeBelowThreshold(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + {JobID: "j", Accuracy: 0.797, BaselineAccuracy: 0.80, JobStart: jobStart, JobEnd: jobEnd, Timestamp: jobEnd.Add(time.Minute)}, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(30 * time.Minute)}, + } + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs), + "drop below AccuracyDropThreshold MUST fire nothing even with vendor SDC signal") +} + +// TestSDCDetector_ThresholdConfigurable asserts the +// AccuracyDropThreshold field overrides the default. Noisy recipes +// raise it above their run-to-run variance band; high-precision +// deployments lower it. +func TestSDCDetector_ThresholdConfigurable(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + {JobID: "j", Accuracy: 0.797, BaselineAccuracy: 0.80, JobStart: jobStart, JobEnd: jobEnd, Timestamp: jobEnd.Add(time.Minute)}, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(30 * time.Minute)}, + } + + // Default 0.005: 0.003 drop is below → no verdict. + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs)) + + // Lowered to 0.001: 0.003 drop is above → vendor_signaled fires. 
+ verdicts := patterns.SilentDataCorruptionDetector{AccuracyDropThreshold: 0.001}.Evaluate(evals, sdcs) + require.Len(t, verdicts, 1) + require.Equal(t, patterns.SDCKindVendorSignaled, verdicts[0].Kind) +} + +// TestSDCDetector_VendorSignaledWinsOverAccuracyOnly pins the +// discriminator priority: when an eval qualifies for BOTH the +// vendor_signaled branch AND the accuracy_only branch (drop >= 2x +// threshold AND a same-job SDC in-window), the verdict MUST be +// vendor_signaled (full confidence). The branches are not +// double-emitted; the full-confidence branch is strictly more +// informative. +func TestSDCDetector_VendorSignaledWinsOverAccuracyOnly(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "j", + Accuracy: 0.75, // 5pp drop — well over 2x gate + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(time.Minute), + }, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Kind: "catcher_count", Timestamp: jobStart.Add(30 * time.Minute), Node: "n"}, + } + verdicts := patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs) + require.Len(t, verdicts, 1, "no double-emission across branches") + require.Equal(t, patterns.SDCKindVendorSignaled, verdicts[0].Kind) + require.Equal(t, patterns.ConfidenceFull, verdicts[0].Confidence) +} + +// TestSDCDetector_MostRecentInWindowSDCWins pins the multiple-SDC- +// records-in-window contract: when several SDC counter rises on the +// same job fall inside the job window, the verdict cites the MOST +// RECENT one as the proximate counter rise — the strongest causal +// candidate to the eval at job-end. 
+func TestSDCDetector_MostRecentInWindowSDCWins(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + {JobID: "j", Accuracy: 0.78, BaselineAccuracy: 0.80, JobStart: jobStart, JobEnd: jobEnd, Timestamp: jobEnd.Add(time.Minute)}, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(10 * time.Minute), Node: "node-a"}, + {JobID: "j", GPUID: "PCI:0000:af:00", Delta: 1, Timestamp: jobStart.Add(50 * time.Minute), Node: "node-b"}, + } + verdicts := patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs) + require.Len(t, verdicts, 1) + require.Equal(t, "PCI:0000:af:00", verdicts[0].SuspectGPUID, "most-recent in-window SDC is the proximate cause") + require.Equal(t, "node-b", verdicts[0].SuspectNode) +} + +// TestSDCDetector_DeterministicOrder pins the golden-test stability +// contract: output sorted by (eval Timestamp ascending, JobID +// ascending) so replay runs are reproducible. 
+func TestSDCDetector_DeterministicOrder(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + mkEval := func(job string, evalAt time.Time) patterns.EvalAccuracyRecord { + return patterns.EvalAccuracyRecord{ + JobID: job, + Accuracy: 0.75, + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: evalAt, + } + } + evals := []patterns.EvalAccuracyRecord{ + mkEval("job-c", jobEnd.Add(30*time.Second)), + mkEval("job-a", jobEnd.Add(10*time.Second)), + mkEval("job-b", jobEnd.Add(20*time.Second)), + } + verdicts := patterns.SilentDataCorruptionDetector{}.Evaluate(evals, nil) + require.Len(t, verdicts, 3) + require.Equal(t, "job-a", verdicts[0].JobID, "earliest eval first") + require.Equal(t, "job-b", verdicts[1].JobID) + require.Equal(t, "job-c", verdicts[2].JobID) +} + +// TestSDCDetector_NegativeNoJobID pins the join-key requirement: an +// eval with empty JobID MUST be skipped — the same-job SDC join key +// is undefined. No silent partial-verdict either; pattern-13 is +// scoped per-job. +func TestSDCDetector_NegativeNoJobID(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + {JobID: "", Accuracy: 0.75, BaselineAccuracy: 0.80, JobStart: jobStart, JobEnd: jobEnd, Timestamp: jobEnd.Add(time.Minute)}, + } + require.Empty(t, patterns.SilentDataCorruptionDetector{}.Evaluate(evals, nil), + "empty JobID MUST skip — join key undefined") +} + +// TestSDCDetector_JobEndFallbackToTimestamp pins the JobEnd-fallback +// contract: when an eval omits JobEnd (e.g. an ongoing-job evaluation +// pass), the upper window bound falls back to the eval Timestamp. +// SDC rises before the eval Timestamp still join. 
+func TestSDCDetector_JobEndFallbackToTimestamp(t *testing.T) { + t.Parallel() + + jobStart := time.Unix(1_700_000_000, 0).UTC() + evalAt := jobStart.Add(45 * time.Minute) + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "j", + Accuracy: 0.78, + BaselineAccuracy: 0.80, + JobStart: jobStart, + // JobEnd intentionally zero. + Timestamp: evalAt, + }, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "j", GPUID: "PCI:0000:3b:00", Delta: 1, Timestamp: jobStart.Add(30 * time.Minute)}, + } + verdicts := patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs) + require.Len(t, verdicts, 1, "zero JobEnd falls back to eval Timestamp for upper bound") + require.Equal(t, patterns.SDCKindVendorSignaled, verdicts[0].Kind) +} + +// TestSDCVerdict_SchemaConformance pins the SilentDataCorruption- +// Verdict JSON shape against testdata/silent_data_corruption_verdict. +// schema.json. Struct drift or schema loosening fails this test +// before it ships. +func TestSDCVerdict_SchemaConformance(t *testing.T) { + t.Parallel() + + schemaPath := filepath.Join("testdata", "silent_data_corruption_verdict.schema.json") + schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path + require.NoError(t, err) + + compiler := jsonschema.NewCompiler() + var schemaDoc any + require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc)) + require.NoError(t, compiler.AddResource(schemaPath, schemaDoc)) + schema, err := compiler.Compile(schemaPath) + require.NoError(t, err) + + jobStart := time.Unix(1_700_000_000, 0).UTC() + jobEnd := jobStart.Add(time.Hour) + evals := []patterns.EvalAccuracyRecord{ + { + JobID: "llama3-finetune-001", + Accuracy: 0.78, + BaselineAccuracy: 0.80, + JobStart: jobStart, + JobEnd: jobEnd, + Timestamp: jobEnd.Add(5 * time.Minute), + Node: "gpu-node-0001", + }, + } + sdcs := []patterns.SDCCounterRecord{ + {JobID: "llama3-finetune-001", GPUID: "PCI:0000:3b:00", Delta: 1, Kind: "catcher_count", Timestamp: jobStart.Add(45 * 
time.Minute), Node: "gpu-node-0001"}, + } + verdicts := patterns.SilentDataCorruptionDetector{}.Evaluate(evals, sdcs) + require.Len(t, verdicts, 1) + + bs, err := json.Marshal(verdicts[0]) + require.NoError(t, err) + var decoded any + require.NoError(t, json.Unmarshal(bs, &decoded)) + require.NoError(t, schema.Validate(decoded), + "vendor_signaled verdict failed schema validation; struct drifted or schema needs updating") + + // Also validate an accuracy_only verdict (different evidence shape). + evals2 := []patterns.EvalAccuracyRecord{ + {JobID: "j", Accuracy: 0.785, BaselineAccuracy: 0.80, JobStart: jobStart, JobEnd: jobEnd, Timestamp: jobEnd.Add(time.Minute)}, + } + v2 := patterns.SilentDataCorruptionDetector{}.Evaluate(evals2, nil) + require.Len(t, v2, 1) + bs2, err := json.Marshal(v2[0]) + require.NoError(t, err) + var decoded2 any + require.NoError(t, json.Unmarshal(bs2, &decoded2)) + require.NoError(t, schema.Validate(decoded2), + "accuracy_only verdict failed schema validation") +} + +// TestSDCVerdict_SchemaRejectsDrift is the drift-rejection battery. +// Each row is a falsifier for one schema constraint; removing the +// constraint lets the mutated document validate, so require.Error +// fails that row. 
+func TestSDCVerdict_SchemaRejectsDrift(t *testing.T) { + t.Parallel() + + schemaPath := filepath.Join("testdata", "silent_data_corruption_verdict.schema.json") + schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path + require.NoError(t, err) + + compiler := jsonschema.NewCompiler() + var schemaDoc any + require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc)) + require.NoError(t, compiler.AddResource(schemaPath, schemaDoc)) + schema, err := compiler.Compile(schemaPath) + require.NoError(t, err) + + validEvidence := []any{ + map[string]any{"kind": "hw_gpu_sdc", "uid": "u1", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + map[string]any{"kind": "gen_ai_training_eval_accuracy", "uid": "u2", "timestamp": "2026-05-18T10:00:30Z", "description": "d"}, + } + base := func() map[string]any { + return map[string]any{ + "pattern.id": "13", + "headline": "x", + "remediation": "y", + "confidence": "full", + "kind": "vendor_signaled", + "gen_ai.training.job_id": "j", + "accuracy_drop": 0.02, + "baseline_accuracy": 0.80, + "observed_accuracy": 0.78, + "suspect_gpu_id": "PCI:0000:3b:00", + "suspect_node": "n", + "sdc_counter_delta": int64(1), + "evidence_trail": validEvidence, + } + } + + cases := []struct { + name string + mutate func(map[string]any) + guardName string + }{ + {"extra_top_level_field", func(m map[string]any) { m["future_field"] = "rejected" }, "additionalProperties:false"}, + {"pattern_id_numeric_not_string", func(m map[string]any) { m["pattern.id"] = 13 }, "pattern.id string const"}, + {"pattern_id_wrong_value", func(m map[string]any) { m["pattern.id"] = "99" }, "pattern.id const guard"}, + {"kind_outside_enum", func(m map[string]any) { m["kind"] = "definitely_sdc" }, "kind enum"}, + {"confidence_outside_enum", func(m map[string]any) { m["confidence"] = "maybe" }, "confidence enum"}, + {"job_id_empty", func(m map[string]any) { m["gen_ai.training.job_id"] = "" }, "job_id minLength"}, + 
{"accuracy_drop_negative", func(m map[string]any) { m["accuracy_drop"] = -0.1 }, "accuracy_drop minimum"}, + {"accuracy_drop_over_one", func(m map[string]any) { m["accuracy_drop"] = 1.1 }, "accuracy_drop maximum"}, + {"baseline_accuracy_over_one", func(m map[string]any) { m["baseline_accuracy"] = 1.5 }, "baseline_accuracy maximum"}, + {"observed_accuracy_negative", func(m map[string]any) { m["observed_accuracy"] = -0.1 }, "observed_accuracy minimum"}, + {"sdc_counter_delta_negative", func(m map[string]any) { m["sdc_counter_delta"] = -1 }, "sdc_counter_delta minimum"}, + {"evidence_kind_outside_enum", func(m map[string]any) { + m["evidence_trail"] = []any{ + map[string]any{"kind": "kernel_event", "uid": "u", "timestamp": "2026-05-18T10:00:00Z", "description": "d"}, + } + }, "evidence_trail.kind enum"}, + {"evidence_trail_empty", func(m map[string]any) { + m["evidence_trail"] = []any{} + }, "evidence_trail minItems:1"}, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + m := base() + tc.mutate(m) + require.Error(t, schema.Validate(m), + "schema must reject %s; guard %q regressed", tc.name, tc.guardName) + }) + } +} diff --git a/module/pkg/patterns/testdata/silent_data_corruption_verdict.schema.json b/module/pkg/patterns/testdata/silent_data_corruption_verdict.schema.json new file mode 100644 index 00000000..c984b02b --- /dev/null +++ b/module/pkg/patterns/testdata/silent_data_corruption_verdict.schema.json @@ -0,0 +1,105 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://tracecore.ai/schemas/patterns/silent_data_corruption_verdict/v0", + "title": "SilentDataCorruptionVerdict", + "description": "v0.3.x pattern #13 (silent_data_corruption) verdict shape. Pinned by TestSDCVerdict_SchemaConformance. 
Emitted when an eval-accuracy regression vs baseline crosses the threshold; a same-job hw.gpu.sdc.* counter rise within the job window flips the verdict to full-confidence (kind=vendor_signaled). The verdict is advisory — operators re-run the recipe on different hardware to confirm.", + "type": "object", + "required": ["pattern.id", "headline", "remediation", "confidence", "kind", "evidence_trail", "gen_ai.training.job_id", "accuracy_drop"], + "additionalProperties": false, + "properties": { + "pattern.id": { + "type": "string", + "const": "13", + "description": "silent_data_corruption pattern identifier; matches docs/patterns/13-silent-data-corruption.md." + }, + "headline": { + "type": "string", + "minLength": 1 + }, + "remediation": { + "type": "string", + "minLength": 1 + }, + "confidence": { + "type": "string", + "enum": ["full", "partial"], + "description": "full = vendor SDC counter joined the accuracy regression (kind=vendor_signaled); partial = accuracy regression alone (kind=accuracy_only)." + }, + "kind": { + "type": "string", + "enum": ["vendor_signaled", "accuracy_only"], + "description": "Discriminator. vendor_signaled = same-job hw.gpu.sdc.* counter rose during the job window; accuracy_only = no vendor signal but accuracy_drop >= 2x threshold." + }, + "gen_ai.training.job_id": { + "type": "string", + "minLength": 1, + "description": "Affected training job id (per docs/patterns/13 §Verdict attributes)." + }, + "accuracy_drop": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "baseline - observed_accuracy in absolute units. Bounded [0, 1] because eval accuracy is a [0, 1] probability." + }, + "baseline_accuracy": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "Operator-provided reference run accuracy; eval was compared against this scalar." + }, + "observed_accuracy": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "Eval-pass accuracy on this run." 
+ }, + "suspect_gpu_id": { + "type": "string", + "description": "Customer-stable PCI BDF of the GPU whose vendor SDC counter rose during the job window. Empty when kind=accuracy_only." + }, + "suspect_node": { + "type": "string", + "description": "Kubernetes node name carrying the suspect GPU. Empty when kind=accuracy_only." + }, + "sdc_counter_delta": { + "type": "integer", + "minimum": 0, + "description": "Vendor SDC counter rise observed during the job window. Zero when kind=accuracy_only." + }, + "missing_layers": { + "type": "array", + "items": { + "type": "string", + "enum": ["hw_gpu_sdc"] + }, + "description": "Evidence layers that did not join. Populated when confidence=partial (kind=accuracy_only)." + }, + "evidence_trail": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": ["kind", "uid", "timestamp", "description"], + "additionalProperties": false, + "properties": { + "kind": { + "type": "string", + "enum": ["gen_ai_training_eval_accuracy", "hw_gpu_sdc"] + }, + "uid": { + "type": "string", + "minLength": 1 + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "description": { + "type": "string", + "minLength": 1 + } + } + } + } + } +} diff --git a/module/processor/patterndetectorprocessor/config.go b/module/processor/patterndetectorprocessor/config.go index 809e6e74..9871fcc5 100644 --- a/module/processor/patterndetectorprocessor/config.go +++ b/module/processor/patterndetectorprocessor/config.go @@ -99,6 +99,25 @@ const DefaultCUDAOOMCorrelationWindow = 2 * time.Minute // lower it. const DefaultCUDAOOMFBFreeFragmentationThreshold = 0.05 +// DefaultSDCAccuracyDropThreshold mirrors +// patterns.DefaultSDCAccuracyDropThreshold — the absolute eval +// accuracy regression (baseline - observed) at or above which an +// eval cycle becomes pattern-13 eligible. 0.005 (0.5pp) matches the +// spec's typical SDC regression band. 
Operators with noisier recipes +// raise this above their run-to-run variance band via +// sdc_accuracy_drop_threshold. +const DefaultSDCAccuracyDropThreshold = 0.005 + +// DefaultSDCAccuracyOnlyMultiplier mirrors +// patterns.DefaultSDCAccuracyOnlyMultiplier — the threshold +// multiplier that gates the partial-confidence (kind=accuracy_only) +// branch. 2.0 keeps the partial branch high-precision (drop must be +// at least 2x the base threshold before the detector commits to an +// advisory verdict without vendor SDC signal). Operators tuning for +// high-precision-only deployments raise it via +// sdc_accuracy_only_multiplier. +const DefaultSDCAccuracyOnlyMultiplier = 2.0 + // Config is the operator-facing YAML for the patterndetector processor. type Config struct { // JoinWindow is the max gap between a node-pressure condition's @@ -200,6 +219,22 @@ type Config struct { // [0, 1] — outside the range is unreachable on a well-formed // FB record. CUDAOOMFBFreeFragmentationThreshold float64 `yaml:"cuda_oom_fb_free_fragmentation_threshold,omitempty" mapstructure:"cuda_oom_fb_free_fragmentation_threshold"` + + // SDCAccuracyDropThreshold is the absolute eval-accuracy drop + // (baseline - observed) at or above which an eval cycle becomes + // pattern-13 eligible. Zero means use + // DefaultSDCAccuracyDropThreshold. Must be in [0, 1] — eval + // accuracy is a probability; a threshold above 1 can never + // fire and a negative one is meaningless. + SDCAccuracyDropThreshold float64 `yaml:"sdc_accuracy_drop_threshold,omitempty" mapstructure:"sdc_accuracy_drop_threshold"` + + // SDCAccuracyOnlyMultiplier is the threshold multiplier that + // gates the partial-confidence (kind=accuracy_only) branch. + // Zero means use DefaultSDCAccuracyOnlyMultiplier. Must be >= 1 + // — a multiplier below 1 would make the partial branch easier + // to trigger than the vendor_signaled branch, inverting the + // confidence semantics.
+ SDCAccuracyOnlyMultiplier float64 `yaml:"sdc_accuracy_only_multiplier,omitempty" mapstructure:"sdc_accuracy_only_multiplier"` } // Validate enforces operator-actionable rules. @@ -246,6 +281,12 @@ func (c *Config) Validate() error { if c.CUDAOOMFBFreeFragmentationThreshold < 0 || c.CUDAOOMFBFreeFragmentationThreshold > 1 { return fmt.Errorf("cuda_oom_fb_free_fragmentation_threshold: must be in [0, 1], got %v", c.CUDAOOMFBFreeFragmentationThreshold) } + if c.SDCAccuracyDropThreshold < 0 || c.SDCAccuracyDropThreshold > 1 { + return fmt.Errorf("sdc_accuracy_drop_threshold: must be in [0, 1], got %v", c.SDCAccuracyDropThreshold) + } + if c.SDCAccuracyOnlyMultiplier != 0 && c.SDCAccuracyOnlyMultiplier < 1 { + return fmt.Errorf("sdc_accuracy_only_multiplier: must be >= 1, got %v", c.SDCAccuracyOnlyMultiplier) + } return nil } @@ -267,6 +308,8 @@ func defaultConfig() *Config { IBLinkFlapMinTransitions: DefaultIBLinkFlapMinTransitions, CUDAOOMCorrelationWindow: DefaultCUDAOOMCorrelationWindow, CUDAOOMFBFreeFragmentationThreshold: DefaultCUDAOOMFBFreeFragmentationThreshold, + SDCAccuracyDropThreshold: DefaultSDCAccuracyDropThreshold, + SDCAccuracyOnlyMultiplier: DefaultSDCAccuracyOnlyMultiplier, } } @@ -318,6 +361,12 @@ func (c *Config) withDefaults() *Config { if out.IBLinkFlapMinTransitions == 0 { out.IBLinkFlapMinTransitions = DefaultIBLinkFlapMinTransitions } + if out.SDCAccuracyDropThreshold == 0 { + out.SDCAccuracyDropThreshold = DefaultSDCAccuracyDropThreshold + } + if out.SDCAccuracyOnlyMultiplier == 0 { + out.SDCAccuracyOnlyMultiplier = DefaultSDCAccuracyOnlyMultiplier + } return &out } diff --git a/module/processor/patterndetectorprocessor/example_config.yaml b/module/processor/patterndetectorprocessor/example_config.yaml index 43be7954..95342751 100644 --- a/module/processor/patterndetectorprocessor/example_config.yaml +++ b/module/processor/patterndetectorprocessor/example_config.yaml @@ -52,6 +52,17 @@ processors: # the window emits a partial verdict 
(kind=unknown). cuda_oom_correlation_window: 2m cuda_oom_fb_free_fragmentation_threshold: 0.05 + # silent_data_corruption pattern (#13): absolute eval-accuracy + # regression (baseline - observed) at or above which a same-job + # eval becomes pattern-13 eligible, and the threshold multiplier + # that gates the partial-confidence (kind=accuracy_only) branch. + # An eval drop joined by a same-job hw.gpu.sdc.* counter rise + # during the job window emits kind=vendor_signaled (full); a drop + # alone of >= multiplier × threshold emits kind=accuracy_only + # (partial). Verdict is advisory — operators re-run the recipe on + # different hardware to confirm. + sdc_accuracy_drop_threshold: 0.005 + sdc_accuracy_only_multiplier: 2.0 exporters: debug: diff --git a/module/processor/patterndetectorprocessor/patterndetector.go b/module/processor/patterndetectorprocessor/patterndetector.go index 0dba29ea..6293e869 100644 --- a/module/processor/patterndetectorprocessor/patterndetector.go +++ b/module/processor/patterndetectorprocessor/patterndetector.go @@ -315,6 +315,7 @@ func (p *patterndetectorProcessor) ConsumeLogs(ctx context.Context, ld plog.Logs } runCUDAOOMDetector(ld, p.cfg, p.cfg.emitPartialEnabled(), p.logger()) + runSDCDetector(ld, p.cfg, p.cfg.emitPartialEnabled(), p.logger()) if err := p.next.ConsumeLogs(ctx, ld); err != nil { return fmt.Errorf("patterndetector: next.ConsumeLogs: %w", err) diff --git a/module/processor/patterndetectorprocessor/silent_data_corruption.go b/module/processor/patterndetectorprocessor/silent_data_corruption.go new file mode 100644 index 00000000..0b1839b8 --- /dev/null +++ b/module/processor/patterndetectorprocessor/silent_data_corruption.go @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: Apache-2.0 + +package patterndetectorprocessor + +import ( + "time" + + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/plog" + "go.uber.org/zap" + + "github.com/tracecoreai/tracecore/module/pkg/patterns" +) + +// Silent 
+// data corruption (pattern #13) verdict-attribute namespace.
+// Promoted onto the verdict log record per issue #270 scalar-promotion
+// contract so downstream LogQL / Grafana queries can table-aggregate
+// by job + suspect-GPU + kind without server-side parsing of
+// pattern.verdict_json. Attribute keys mirror the customer-stable
+// `tracecore.alert.silent_data_corruption.*` family the
+// docs/patterns/13-silent-data-corruption.md §"Verdict attributes"
+// table pins. The one key outside that family is the job id, which
+// reuses `gen_ai.training.job_id` (see verdictAttrSDCJobID).
+const (
+	// verdictAttrSDCKind promotes the silent_data_corruption
+	// discriminator branch (`vendor_signaled` / `accuracy_only`).
+	verdictAttrSDCKind = "tracecore.alert.silent_data_corruption.kind"
+
+	// verdictAttrSDCAccuracyDrop promotes the absolute accuracy drop
+	// (baseline - observed) so dashboards can branch on
+	// regression-magnitude buckets without parsing JSON. Range
+	// [0, 1].
+	verdictAttrSDCAccuracyDrop = "tracecore.alert.silent_data_corruption.accuracy_drop"
+
+	// verdictAttrSDCSuspectGPUID promotes the PCI BDF of the GPU
+	// whose vendor SDC counter rose during the job window. Empty
+	// when kind=accuracy_only — guarded by putStrIfSet so the
+	// attribute is omitted (no empty-string false-match on dashboard
+	// filters).
+	verdictAttrSDCSuspectGPUID = "tracecore.alert.silent_data_corruption.suspect_gpu_id"
+
+	// verdictAttrSDCSuspectNode promotes the Kubernetes node name
+	// carrying the suspect GPU. Empty when kind=accuracy_only; like
+	// the suspect GPU id it is written through putStrIfSet, so the
+	// attribute is omitted rather than stamped empty.
+	verdictAttrSDCSuspectNode = "tracecore.alert.silent_data_corruption.suspect_node"
+
+	// verdictAttrSDCJobID promotes the affected training job id.
+	// Mirrors the customer-stable `gen_ai.training.job_id` attribute
+	// the eval-pipeline OTTL recipe emits — same name on the verdict
+	// as on the underlying input record so a LogQL query joins
+	// without renaming.
+	verdictAttrSDCJobID = "gen_ai.training.job_id"
+)
+
+// projectEvalAccuracyRecord reads OTel attributes off a log record
+// and builds a patterns.EvalAccuracyRecord.
+// The projection's gate is the presence of BOTH a numeric
+// `gen_ai.training.eval_accuracy` AND `gen_ai.training.job_id` —
+// accuracy alone is meaningless without the job-attribution scalar
+// (the same-job SDC join key); the job_id alone is a non-eval
+// framework attribute (every training log carries it).
+//
+// Accuracy and baseline must be double- or int-typed. A mistyped
+// accuracy (for example a string) fails the gate instead of being
+// read through pcommon.Value.Double, which reports 0 for every
+// non-double value and would therefore look like a collapse to zero
+// accuracy. A mistyped baseline is treated as absent.
+//
+// Resource-attr fallback for job_id mirrors the gpu.id resource-attr
+// fallback on the cuda_oom / xid projections: the eval-pipeline
+// recipe author may stamp the job identifier on either the resource
+// (cluster-stable identifier) or the per-record level.
+//
+// The integration-gap follow-up is the eval-pipeline OTTL recipe +
+// upstream framework instrumentation for `gen_ai.training.
+// eval_accuracy` (spec §"Open questions" 5). Until those land the
+// projection compiles but no real-world input arrives at runtime; the
+// wiring tests exercise it against hand-crafted records.
+//
+// Returns the projected record and true when the gate passes, or the
+// zero record and false otherwise.
+func projectEvalAccuracyRecord(lr plog.LogRecord, resAttrs pcommon.Map) (patterns.EvalAccuracyRecord, bool) {
+	attrs := lr.Attributes()
+	rawAccuracy, ok := attrs.Get("gen_ai.training.eval_accuracy")
+	if !ok {
+		return patterns.EvalAccuracyRecord{}, false
+	}
+	// Refuse mistyped accuracy rather than silently reading it as 0.
+	accuracy, ok := evalAccuracyAsFloat64(rawAccuracy)
+	if !ok {
+		return patterns.EvalAccuracyRecord{}, false
+	}
+	// Record-level job id wins; the resource-level stamp is the fallback.
+	jobID, ok := attrs.Get("gen_ai.training.job_id")
+	if !ok {
+		jobID, ok = resAttrs.Get("gen_ai.training.job_id")
+	}
+	if !ok {
+		return patterns.EvalAccuracyRecord{}, false
+	}
+	r := patterns.EvalAccuracyRecord{
+		JobID:    jobID.AsString(),
+		Accuracy: accuracy,
+	}
+	if v, ok := attrs.Get("gen_ai.training.eval_accuracy.baseline"); ok {
+		// A mistyped baseline leaves the field at its zero value,
+		// exactly as an absent baseline does.
+		if baseline, numeric := evalAccuracyAsFloat64(v); numeric {
+			r.BaselineAccuracy = baseline
+		}
+	}
+	if v, ok := attrs.Get("gen_ai.training.eval_set.checksum"); ok {
+		r.EvalSetChecksum = v.AsString()
+	}
+	if v, ok := attrs.Get("gen_ai.training.eval_set.baseline_checksum"); ok {
+		r.BaselineEvalSetChecksum = v.AsString()
+	}
+	if v, ok := attrs.Get("gen_ai.training.checkpoint.step"); ok {
+		r.CheckpointStep = v.Int()
+	}
+	if v, ok := attrs.Get("gen_ai.training.checkpoint.baseline_step"); ok {
+		r.BaselineCheckpointStep = v.Int()
+	}
+	if v, ok := attrs.Get("gen_ai.training.job.start_unix_nano"); ok {
+		r.JobStart = time.Unix(0, v.Int())
+	}
+	if v, ok := attrs.Get("gen_ai.training.job.end_unix_nano"); ok {
+		r.JobEnd = time.Unix(0, v.Int())
+	}
+	// Node name: the resource-level stamp wins over a per-record one.
+	if v, ok := resAttrs.Get("k8s.node.name"); ok {
+		r.Node = v.AsString()
+	} else if v, ok := attrs.Get("k8s.node.name"); ok {
+		r.Node = v.AsString()
+	}
+	// Prefer the event timestamp; fall back to the observed timestamp
+	// when the event timestamp is unset (zero).
+	if t := lr.Timestamp(); t != 0 {
+		r.Timestamp = t.AsTime()
+	} else {
+		r.Timestamp = lr.ObservedTimestamp().AsTime()
+	}
+	return r, true
+}
+
+// evalAccuracyAsFloat64 converts a double- or int-typed attribute
+// value to float64. It reports false for every other value type so
+// callers can tell "mistyped" apart from a genuine 0.
+func evalAccuracyAsFloat64(v pcommon.Value) (float64, bool) {
+	switch v.Type() {
+	case pcommon.ValueTypeDouble:
+		return v.Double(), true
+	case pcommon.ValueTypeInt:
+		return float64(v.Int()), true
+	default:
+		return 0, false
+	}
+}
+
+// projectSDCCounterRecord reads OTel attributes off a log record and
+// builds a patterns.SDCCounterRecord. The projection's gate is the
+// presence of BOTH `hw.gpu.sdc.delta` AND `gen_ai.training.job_id` —
+// the counter rise without a job tag can't join with an eval (job-
+// scoped), and the job_id alone is not an SDC signal. `gpu.id` is
+// strongly recommended (the verdict promotes it as `suspect_gpu_id`)
+// but optional — an unknown-BDF SDC event still helps the operator
+// (the node attribution suffices to drain).
+//
+// Resource-attr fallback for job_id and gpu.id mirrors the eval-
+// accuracy projection and the cuda_oom / xid projections.
+func projectSDCCounterRecord(lr plog.LogRecord, resAttrs pcommon.Map) (patterns.SDCCounterRecord, bool) {
+	attrs := lr.Attributes()
+	rawDelta, ok := attrs.Get("hw.gpu.sdc.delta")
+	if !ok {
+		return patterns.SDCCounterRecord{}, false
+	}
+	// pcommon.Value.Int reports 0 for every non-int value, so a
+	// double-typed delta would silently lose the vendor signal and a
+	// string-typed one would masquerade as "no rise". Convert
+	// numerics explicitly and refuse everything else.
+	delta, ok := sdcDeltaAsInt64(rawDelta)
+	if !ok {
+		return patterns.SDCCounterRecord{}, false
+	}
+	// Record-level job id wins; the resource-level stamp is the fallback.
+	jobID, ok := attrs.Get("gen_ai.training.job_id")
+	if !ok {
+		jobID, ok = resAttrs.Get("gen_ai.training.job_id")
+	}
+	if !ok {
+		return patterns.SDCCounterRecord{}, false
+	}
+	r := patterns.SDCCounterRecord{
+		JobID: jobID.AsString(),
+		Delta: delta,
+	}
+	// gpu.id is optional: record-level first, then resource-level.
+	if v, ok := attrs.Get("gpu.id"); ok {
+		r.GPUID = v.AsString()
+	} else if v, ok := resAttrs.Get("gpu.id"); ok {
+		r.GPUID = v.AsString()
+	}
+	if v, ok := attrs.Get("hw.gpu.sdc.kind"); ok {
+		r.Kind = v.AsString()
+	}
+	// Node name: the resource-level stamp wins over a per-record one.
+	if v, ok := resAttrs.Get("k8s.node.name"); ok {
+		r.Node = v.AsString()
+	} else if v, ok := attrs.Get("k8s.node.name"); ok {
+		r.Node = v.AsString()
+	}
+	// Prefer the event timestamp; fall back to the observed timestamp
+	// when the event timestamp is unset (zero).
+	if t := lr.Timestamp(); t != 0 {
+		r.Timestamp = t.AsTime()
+	} else {
+		r.Timestamp = lr.ObservedTimestamp().AsTime()
+	}
+	return r, true
+}
+
+// sdcDeltaAsInt64 converts an int- or double-typed counter delta to
+// int64. Doubles are truncated toward zero and must lie within the
+// range float64 represents exactly (±2^53). It reports false for
+// NaN, out-of-range doubles and every non-numeric value type.
+func sdcDeltaAsInt64(v pcommon.Value) (int64, bool) {
+	switch v.Type() {
+	case pcommon.ValueTypeInt:
+		return v.Int(), true
+	case pcommon.ValueTypeDouble:
+		const maxExact = float64(1 << 53)
+		f := v.Double()
+		// Written as a negated range test on purpose: every
+		// comparison with NaN is false, so NaN is rejected too.
+		if !(f >= -maxExact && f <= maxExact) {
+			return 0, false
+		}
+		return int64(f), true
+	default:
+		return 0, false
+	}
+}
+
+// appendSilentDataCorruptionVerdict emits a silent_data_corruption
+// verdict log record. Promoted attrs: pattern.confidence,
+// tracecore.alert.silent_data_corruption.{kind,accuracy_drop,
+// suspect_gpu_id,suspect_node}, gen_ai.training.job_id (issue #270
+// scalar-promotion contract).
+//
+// putStrIfSet guards optional scalars (suspect_gpu_id, suspect_node)
+// so the partial-confidence (kind=accuracy_only) verdict — which
+// carries no suspect hardware — does not stamp empty-string
+// attributes that would silently match empty-filter dashboard
+// queries.
+func appendSilentDataCorruptionVerdict(ld plog.Logs, v patterns.SilentDataCorruptionVerdict, logger *zap.Logger) {
+	// Fields shared by every pattern's verdict record.
+	common := verdictCommon{
+		PatternID:     v.PatternID,
+		Headline:      v.Headline,
+		Remediation:   v.Remediation,
+		EvidenceTrail: v.EvidenceTrail,
+	}
+	// promote stamps the pattern-specific scalars. Confidence, kind
+	// and accuracy drop are always written; the suspect GPU, suspect
+	// node and job id go through putStrIfSet.
+	promote := func(attrs pcommon.Map) {
+		attrs.PutStr(verdictAttrConfidence, string(v.Confidence))
+		attrs.PutStr(verdictAttrSDCKind, string(v.Kind))
+		attrs.PutDouble(verdictAttrSDCAccuracyDrop, v.AccuracyDrop)
+		putStrIfSet(attrs, verdictAttrSDCSuspectGPUID, v.SuspectGPUID)
+		putStrIfSet(attrs, verdictAttrSDCSuspectNode, v.SuspectNode)
+		putStrIfSet(attrs, verdictAttrSDCJobID, v.JobID)
+	}
+	appendVerdictRecord(ld, logger, common, v, "silent_data_corruption", promote)
+}
+
+// collectSDCInputs walks the incoming plog.Logs and projects
+// eval-accuracy + SDC-counter log records out. Hoisted into the
+// silent_data_corruption-specific file (mirroring cuda_oom's
+// collectCUDAOOMInputs) so the cross-cutting `collectInputs` doesn't
+// have to grow another two return values per pattern.
+//
+// Skips records the processor itself emitted (the verdict scope) so a
+// downstream re-injection of a silent_data_corruption verdict is not
+// re-projected as input.
+func collectSDCInputs(ld plog.Logs) ([]patterns.EvalAccuracyRecord, []patterns.SDCCounterRecord) {
+	var (
+		evalRecs []patterns.EvalAccuracyRecord
+		sdcRecs  []patterns.SDCCounterRecord
+	)
+	resources := ld.ResourceLogs()
+	for ri := 0; ri < resources.Len(); ri++ {
+		resource := resources.At(ri)
+		resAttrs := resource.Resource().Attributes()
+		scopes := resource.ScopeLogs()
+		for si := 0; si < scopes.Len(); si++ {
+			scope := scopes.At(si)
+			// Never re-ingest verdicts this processor emitted.
+			if scope.Scope().Name() == instrumentationScope {
+				continue
+			}
+			records := scope.LogRecords()
+			for li := 0; li < records.Len(); li++ {
+				lr := records.At(li)
+				// The eval projection takes precedence: a record that
+				// passes it is never also tried as an SDC counter.
+				if eval, isEval := projectEvalAccuracyRecord(lr, resAttrs); isEval {
+					evalRecs = append(evalRecs, eval)
+				} else if sdc, isSDC := projectSDCCounterRecord(lr, resAttrs); isSDC {
+					sdcRecs = append(sdcRecs, sdc)
+				}
+			}
+		}
+	}
+	return evalRecs, sdcRecs
+}
+
+// runSDCDetector evaluates the silent_data_corruption pattern over
+// one batch and appends a verdict record to ld for each match. It
+// lives next to the projections and the verdict writer (rather than
+// inside patterndetectorProcessor.ConsumeLogs) so the pattern's whole
+// surface stays in one file, mirroring cuda_oom.
+//
+// A batch without any eval-accuracy record is a no-op. Partial-
+// confidence verdicts are dropped unless emitPartial is true.
+func runSDCDetector(ld plog.Logs, cfg *Config, emitPartial bool, logger *zap.Logger) {
+	evals, sdcs := collectSDCInputs(ld)
+	if len(evals) == 0 {
+		return
+	}
+	detector := patterns.SilentDataCorruptionDetector{
+		AccuracyDropThreshold:  sdcAccuracyDropThreshold(cfg),
+		AccuracyOnlyMultiplier: sdcAccuracyOnlyMultiplier(cfg),
+	}
+	verdicts := detector.Evaluate(evals, sdcs)
+	for _, verdict := range verdicts {
+		if emitPartial || verdict.Confidence != patterns.ConfidencePartial {
+			appendSilentDataCorruptionVerdict(ld, verdict, logger)
+		}
+	}
+}
+
+// sdcAccuracyDropThreshold + sdcAccuracyOnlyMultiplier pull from
+// Config with the library-level defaults as the fallback. Hoisted so
+// the runSDCDetector call site is one expression per knob.
+func sdcAccuracyDropThreshold(cfg *Config) float64 {
+	// Cases are evaluated left to right, so the nil check guards the
+	// field access that follows it.
+	switch {
+	case cfg == nil, cfg.SDCAccuracyDropThreshold <= 0:
+		return patterns.DefaultSDCAccuracyDropThreshold
+	default:
+		return cfg.SDCAccuracyDropThreshold
+	}
+}
+
+// sdcAccuracyOnlyMultiplier is the multiplier counterpart of
+// sdcAccuracyDropThreshold: a nil config or a non-positive value
+// yields the library default.
+func sdcAccuracyOnlyMultiplier(cfg *Config) float64 {
+	switch {
+	case cfg == nil, cfg.SDCAccuracyOnlyMultiplier <= 0:
+		return patterns.DefaultSDCAccuracyOnlyMultiplier
+	default:
+		return cfg.SDCAccuracyOnlyMultiplier
+	}
+}
diff --git a/module/processor/patterndetectorprocessor/silent_data_corruption_test.go b/module/processor/patterndetectorprocessor/silent_data_corruption_test.go
new file mode 100644
index 00000000..1a9cd93f
--- /dev/null
+++ b/module/processor/patterndetectorprocessor/silent_data_corruption_test.go
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: Apache-2.0
+
+package patterndetectorprocessor
+
+import (
+	"context"
+	"encoding/json"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/collector/pdata/pcommon"
+	"go.opentelemetry.io/collector/pdata/plog"
+
+	"github.com/tracecoreai/tracecore/module/pkg/patterns"
+)
+
+// TestPatternDetector_SDCWiringEmitsVendorSignaledVerdict covers the
+// canonical wiring path: one eval-accuracy record (accuracy, baseline
+// and job id) plus one same-job hw.gpu.sdc.delta record stamped inside
+// the job window must produce exactly one silent_data_corruption
+// verdict with kind=vendor_signaled, and the promoted scalars on the
+// verdict log record must carry the same values.
+func TestPatternDetector_SDCWiringEmitsVendorSignaledVerdict(t *testing.T) {
+	t.Parallel()
+
+	const (
+		jobID = "llama3-finetune-001"
+		node  = "gpu-node-0001"
+		gpuID = "PCI:0000:3b:00"
+	)
+	jobStart := mustParseTime(t, "2026-06-01T08:00:00Z")
+	jobEnd := jobStart.Add(2 * time.Hour)
+	sdcAt := jobStart.Add(45 * time.Minute)
+	evalAt := jobEnd.Add(5 * time.Minute)
+
+	logs := plog.NewLogs()
+
+	// Vendor SDC counter record: the node name sits on the resource;
+	// job, GPU, delta and kind sit on the record itself.
+	sdcResource := logs.ResourceLogs().AppendEmpty()
+	sdcResource.Resource().Attributes().PutStr("k8s.node.name", node)
+	sdcRecord := sdcResource.ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
+	sdcRecord.SetTimestamp(pcommon.NewTimestampFromTime(sdcAt))
+	sdcAttrs := sdcRecord.Attributes()
+	sdcAttrs.PutStr("gen_ai.training.job_id", jobID)
+	sdcAttrs.PutStr("gpu.id", gpuID)
+	sdcAttrs.PutInt("hw.gpu.sdc.delta", 1)
+	sdcAttrs.PutStr("hw.gpu.sdc.kind", "catcher_count")
+
+	// Hand-crafted eval-accuracy record: a 0.02 drop against the
+	// baseline, stamped shortly after the job window closes.
+	evalResource := logs.ResourceLogs().AppendEmpty()
+	evalResource.Resource().Attributes().PutStr("k8s.node.name", node)
+	evalRecord := evalResource.ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
+	evalRecord.SetTimestamp(pcommon.NewTimestampFromTime(evalAt))
+	evalAttrs := evalRecord.Attributes()
+	evalAttrs.PutStr("gen_ai.training.job_id", jobID)
+	evalAttrs.PutDouble("gen_ai.training.eval_accuracy", 0.78)
+	evalAttrs.PutDouble("gen_ai.training.eval_accuracy.baseline", 0.80)
+	evalAttrs.PutInt("gen_ai.training.job.start_unix_nano", jobStart.UnixNano())
+	evalAttrs.PutInt("gen_ai.training.job.end_unix_nano", jobEnd.UnixNano())
+
+	ctx := context.Background()
+	sink := newLogsSink()
+	proc := newProcessor(testSettings(), defaultConfig(), sink)
+	require.NoError(t, proc.Start(ctx, componenttestHost{}))
+	t.Cleanup(func() { _ = proc.Shutdown(context.Background()) })
+	require.NoError(t, proc.ConsumeLogs(ctx, logs))
+
+	verdicts := extractSDCVerdicts(t, sink.at(0))
+	require.Len(t, verdicts, 1)
+	got := verdicts[0]
+	require.Equal(t, patterns.PatternIDSilentDataCorruption, got.PatternID)
+	require.Equal(t, jobID, got.JobID)
+	require.Equal(t, patterns.SDCKindVendorSignaled, got.Kind)
+	require.Equal(t, patterns.ConfidenceFull, got.Confidence)
+	require.Equal(t, gpuID, got.SuspectGPUID)
+	require.Equal(t, node, got.SuspectNode)
+	require.Equal(t, int64(1), got.SDCCounterDelta)
+	require.InDelta(t, 0.02, got.AccuracyDrop, 1e-9)
+
+	// Issue #270: the same scalars are promoted onto the log record.
+	promoted := extractSDCPromotedAttrs(t, sink.at(0))
+	wantStrings := []struct{ key, value string }{
+		{"pattern.confidence", "full"},
+		{"tracecore.alert.silent_data_corruption.kind", "vendor_signaled"},
+		{"tracecore.alert.silent_data_corruption.suspect_gpu_id", gpuID},
+		{"tracecore.alert.silent_data_corruption.suspect_node", node},
+		{"gen_ai.training.job_id", jobID},
+	}
+	for _, want := range wantStrings {
+		require.Equal(t, want.value, promoted[want.key])
+	}
+	require.InDelta(t, 0.02, promoted["tracecore.alert.silent_data_corruption.accuracy_drop"], 1e-9)
+}
+
+// TestPatternDetector_SDCWiringEmitsAccuracyOnlyPartial pins the
+// partial-confidence branch wiring: an eval-accuracy record with a
+// >= 2x-threshold drop and NO same-job SDC counter emits one
+// kind=accuracy_only / confidence=partial verdict. Default
+// emit_partial_verdicts=true preserves visibility.
+func TestPatternDetector_SDCWiringEmitsAccuracyOnlyPartial(t *testing.T) {
+	t.Parallel()
+
+	jobStart := mustParseTime(t, "2026-06-01T08:00:00Z")
+	jobEnd := jobStart.Add(time.Hour)
+	evalAt := jobEnd.Add(time.Minute)
+
+	// A single eval record and no SDC counter record at all.
+	logs := plog.NewLogs()
+	evalRecord := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
+	evalRecord.SetTimestamp(pcommon.NewTimestampFromTime(evalAt))
+	evalAttrs := evalRecord.Attributes()
+	evalAttrs.PutStr("gen_ai.training.job_id", "j")
+	// drop = 0.015, i.e. 3x the default 0.005 threshold.
+	evalAttrs.PutDouble("gen_ai.training.eval_accuracy", 0.785)
+	evalAttrs.PutDouble("gen_ai.training.eval_accuracy.baseline", 0.80)
+	evalAttrs.PutInt("gen_ai.training.job.start_unix_nano", jobStart.UnixNano())
+	evalAttrs.PutInt("gen_ai.training.job.end_unix_nano", jobEnd.UnixNano())
+
+	ctx := context.Background()
+	sink := newLogsSink()
+	proc := newProcessor(testSettings(), defaultConfig(), sink)
+	require.NoError(t, proc.Start(ctx, componenttestHost{}))
+	t.Cleanup(func() { _ = proc.Shutdown(context.Background()) })
+	require.NoError(t, proc.ConsumeLogs(ctx, logs))
+
+	verdicts := extractSDCVerdicts(t, sink.at(0))
+	require.Len(t, verdicts, 1)
+	got := verdicts[0]
+	require.Equal(t, patterns.SDCKindAccuracyOnly, got.Kind)
+	require.Equal(t, patterns.ConfidencePartial, got.Confidence)
+	require.Empty(t, got.SuspectGPUID)
+	require.Equal(t, []string{patterns.EvidenceKindHwGPUSDC}, got.MissingLayers)
+
+	// The partial verdict carries no suspect hardware, so neither
+	// suspect attribute may be present — not even as an empty string.
+	promoted := extractSDCPromotedAttrs(t, sink.at(0))
+	_, gpuStamped := promoted["tracecore.alert.silent_data_corruption.suspect_gpu_id"]
+	require.False(t, gpuStamped, "suspect_gpu_id MUST be omitted on accuracy_only verdict")
+	_, nodeStamped := promoted["tracecore.alert.silent_data_corruption.suspect_node"]
+	require.False(t, nodeStamped, "suspect_node MUST be omitted on accuracy_only verdict")
+}
+
+// TestPatternDetector_SDCWiringSuppressesPartialWhenDisabled covers
+// emit_partial_verdicts=false: the same accuracy-only input that
+// yields a partial verdict by default must yield no verdict once the
+// operator opts out. The vendor_signaled branch is exercised by
+// TestPatternDetector_SDCWiringEmitsVendorSignaledVerdict.
+func TestPatternDetector_SDCWiringSuppressesPartialWhenDisabled(t *testing.T) {
+	t.Parallel()
+
+	jobStart := mustParseTime(t, "2026-06-01T08:00:00Z")
+	jobEnd := jobStart.Add(time.Hour)
+
+	logs := plog.NewLogs()
+	evalRecord := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
+	evalRecord.SetTimestamp(pcommon.NewTimestampFromTime(jobEnd.Add(time.Minute)))
+	evalAttrs := evalRecord.Attributes()
+	evalAttrs.PutStr("gen_ai.training.job_id", "j")
+	evalAttrs.PutDouble("gen_ai.training.eval_accuracy", 0.785)
+	evalAttrs.PutDouble("gen_ai.training.eval_accuracy.baseline", 0.80)
+	evalAttrs.PutInt("gen_ai.training.job.start_unix_nano", jobStart.UnixNano())
+	evalAttrs.PutInt("gen_ai.training.job.end_unix_nano", jobEnd.UnixNano())
+
+	// Operator opt-out of partial-confidence verdicts.
+	emitPartial := false
+	cfg := defaultConfig()
+	cfg.EmitPartialVerdicts = &emitPartial
+
+	ctx := context.Background()
+	sink := newLogsSink()
+	proc := newProcessor(testSettings(), cfg, sink)
+	require.NoError(t, proc.Start(ctx, componenttestHost{}))
+	t.Cleanup(func() { _ = proc.Shutdown(context.Background()) })
+	require.NoError(t, proc.ConsumeLogs(ctx, logs))
+
+	require.Empty(t, extractSDCVerdicts(t, sink.at(0)),
+		"partial-confidence accuracy_only verdict suppressed when emit_partial_verdicts=false")
+}
+
+//
+// TestPatternDetector_SDCWiringNoEvalNoVerdict covers the input
+// gate: a vendor SDC counter record with no eval-accuracy record for
+// the same job must not produce a verdict. The pattern needs the
+// user-visible symptom, because vendor SDC counters also fire on
+// hardware events that do not necessarily corrupt the in-flight
+// job's data (spec §"Edge cases" — hardware-attribution false
+// positive).
+func TestPatternDetector_SDCWiringNoEvalNoVerdict(t *testing.T) {
+	t.Parallel()
+
+	logs := plog.NewLogs()
+	sdcResource := logs.ResourceLogs().AppendEmpty()
+	sdcResource.Resource().Attributes().PutStr("k8s.node.name", "gpu-node-0001")
+	sdcRecord := sdcResource.ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
+	sdcAt := mustParseTime(t, "2026-06-01T08:30:00Z")
+	sdcRecord.SetTimestamp(pcommon.NewTimestampFromTime(sdcAt))
+	sdcAttrs := sdcRecord.Attributes()
+	sdcAttrs.PutStr("gen_ai.training.job_id", "j")
+	sdcAttrs.PutStr("gpu.id", "PCI:0000:3b:00")
+	sdcAttrs.PutInt("hw.gpu.sdc.delta", 1)
+
+	ctx := context.Background()
+	sink := newLogsSink()
+	proc := newProcessor(testSettings(), defaultConfig(), sink)
+	require.NoError(t, proc.Start(ctx, componenttestHost{}))
+	t.Cleanup(func() { _ = proc.Shutdown(context.Background()) })
+	require.NoError(t, proc.ConsumeLogs(ctx, logs))
+
+	require.Empty(t, extractSDCVerdicts(t, sink.at(0)),
+		"vendor SDC counter alone MUST NOT emit a verdict — pattern requires the eval-accuracy regression symptom")
+}
+
+// TestPatternDetector_SDCWiringThresholdConfigurable asserts the
+// processor surfaces the detector's AccuracyDropThreshold knob via
+// YAML config. Mirrors the cuda_oom threshold-configurable wiring
+// test.
+func TestPatternDetector_SDCWiringThresholdConfigurable(t *testing.T) {
+	t.Parallel()
+
+	jobStart := mustParseTime(t, "2026-06-01T08:00:00Z")
+	jobEnd := jobStart.Add(time.Hour)
+	sdcAt := jobStart.Add(30 * time.Minute)
+	evalAt := jobEnd.Add(time.Minute)
+
+	// build returns a fresh batch each time: one in-window SDC counter
+	// record plus one eval record for the same job.
+	build := func() plog.Logs {
+		logs := plog.NewLogs()
+
+		sdcResource := logs.ResourceLogs().AppendEmpty()
+		sdcResource.Resource().Attributes().PutStr("k8s.node.name", "n")
+		sdcRecord := sdcResource.ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
+		sdcRecord.SetTimestamp(pcommon.NewTimestampFromTime(sdcAt))
+		sdcAttrs := sdcRecord.Attributes()
+		sdcAttrs.PutStr("gen_ai.training.job_id", "j")
+		sdcAttrs.PutStr("gpu.id", "PCI:0000:3b:00")
+		sdcAttrs.PutInt("hw.gpu.sdc.delta", 1)
+
+		evalRecord := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
+		evalRecord.SetTimestamp(pcommon.NewTimestampFromTime(evalAt))
+		evalAttrs := evalRecord.Attributes()
+		evalAttrs.PutStr("gen_ai.training.job_id", "j")
+		// drop = 0.003: under the default 0.005, over the tuned 0.001.
+		evalAttrs.PutDouble("gen_ai.training.eval_accuracy", 0.797)
+		evalAttrs.PutDouble("gen_ai.training.eval_accuracy.baseline", 0.80)
+		evalAttrs.PutInt("gen_ai.training.job.start_unix_nano", jobStart.UnixNano())
+		evalAttrs.PutInt("gen_ai.training.job.end_unix_nano", jobEnd.UnixNano())
+		return logs
+	}
+
+	// detect runs one fresh processor with cfg over a fresh batch and
+	// returns the silent_data_corruption verdicts it emitted.
+	detect := func(cfg *Config) []patterns.SilentDataCorruptionVerdict {
+		sink := newLogsSink()
+		proc := newProcessor(testSettings(), cfg, sink)
+		require.NoError(t, proc.Start(context.Background(), componenttestHost{}))
+		t.Cleanup(func() { _ = proc.Shutdown(context.Background()) })
+		require.NoError(t, proc.ConsumeLogs(context.Background(), build()))
+		return extractSDCVerdicts(t, sink.at(0))
+	}
+
+	// Default threshold (0.005): the 0.003 drop stays below it.
+	require.Empty(t, detect(defaultConfig()))
+
+	// Threshold lowered to 0.001: the same drop now fires, and the
+	// in-window SDC counter makes it vendor_signaled.
+	tuned := defaultConfig()
+	tuned.SDCAccuracyDropThreshold = 0.001
+	verdicts := detect(tuned)
+	require.Len(t, verdicts, 1)
+	require.Equal(t, patterns.SDCKindVendorSignaled, verdicts[0].Kind)
+}
+
+// TestConfig_SDCAccuracyDropThresholdValidation covers the Validate
+// guard on sdc_accuracy_drop_threshold: values above 1 or below 0
+// are rejected, a value inside [0, 1] is accepted.
+func TestConfig_SDCAccuracyDropThresholdValidation(t *testing.T) {
+	t.Parallel()
+	tooHigh := &Config{SDCAccuracyDropThreshold: 1.5}
+	require.Error(t, tooHigh.Validate(), "threshold > 1 must reject")
+	negative := &Config{SDCAccuracyDropThreshold: -0.1}
+	require.Error(t, negative.Validate(), "threshold < 0 must reject")
+	inRange := &Config{SDCAccuracyDropThreshold: 0.005}
+	require.NoError(t, inRange.Validate())
+}
+
+// TestConfig_SDCAccuracyOnlyMultiplierValidation covers the Validate
+// guard on sdc_accuracy_only_multiplier: a non-zero value below 1 is
+// rejected, exactly 1 is accepted, and zero is accepted because it
+// selects the default.
+func TestConfig_SDCAccuracyOnlyMultiplierValidation(t *testing.T) {
+	t.Parallel()
+	belowOne := &Config{SDCAccuracyOnlyMultiplier: 0.5}
+	require.Error(t, belowOne.Validate(), "multiplier < 1 must reject")
+	exactlyOne := &Config{SDCAccuracyOnlyMultiplier: 1.0}
+	require.NoError(t, exactlyOne.Validate())
+	unset := &Config{SDCAccuracyOnlyMultiplier: 0}
+	require.NoError(t, unset.Validate(), "zero means default — accepted")
+}
+
+// extractSDCVerdicts walks the output plog.Logs and decodes the
+// silent_data_corruption verdict JSON attribute on each verdict-
+// shaped log record. Mirrors extractCUDAOOMVerdicts.
+func extractSDCVerdicts(t *testing.T, ld plog.Logs) []patterns.SilentDataCorruptionVerdict {
+	t.Helper()
+	// Non-nil even when nothing matches, like the literal it replaces.
+	verdicts := make([]patterns.SilentDataCorruptionVerdict, 0)
+	resources := ld.ResourceLogs()
+	for ri := 0; ri < resources.Len(); ri++ {
+		scopes := resources.At(ri).ScopeLogs()
+		for si := 0; si < scopes.Len(); si++ {
+			scope := scopes.At(si)
+			// Only the processor's own scope carries verdict records.
+			if scope.Scope().Name() != instrumentationScope {
+				continue
+			}
+			records := scope.LogRecords()
+			for li := 0; li < records.Len(); li++ {
+				attrs := records.At(li).Attributes()
+				id, found := attrs.Get(verdictAttrPatternID)
+				if !found || id.AsString() != patterns.PatternIDSilentDataCorruption {
+					continue
+				}
+				payload, found := attrs.Get(verdictAttrVerdictJSON)
+				if !found {
+					continue
+				}
+				var decoded patterns.SilentDataCorruptionVerdict
+				require.NoError(t, json.Unmarshal([]byte(payload.AsString()), &decoded))
+				verdicts = append(verdicts, decoded)
+			}
+		}
+	}
+	return verdicts
+}
+
+// extractSDCPromotedAttrs walks the output plog.Logs and returns the
+// promoted-scalar attributes off the first silent_data_corruption
+// verdict record. Used to assert the issue-#270 scalar-promotion
+// contract.
+func extractSDCPromotedAttrs(t *testing.T, ld plog.Logs) map[string]any {
+	t.Helper()
+	resources := ld.ResourceLogs()
+	for ri := 0; ri < resources.Len(); ri++ {
+		scopes := resources.At(ri).ScopeLogs()
+		for si := 0; si < scopes.Len(); si++ {
+			scope := scopes.At(si)
+			// Only the processor's own scope carries verdict records.
+			if scope.Scope().Name() != instrumentationScope {
+				continue
+			}
+			records := scope.LogRecords()
+			for li := 0; li < records.Len(); li++ {
+				attrs := records.At(li).Attributes()
+				id, found := attrs.Get(verdictAttrPatternID)
+				if !found || id.AsString() != patterns.PatternIDSilentDataCorruption {
+					continue
+				}
+				// First matching verdict: copy its string, double and
+				// int attributes; other value types are left out.
+				promoted := map[string]any{}
+				attrs.Range(func(key string, val pcommon.Value) bool {
+					switch val.Type() {
+					case pcommon.ValueTypeStr:
+						promoted[key] = val.AsString()
+					case pcommon.ValueTypeDouble:
+						promoted[key] = val.Double()
+					case pcommon.ValueTypeInt:
+						promoted[key] = val.Int()
+					}
+					return true
+				})
+				return promoted
+			}
+		}
+	}
+	t.Fatal("no silent_data_corruption verdict record found in output")
+	return nil
+}