diff --git a/module/pkg/patterns/checkpointer_hang.go b/module/pkg/patterns/checkpointer_hang.go index 4fb20477..25d11313 100644 --- a/module/pkg/patterns/checkpointer_hang.go +++ b/module/pkg/patterns/checkpointer_hang.go @@ -279,6 +279,7 @@ func (v CheckpointerHangVerdict) Common() VerdictCommon { Remediation: v.Remediation, EvidenceTrail: v.EvidenceTrail, Kind: "checkpointer_hang", + Confidence: v.Confidence, } } diff --git a/module/pkg/patterns/cuda_oom.go b/module/pkg/patterns/cuda_oom.go index 0413770f..20bdcac7 100644 --- a/module/pkg/patterns/cuda_oom.go +++ b/module/pkg/patterns/cuda_oom.go @@ -204,6 +204,7 @@ func (v CUDAOOMVerdict) Common() VerdictCommon { Remediation: v.Remediation, EvidenceTrail: v.EvidenceTrail, Kind: "cuda_oom", + Confidence: v.Confidence, } } diff --git a/module/pkg/patterns/dataloader_hang.go b/module/pkg/patterns/dataloader_hang.go index f2f54bf7..06f706bb 100644 --- a/module/pkg/patterns/dataloader_hang.go +++ b/module/pkg/patterns/dataloader_hang.go @@ -193,6 +193,7 @@ func (v DataLoaderHangVerdict) Common() VerdictCommon { Remediation: v.Remediation, EvidenceTrail: v.EvidenceTrail, Kind: "dataloader_hang", + Confidence: v.Confidence, } } diff --git a/module/pkg/patterns/ib_link_flap.go b/module/pkg/patterns/ib_link_flap.go index 197c5054..d47f3412 100644 --- a/module/pkg/patterns/ib_link_flap.go +++ b/module/pkg/patterns/ib_link_flap.go @@ -142,6 +142,7 @@ func (v IBLinkFlapVerdict) Common() VerdictCommon { Remediation: v.Remediation, EvidenceTrail: v.EvidenceTrail, Kind: "ib_link_flap", + Confidence: v.Confidence, } } diff --git a/module/pkg/patterns/nccl_bootstrap.go b/module/pkg/patterns/nccl_bootstrap.go index 74130ee5..02ca0eef 100644 --- a/module/pkg/patterns/nccl_bootstrap.go +++ b/module/pkg/patterns/nccl_bootstrap.go @@ -232,6 +232,7 @@ func (v NCCLBootstrapTimeoutVerdict) Common() VerdictCommon { Remediation: v.Remediation, EvidenceTrail: v.EvidenceTrail, Kind: "nccl_bootstrap", + Confidence: v.Confidence, } } diff --git a/module/pkg/patterns/silent_data_corruption.go b/module/pkg/patterns/silent_data_corruption.go index 1e8c0cfe..7986353c 100644 --- a/module/pkg/patterns/silent_data_corruption.go +++ b/module/pkg/patterns/silent_data_corruption.go @@ -225,6 +225,7 @@ func (v SilentDataCorruptionVerdict) Common() VerdictCommon { Remediation: v.Remediation, EvidenceTrail: v.EvidenceTrail, Kind: "silent_data_corruption", + Confidence: v.Confidence, } } diff --git a/module/pkg/patterns/verdict.go b/module/pkg/patterns/verdict.go index f21a3761..fd786aeb 100644 --- a/module/pkg/patterns/verdict.go +++ b/module/pkg/patterns/verdict.go @@ -47,6 +47,14 @@ type VerdictCommon struct { Remediation string EvidenceTrail []EvidenceRef Kind string + + // Confidence is the verdict's evidence-completeness label. Hoisted + // onto VerdictCommon so the processor's generic emit path can + // apply partial-gating uniformly (instead of carving per-verdict + // wrappers around the for-range loop). Verdict types that don't + // track confidence leave this as the zero value (empty string) — + // emitAll preserves the legacy empty-label telemetry tick for them. + Confidence Confidence } // Verdict attribute keys owned by the producer (patterns package) so @@ -212,6 +220,7 @@ func (v PodEvictedVerdict) Common() VerdictCommon { Remediation: v.Remediation, EvidenceTrail: v.EvidenceTrail, Kind: "pod_evicted", + Confidence: v.Confidence, } } diff --git a/module/processor/patterndetectorprocessor/patterndetector.go b/module/processor/patterndetectorprocessor/patterndetector.go index f8d51c8c..95bbed77 100644 --- a/module/processor/patterndetectorprocessor/patterndetector.go +++ b/module/processor/patterndetectorprocessor/patterndetector.go @@ -267,45 +267,30 @@ func (p *patterndetectorProcessor) ConsumeLogs(ctx context.Context, ld plog.Logs return nil } -// emitAll is the shared loop body for runners whose detector does -// not emit partial-confidence verdicts. Appends each verdict and -// ticks the IncVerdict counter with the legacy empty-string -// confidence label (preserves pre-registry telemetry vocabulary). +// emitAll is the shared loop body for every registered detector. +// Reads the verdict's Confidence via VerdictCommon (hoisted in +// module/pkg/patterns/verdict.go) so the gating + telemetry-label +// behavior previously carved into emitPodEvicted / emitIBLinkFlap +// applies uniformly: +// +// - Verdicts with Confidence == ConfidencePartial are dropped +// unless the operator opt-in (cfg.emitPartialEnabled) is set — +// mirrors the per-detector run* helpers' gating contract. +// - IncVerdict is ticked with string(Confidence). For the five +// emitAll-path detectors that don't track confidence (NCCLHang, +// XidCorrelation, HBMECC, ThermalThrottle, PCIeAER) Common() +// returns the zero-value Confidence ("") — byte-identical to +// the legacy empty-label tick pinned by +// TestNoopFallback_IncVerdict_TableDriven. func emitAll[V verdictAttrer](p *patterndetectorProcessor, ld plog.Logs, verdicts []V) { - for _, v := range verdicts { - appendVerdict(ld, v, p.logger()) - p.telemetry.IncVerdict(v.Common().PatternID, "") - } -} - -// emitPodEvicted gates partial verdicts behind the operator opt-in -// and stamps the real confidence label on IncVerdict. Carved out -// (rather than a generic gated emitter) because Go generics cannot -// abstract over a struct field — PodEvictedVerdict.Confidence and -// IBLinkFlapVerdict.Confidence are field reads, not interface -// methods, and adding a Confidence() accessor would clash with the -// existing exported field name on the same type. -func emitPodEvicted(p *patterndetectorProcessor, ld plog.Logs, verdicts []patterns.PodEvictedVerdict) { - emitPartial := p.cfg.emitPartialEnabled() - for _, v := range verdicts { - if v.Confidence == patterns.ConfidencePartial && !emitPartial { - continue - } - appendVerdict(ld, v, p.logger()) - p.telemetry.IncVerdict(v.PatternID, string(v.Confidence)) - } -} - -// emitIBLinkFlap mirrors emitPodEvicted for the IBLinkFlap detector -// — same gating + telemetry shape; see emitPodEvicted comment. -func emitIBLinkFlap(p *patterndetectorProcessor, ld plog.Logs, verdicts []patterns.IBLinkFlapVerdict) { emitPartial := p.cfg.emitPartialEnabled() for _, v := range verdicts { - if v.Confidence == patterns.ConfidencePartial && !emitPartial { + c := v.Common() + if c.Confidence == patterns.ConfidencePartial && !emitPartial { continue } appendVerdict(ld, v, p.logger()) - p.telemetry.IncVerdict(v.PatternID, string(v.Confidence)) + p.telemetry.IncVerdict(c.PatternID, string(c.Confidence)) } } @@ -330,7 +315,7 @@ type detectorRunner func(p *patterndetectorProcessor, ld plog.Logs, in projected var detectorRunners = []detectorRunner{ func(p *patterndetectorProcessor, ld plog.Logs, in projectedInputs) { det := patterns.PodEvictedDetector{JoinWindow: p.cfg.JoinWindow} - emitPodEvicted(p, ld, det.Evaluate(in.events, in.nodeConds)) + emitAll(p, ld, det.Evaluate(in.events, in.nodeConds)) }, func(p *patterndetectorProcessor, ld plog.Logs, in projectedInputs) { det := patterns.NCCLHangDetector{HangThreshold: p.cfg.NCCLHangThreshold} @@ -367,7 +352,7 @@ var detectorRunners = []detectorRunner{ CorrelationWindow: p.cfg.IBLinkFlapWindow, MinTransitions: p.cfg.IBLinkFlapMinTransitions, } - emitIBLinkFlap(p, ld, det.Evaluate(in.ibs, in.nccl)) + emitAll(p, ld, det.Evaluate(in.ibs, in.nccl)) }, // Wrapped runners — pattern-specific projections live inside // each runXxxDetector helper (they walk plog.Logs themselves