Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions module/pkg/patterns/checkpointer_hang.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ func (v CheckpointerHangVerdict) Common() VerdictCommon {
Remediation: v.Remediation,
EvidenceTrail: v.EvidenceTrail,
Kind: "checkpointer_hang",
Confidence: v.Confidence,
}
}

Expand Down
1 change: 1 addition & 0 deletions module/pkg/patterns/cuda_oom.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ func (v CUDAOOMVerdict) Common() VerdictCommon {
Remediation: v.Remediation,
EvidenceTrail: v.EvidenceTrail,
Kind: "cuda_oom",
Confidence: v.Confidence,
}
}

Expand Down
1 change: 1 addition & 0 deletions module/pkg/patterns/dataloader_hang.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ func (v DataLoaderHangVerdict) Common() VerdictCommon {
Remediation: v.Remediation,
EvidenceTrail: v.EvidenceTrail,
Kind: "dataloader_hang",
Confidence: v.Confidence,
}
}

Expand Down
1 change: 1 addition & 0 deletions module/pkg/patterns/ib_link_flap.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ func (v IBLinkFlapVerdict) Common() VerdictCommon {
Remediation: v.Remediation,
EvidenceTrail: v.EvidenceTrail,
Kind: "ib_link_flap",
Confidence: v.Confidence,
}
}

Expand Down
1 change: 1 addition & 0 deletions module/pkg/patterns/nccl_bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ func (v NCCLBootstrapTimeoutVerdict) Common() VerdictCommon {
Remediation: v.Remediation,
EvidenceTrail: v.EvidenceTrail,
Kind: "nccl_bootstrap",
Confidence: v.Confidence,
}
}

Expand Down
1 change: 1 addition & 0 deletions module/pkg/patterns/silent_data_corruption.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ func (v SilentDataCorruptionVerdict) Common() VerdictCommon {
Remediation: v.Remediation,
EvidenceTrail: v.EvidenceTrail,
Kind: "silent_data_corruption",
Confidence: v.Confidence,
}
}

Expand Down
9 changes: 9 additions & 0 deletions module/pkg/patterns/verdict.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ type VerdictCommon struct {
Remediation string
EvidenceTrail []EvidenceRef
Kind string

// Confidence is the verdict's evidence-completeness label. Hoisted
// onto VerdictCommon so the processor's generic emit path can
// apply partial-gating uniformly (instead of carving per-verdict
// wrappers around the for-range loop). Verdict types that don't
// track confidence leave this as the zero value (empty string) —
// emitAll preserves the legacy empty-label telemetry tick for them.
Confidence Confidence
}

// Verdict attribute keys owned by the producer (patterns package) so
Expand Down Expand Up @@ -212,6 +220,7 @@ func (v PodEvictedVerdict) Common() VerdictCommon {
Remediation: v.Remediation,
EvidenceTrail: v.EvidenceTrail,
Kind: "pod_evicted",
Confidence: v.Confidence,
}
}

Expand Down
55 changes: 20 additions & 35 deletions module/processor/patterndetectorprocessor/patterndetector.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,45 +267,30 @@ func (p *patterndetectorProcessor) ConsumeLogs(ctx context.Context, ld plog.Logs
return nil
}

// emitAll is the shared loop body for runners whose detector does
// not emit partial-confidence verdicts. Appends each verdict and
// ticks the IncVerdict counter with the legacy empty-string
// confidence label (preserves pre-registry telemetry vocabulary).
// emitAll is the shared loop body for every registered detector.
// Reads the verdict's Confidence via VerdictCommon (hoisted in
// module/pkg/patterns/verdict.go) so the gating + telemetry-label
// behavior previously carved into emitPodEvicted / emitIBLinkFlap
// applies uniformly:
//
// - Verdicts with Confidence == ConfidencePartial are dropped
// unless the operator opt-in (cfg.emitPartialEnabled) is set —
// mirrors the per-detector run* helpers' gating contract.
// - IncVerdict is ticked with string(Confidence). For the five
// emitAll-path detectors that don't track confidence (NCCLHang,
// XidCorrelation, HBMECC, ThermalThrottle, PCIeAER), Common()
// returns the zero-value Confidence (""), so the emitted label
// stays byte-identical to the legacy empty-label tick pinned by
// TestNoopFallback_IncVerdict_TableDriven.
func emitAll[V verdictAttrer](p *patterndetectorProcessor, ld plog.Logs, verdicts []V) {
for _, v := range verdicts {
appendVerdict(ld, v, p.logger())
p.telemetry.IncVerdict(v.Common().PatternID, "")
}
}

// emitPodEvicted gates partial verdicts behind the operator opt-in
// and stamps the real confidence label on IncVerdict. Carved out
// (rather than a generic gated emitter) because Go generics cannot
// abstract over a struct field — PodEvictedVerdict.Confidence and
// IBLinkFlapVerdict.Confidence are field reads, not interface
// methods, and adding a Confidence() accessor would clash with the
// existing exported field name on the same type.
func emitPodEvicted(p *patterndetectorProcessor, ld plog.Logs, verdicts []patterns.PodEvictedVerdict) {
emitPartial := p.cfg.emitPartialEnabled()
for _, v := range verdicts {
if v.Confidence == patterns.ConfidencePartial && !emitPartial {
continue
}
appendVerdict(ld, v, p.logger())
p.telemetry.IncVerdict(v.PatternID, string(v.Confidence))
}
}

// emitIBLinkFlap mirrors emitPodEvicted for the IBLinkFlap detector
// — same gating + telemetry shape; see emitPodEvicted comment.
func emitIBLinkFlap(p *patterndetectorProcessor, ld plog.Logs, verdicts []patterns.IBLinkFlapVerdict) {
emitPartial := p.cfg.emitPartialEnabled()
for _, v := range verdicts {
if v.Confidence == patterns.ConfidencePartial && !emitPartial {
c := v.Common()
if c.Confidence == patterns.ConfidencePartial && !emitPartial {
continue
}
appendVerdict(ld, v, p.logger())
p.telemetry.IncVerdict(v.PatternID, string(v.Confidence))
p.telemetry.IncVerdict(c.PatternID, string(c.Confidence))
}
}

Expand All @@ -330,7 +315,7 @@ type detectorRunner func(p *patterndetectorProcessor, ld plog.Logs, in projected
var detectorRunners = []detectorRunner{
func(p *patterndetectorProcessor, ld plog.Logs, in projectedInputs) {
det := patterns.PodEvictedDetector{JoinWindow: p.cfg.JoinWindow}
emitPodEvicted(p, ld, det.Evaluate(in.events, in.nodeConds))
emitAll(p, ld, det.Evaluate(in.events, in.nodeConds))
},
func(p *patterndetectorProcessor, ld plog.Logs, in projectedInputs) {
det := patterns.NCCLHangDetector{HangThreshold: p.cfg.NCCLHangThreshold}
Expand Down Expand Up @@ -367,7 +352,7 @@ var detectorRunners = []detectorRunner{
CorrelationWindow: p.cfg.IBLinkFlapWindow,
MinTransitions: p.cfg.IBLinkFlapMinTransitions,
}
emitIBLinkFlap(p, ld, det.Evaluate(in.ibs, in.nccl))
emitAll(p, ld, det.Evaluate(in.ibs, in.nccl))
},
// Wrapped runners — pattern-specific projections live inside
// each runXxxDetector helper (they walk plog.Logs themselves
Expand Down