Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions module/pkg/patterns/detector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// SPDX-License-Identifier: Apache-2.0

package patterns

// Detector is the smallest contract shared by every shipped pattern
// detector: it exposes identity and nothing else.
//
// Evaluate is deliberately absent. Each detector's Evaluate is
// intrinsically heterogeneous (different input record shapes,
// different verdict types), so a uniform Evaluate method here would
// force a lossy any-typed contract — the kind the typed test suite
// has been working to keep out of the shipped patterns.
//
// The companion Registered slice below is the single source of truth
// for "every shipped pattern detector": a new pattern is added by
// extending Registered, not by editing the processor fanout site.
//
// Consumers (chiefly the patterndetectorprocessor) range over
// Registered for shape-uniform work — counting, telemetry-label
// enumeration, drift tests — and keep binding the typed Evaluate
// calls at their concrete-runner sites. The seam is intentionally
// conservative; docs/rfcs/0013-distro-first-pivot.md covers the
// wider pattern-library evolution.
type Detector interface {
	// PatternID returns the detector's stable pattern ID: a number
	// encoded as a string, equal to the matching PatternID* constant
	// and to the on-the-wire `pattern.id` attribute carried by every
	// emitted verdict log record.
	PatternID() string
}

// Registered lists every shipped pattern detector. Its order is the
// registration order, which is also the iteration order. Tests pin
// the complete set, so dropping an entry by accident surfaces
// immediately.
//
// To add a pattern: implement the detector struct and its verdict in
// a sibling file, give the detector a PatternID() method that returns
// the new PatternID* constant, and add a pointer to a zero-value
// detector to this list. The pin test in detector_test.go enforces
// both the count and the set of IDs.
var Registered = []Detector{
	&PodEvictedDetector{},
	&NCCLHangDetector{},
	&XidCorrelationDetector{},
	&HBMECCDetector{},
	&ThermalThrottleDetector{},
	&PCIeAERDetector{},
	&IBLinkFlapDetector{},
	&CUDAOOMDetector{},
	&CheckpointerHangDetector{},
	&SilentDataCorruptionDetector{},
	&NCCLBootstrapDetector{},
	&DataLoaderHangDetector{},
}

// PatternID method implementations. They are co-located here (not
// next to each *Detector struct) because they are one-line constant
// returns: keeping them in one file makes the registry contract
// auditable at a glance, and keeps the registry wiring for a new
// detector (its PatternID method plus its Registered entry) in this
// one file instead of splitting it between the struct file and here.
// The constants themselves (PatternID*) still live next to each
// detector's verdict type for the verdict-emitting hot path.
//
// Each method returns the stable string-typed numeric pattern ID
// matching the corresponding PatternID* constant and the on-the-wire
// `pattern.id` attribute on every emitted verdict log record.

// PatternID implements Detector for *PodEvictedDetector by returning
// PatternIDPodEvicted. The receiver is never read.
func (*PodEvictedDetector) PatternID() string {
	return PatternIDPodEvicted
}

// PatternID implements Detector for *NCCLHangDetector by returning
// PatternIDNCCLHang. The receiver is never read.
func (*NCCLHangDetector) PatternID() string {
	return PatternIDNCCLHang
}

// PatternID implements Detector for *XidCorrelationDetector by
// returning PatternIDXidCorrelation. The receiver is never read.
func (*XidCorrelationDetector) PatternID() string {
	return PatternIDXidCorrelation
}

// PatternID implements Detector for *HBMECCDetector by returning
// PatternIDHBMECC. The receiver is never read.
func (*HBMECCDetector) PatternID() string {
	return PatternIDHBMECC
}

// PatternID implements Detector for *ThermalThrottleDetector by
// returning PatternIDThermalThrottle. The receiver is never read.
func (*ThermalThrottleDetector) PatternID() string {
	return PatternIDThermalThrottle
}

// PatternID implements Detector for *PCIeAERDetector by returning
// PatternIDPCIeAER. The receiver is never read.
func (*PCIeAERDetector) PatternID() string {
	return PatternIDPCIeAER
}

// PatternID implements Detector for *IBLinkFlapDetector by returning
// PatternIDIBLinkFlap. The receiver is never read.
func (*IBLinkFlapDetector) PatternID() string {
	return PatternIDIBLinkFlap
}

// PatternID implements Detector for *CUDAOOMDetector by returning
// PatternIDCUDAOOM. The receiver is never read.
func (*CUDAOOMDetector) PatternID() string {
	return PatternIDCUDAOOM
}

// PatternID implements Detector for *CheckpointerHangDetector by
// returning PatternIDCheckpointerHang. The receiver is never read.
func (*CheckpointerHangDetector) PatternID() string {
	return PatternIDCheckpointerHang
}

// PatternID implements Detector for *SilentDataCorruptionDetector by
// returning PatternIDSilentDataCorruption. The receiver is never read.
func (*SilentDataCorruptionDetector) PatternID() string {
	return PatternIDSilentDataCorruption
}

// PatternID implements Detector for *NCCLBootstrapDetector by
// returning PatternIDNCCLBootstrap. The receiver is never read.
func (*NCCLBootstrapDetector) PatternID() string {
	return PatternIDNCCLBootstrap
}

// PatternID implements Detector for *DataLoaderHangDetector by
// returning PatternIDDataLoaderHang. The receiver is never read.
func (*DataLoaderHangDetector) PatternID() string {
	return PatternIDDataLoaderHang
}
91 changes: 91 additions & 0 deletions module/pkg/patterns/detector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// SPDX-License-Identifier: Apache-2.0

package patterns

import (
"sort"
"testing"
)

// TestRegistered_PinsAllPatterns is the registry drift gate. It
// asserts the exact count and exact set of PatternIDs in Registered.
// Adding a new pattern requires updating this list — that's the point.
// Catches the silent-drop failure mode: a refactor that accidentally
// removes a detector from Registered would otherwise silently stop
// running it.
//
// Both slices are sorted before comparison, so the check has
// set-equality semantics — the declaration order in detector.go is
// meaningful for iteration but not load-bearing for "is every pattern
// present".
func TestRegistered_PinsAllPatterns(t *testing.T) {
	want := []string{
		PatternIDCUDAOOM,
		PatternIDCheckpointerHang,
		PatternIDDataLoaderHang,
		PatternIDHBMECC,
		PatternIDIBLinkFlap,
		PatternIDNCCLBootstrap,
		PatternIDNCCLHang,
		PatternIDPCIeAER,
		PatternIDPodEvicted,
		PatternIDSilentDataCorruption,
		PatternIDThermalThrottle,
		PatternIDXidCorrelation,
	}
	sort.Strings(want)

	if got, wantN := len(Registered), len(want); got != wantN {
		t.Fatalf("Registered: len=%d, want %d (adding a pattern? update want[] above too)", got, wantN)
	}

	got := make([]string, 0, len(Registered))
	for i, d := range Registered {
		if d == nil {
			t.Errorf("Registered[%d] = nil; every entry must be a non-nil detector pointer", i)
			continue
		}
		got = append(got, d.PatternID())
	}

	// Skipped nil entries leave got shorter than want. Stop here so the
	// element-wise comparison below cannot index past the end of got
	// and turn a clean failure into an index-out-of-range panic.
	if len(got) != len(want) {
		t.Fatalf("Registered: %d non-nil detectors, want %d; non-nil PatternIDs: %v", len(got), len(want), got)
	}
	sort.Strings(got)

	for i := range want {
		if got[i] != want[i] {
			// Report the full sets once; per-index diffs add only noise.
			t.Errorf("Registered sorted PatternID set mismatch:\n got: %v\n want: %v", got, want)
			return
		}
	}
}

// TestRegistered_UniquePatternIDs fails when two entries of Registered
// report the same PatternID() — the usual result of copy-pasting a
// method and leaving the wrong constant in place. Such a duplicate
// would silently double-count in any future per-PatternID telemetry
// aggregation. Nil entries are skipped.
func TestRegistered_UniquePatternIDs(t *testing.T) {
	// Maps each ID to the index of the most recent entry that returned it.
	indexByID := make(map[string]int, len(Registered))
	for idx := 0; idx < len(Registered); idx++ {
		det := Registered[idx]
		if det == nil {
			continue
		}
		id := det.PatternID()
		earlier, dup := indexByID[id]
		if dup {
			t.Errorf("Registered[%d].PatternID()=%q collides with Registered[%d]", idx, id, earlier)
		}
		indexByID[id] = idx
	}
}

// TestRegistered_NonEmptyPatternIDs pins the contract that PatternID()
// returns its constant rather than the zero string. A new detector
// whose method body was never wired up would return "" and still
// satisfy the count check, so it is caught here instead. Nil entries
// are skipped.
func TestRegistered_NonEmptyPatternIDs(t *testing.T) {
	for idx := range Registered {
		det := Registered[idx]
		if det == nil || det.PatternID() != "" {
			continue
		}
		t.Errorf("Registered[%d].PatternID() = empty; wire it to the PatternID* constant", idx)
	}
}
Loading
Loading