Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions module/pkg/patterns/testdata/xid_correlation_verdict.schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://tracecore.ai/schemas/patterns/xid_correlation_verdict/v0",
"title": "XidCorrelationVerdict",
"description": "v0.3.0 NORTHSTAR pattern #3 (xid_correlation) verdict shape. Pinned by TestXidCorrelationVerdict_SchemaConformance.",
"type": "object",
"required": ["pattern.id", "headline", "remediation", "evidence_trail", "xid_code", "node", "evicted_pod"],
"additionalProperties": false,
"properties": {
"pattern.id": {
"type": "string",
"const": "16",
"description": "xid_correlation pattern identifier; string-typed numeric, uniform across pattern detectors."
},
"headline": {
"type": "string",
"minLength": 1
},
"remediation": {
"type": "string",
"minLength": 1
},
"xid_code": {
"type": "integer",
"description": "NVIDIA driver Xid event code (e.g. 79 = GPU fallen off the bus)."
},
"node": {
"type": "string",
"minLength": 1,
"description": "Kubernetes node name where both the Xid and the eviction occurred."
},
"evicted_pod": {
"type": "string",
"minLength": 1,
"description": "namespace/name of the evicted pod that correlated with the Xid."
},
"evidence_trail": {
"type": "array",
"minItems": 2,
"items": {
"type": "object",
"required": ["kind", "uid", "timestamp", "description"],
"additionalProperties": false,
"properties": {
"kind": {
"type": "string",
"enum": ["kernel_event", "pod_event"]
},
"uid": {
"type": "string",
"minLength": 1
},
"timestamp": {
"type": "string",
"format": "date-time"
},
"description": {
"type": "string",
"minLength": 1
}
}
}
}
}
}
4 changes: 2 additions & 2 deletions module/pkg/patterns/verdict.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ const (
// bottom as the operator-facing timeline.
type EvidenceRef struct {
// Kind names the receiver-side surface the evidence came from.
// Today's vocabulary: "pod_event", "node_condition". M17/M18 will
// extend with "kernel_event", "nccl_fr", "kineto", "pyspy".
// Today's vocabulary: "pod_event", "node_condition", "nccl_fr",
// "kernel_event". Future extensions: "kineto", "pyspy".
Kind string `json:"kind"`

// UID is the upstream identifier — for Kubernetes Events
Expand Down
252 changes: 252 additions & 0 deletions module/pkg/patterns/xid_correlation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
// SPDX-License-Identifier: Apache-2.0

package patterns

import (
"fmt"
"sort"
"time"
)

// DefaultXidCorrelationWindow bounds how long after an Xid kernel
// event a pod eviction on the same node may arrive and still be
// joined to that Xid by the detector. One minute (60s) reflects the
// kubelet's typical reaction to a GPU-driven NotReady transition:
// the driver-side Xid fires immediately on the hardware fault, the
// node-status loop flips NotReady within ~10-20s, and the
// eviction-manager begins evicting pods within tens of seconds after
// that. Operators on long-tail drain paths (eviction-soft,
// controller-managed graceful drain) raise the bound via
// XidCorrelationDetector.CorrelationWindow.
const DefaultXidCorrelationWindow = time.Minute

// XidRecord is the typed projection of one GPU Xid kernel event. The
// patterndetectorprocessor builds these from log records carrying
// the customer-stable `kernelevents.xid` attribute (per RFC-0013 §3
// and the journald-kernel recipe); the detector reads XidRecord
// values directly — no plog grep — so a schema rename in the OTTL
// recipe surfaces as a compile error.
//
// Within this file, Node is the grouping key used by
// indexXidsByNode and Timestamp is the per-node sort key that
// mostRecentXidWithin binary-searches on.
type XidRecord struct {
	// Code is the NVIDIA driver Xid event code. 79 ("GPU has fallen
	// off the bus") is the canonical hardware-failure trigger; the
	// full enumeration is documented in the NVIDIA driver release
	// notes. The detector doesn't gate on Code value — any Xid +
	// downstream eviction is operator-relevant.
	Code int `json:"code"`

	// Node is the Kubernetes node name the Xid fired on. Stamped by
	// the k8sattributes processor on the DaemonSet's resource
	// attributes (`k8s.node.name`); the patterndetectorprocessor
	// hoists it onto each XidRecord. Evaluate looks records up by
	// this value using the eviction record's ReportingInstance, so
	// the two must match exactly for a join to happen.
	Node string `json:"node"`

	// Timestamp is the kernel log record's wall-clock time — the
	// moment the GPU driver emitted the Xid line. It also feeds the
	// synthesized evidence UID (see xidEvidenceUID).
	Timestamp time.Time `json:"timestamp"`

	// Detail is the raw kernel log line body, e.g. "NVRM: Xid (PCI:
	// 0000:3b:00): 79, GPU has fallen off the bus". Rendered into
	// the verdict's evidence-trail description. Omitted from JSON
	// when empty; xidEvidenceDescription substitutes a generic
	// "GPU Xid <code> on node <node>" string in that case.
	Detail string `json:"detail,omitempty"`
}

// XidCorrelationVerdict is the v0.3.0 NORTHSTAR pattern #3 output.
// JSON field names follow the verdict.schema.json snake-case
// convention. Distinct shape from PodEvictedVerdict (no confidence)
// because the pattern's emission rule is "both layers joined or no
// verdict" — partial verdicts would be Xid-without-eviction, which
// operators already see via the raw kernelevents.xid telemetry.
type XidCorrelationVerdict struct {
	// PatternID identifies the emitting pattern.
	// buildXidCorrelationVerdict always sets it to
	// PatternIDXidCorrelation.
	PatternID string `json:"pattern.id"`

	// Headline is the operator-facing one-liner rendered by
	// xidCorrelationHeadline.
	Headline string `json:"headline"`

	// Remediation is the operator-actionable prose rendered by
	// xidCorrelationRemediation.
	Remediation string `json:"remediation"`

	// EvidenceTrail lists the joined evidence in causal order. As
	// built by buildXidCorrelationVerdict it holds exactly two
	// entries: index 0 is the kernel_event (the Xid) and index 1 is
	// the pod_event (the eviction). Evaluate's output sort reads
	// index 1, so that ordering is load-bearing.
	EvidenceTrail []EvidenceRef `json:"evidence_trail"`

	// XidCode is the NVIDIA driver Xid code that triggered the
	// correlation. Load-bearing for operator triage — pasted into
	// NVIDIA driver-bug lookups.
	XidCode int `json:"xid_code"`

	// Node is the Kubernetes node where both the Xid and the
	// eviction occurred. Pinned on the verdict so the alert payload
	// is self-contained. Populated from the Xid record's Node.
	Node string `json:"node"`

	// EvictedPod is the namespace/name of the evicted pod. One
	// verdict emits per evicted pod (not per Xid) so alert routing
	// can fan out to per-pod owners.
	EvictedPod string `json:"evicted_pod"`
}

// XidCorrelationDetector is the xid_correlation pattern detector
// (NORTHSTAR pattern #3 in the v0.3.0 ladder). Zero-value usage is
// permitted — CorrelationWindow defaults to
// DefaultXidCorrelationWindow.
type XidCorrelationDetector struct {
	// CorrelationWindow is the maximum (eviction.EventTime -
	// xid.Timestamp) for a same-node Xid → eviction pair to join.
	// The bound is inclusive: a gap exactly equal to the window
	// still joins. Zero or any negative value means use
	// DefaultXidCorrelationWindow.
	CorrelationWindow time.Duration
}

// Evaluate joins Xid kernel events to pod evictions and returns one
// XidCorrelationVerdict for every eviction that has a same-node Xid
// at or before its EventTime and no more than CorrelationWindow
// earlier (causality flows forward). If several Xids on the node
// qualify, the MOST RECENT one is cited as the proximate cause.
//
// Only records whose Hint is HintPodEvicted are considered; the
// eviction's ReportingInstance is matched against XidRecord.Node.
//
// The result is ordered by (eviction EventTime ascending, EventUID
// ascending) so it is deterministic for golden tests. It is always a
// non-nil slice, possibly empty.
//
// Both inputs are treated as read-only snapshots and may arrive in
// any order.
func (d XidCorrelationDetector) Evaluate(xids []XidRecord, events []Record) []XidCorrelationVerdict {
	// A non-positive CorrelationWindow means "unset", which keeps the
	// zero-value detector usable.
	window := DefaultXidCorrelationWindow
	if d.CorrelationWindow > 0 {
		window = d.CorrelationWindow
	}

	// Per-node buckets sorted by Timestamp let each eviction find
	// its most recent preceding Xid with a binary search.
	byNode := indexXidsByNode(xids)

	out := make([]XidCorrelationVerdict, 0)
	for _, ev := range events {
		if ev.Hint == HintPodEvicted {
			cause, found := mostRecentXidWithin(byNode[ev.ReportingInstance], ev.EventTime, window)
			if found {
				out = append(out, buildXidCorrelationVerdict(cause, ev))
			}
		}
	}

	// EvidenceTrail[1] is the pod_event entry, carrying the
	// eviction's EventTime and EventUID.
	sort.SliceStable(out, func(a, b int) bool {
		left := out[a].EvidenceTrail[1]
		right := out[b].EvidenceTrail[1]
		if left.Timestamp.Equal(right.Timestamp) {
			return left.UID < right.UID
		}
		return left.Timestamp.Before(right.Timestamp)
	})
	return out
}

// indexXidsByNode buckets the Xid records by their Node and orders
// every bucket by ascending Timestamp, which is the layout
// mostRecentXidWithin needs for its binary search. The sort is
// stable, so records with equal timestamps keep their input order.
// The input slice itself is not reordered.
func indexXidsByNode(xids []XidRecord) map[string][]XidRecord {
	byNode := make(map[string][]XidRecord)
	for i := range xids {
		node := xids[i].Node
		byNode[node] = append(byNode[node], xids[i])
	}
	// Sorting through the ranged slice header reorders the bucket's
	// backing array in place, so the map entry sees the result.
	for _, bucket := range byNode {
		sort.SliceStable(bucket, func(a, b int) bool {
			return bucket[a].Timestamp.Before(bucket[b].Timestamp)
		})
	}
	return byNode
}

// mostRecentXidWithin picks, from a bucket already sorted by
// ascending Timestamp, the latest XidRecord whose Timestamp is
// <= evTime and >= evTime-window. It reports false when the bucket
// is empty, when every record is later than evTime, or when the
// latest preceding record is older than the window. Mirrors
// mostRecentConditionWithin's shape on the pod_evicted side.
func mostRecentXidWithin(bucket []XidRecord, evTime time.Time, window time.Duration) (XidRecord, bool) {
	// firstAfter is the index of the first record strictly after
	// evTime; everything before it is <= evTime. On an empty bucket
	// the search yields 0, which the guard below already rejects.
	firstAfter := sort.Search(len(bucket), func(k int) bool {
		return bucket[k].Timestamp.After(evTime)
	})
	if firstAfter > 0 {
		latest := bucket[firstAfter-1]
		if gap := evTime.Sub(latest.Timestamp); gap <= window {
			return latest, true
		}
	}
	return XidRecord{}, false
}

// buildXidCorrelationVerdict assembles the verdict for a single
// Xid → eviction join. The evidence trail is laid out in causal
// order: the kernel_event (hardware fault) at index 0, followed by
// the pod_event (the kubelet's response) at index 1.
func buildXidCorrelationVerdict(xid XidRecord, ev Record) XidCorrelationVerdict {
	podName := displayPodName(ev)

	kernelRef := EvidenceRef{
		Kind:        EvidenceKindXid,
		UID:         xidEvidenceUID(xid),
		Timestamp:   xid.Timestamp,
		Description: xidEvidenceDescription(xid),
	}
	podRef := EvidenceRef{
		Kind:        EvidenceKindPodEvent,
		UID:         ev.EventUID,
		Timestamp:   ev.EventTime,
		Description: fmt.Sprintf("Pod %s evicted on node %s", podName, ev.ReportingInstance),
	}

	return XidCorrelationVerdict{
		PatternID:     PatternIDXidCorrelation,
		Headline:      xidCorrelationHeadline(xid, podName, ev.EventTime.Sub(xid.Timestamp)),
		Remediation:   xidCorrelationRemediation(xid, podName),
		EvidenceTrail: []EvidenceRef{kernelRef, podRef},
		XidCode:       xid.Code,
		Node:          xid.Node,
		EvictedPod:    podName,
	}
}

// xidEvidenceUID derives a stable identifier for the Xid evidence
// ref. kmsg lines carry no upstream UID, so the (node, code,
// timestamp) triple — rendered as "<node>/xid=<code>/<unix-nanos>" —
// serves as the smallest globally-unique key.
func xidEvidenceUID(xid XidRecord) string {
	nanos := xid.Timestamp.UnixNano()
	return fmt.Sprintf("%s/xid=%d/%d", xid.Node, xid.Code, nanos)
}

// xidEvidenceDescription produces the operator-facing text for the
// Xid evidence ref: the raw kernel line from Detail when present,
// otherwise a generic sentence built from the code and node (e.g.
// when a downstream stripped the body for PII reasons).
func xidEvidenceDescription(xid XidRecord) string {
	if xid.Detail == "" {
		return fmt.Sprintf("GPU Xid %d on node %s", xid.Code, xid.Node)
	}
	return xid.Detail
}

// xidCorrelationHeadline builds the operator-facing one-liner. Its
// shape is regex-asserted: /Xid \d+ on .* → .* evicted .*s later/.
// The elapsed time is reported in whole seconds, truncated toward
// zero.
func xidCorrelationHeadline(xid XidRecord, pod string, dt time.Duration) string {
	elapsedSeconds := int(dt.Seconds())
	return fmt.Sprintf("Xid %d on %s → %s evicted %ds later", xid.Code, xid.Node, pod, elapsedSeconds)
}

// xidCorrelationRemediation renders the operator-actionable
// remediation text. It names the offending node, Xid code, and pod
// so the alert payload is self-contained.
func xidCorrelationRemediation(xid XidRecord, pod string) string {
	const advice = "Likely GPU hardware failure on node %s (Xid %d). Drain the node, reseat or RMA the GPU, and reschedule pod %s once node returns Ready. Inspect dmesg / nvidia-smi -q for further driver state."
	return fmt.Sprintf(advice, xid.Node, xid.Code, pod)
}

// Wire-level identifiers stamped onto xid_correlation verdicts.
const (
	// PatternIDXidCorrelation is the xid-correlation pattern
	// identifier. String-typed numeric, uniform across pattern
	// detectors. Unique vs PatternIDPodEvicted ("14") and
	// PatternIDNCCLHang ("15").
	PatternIDXidCorrelation = "16"

	// EvidenceKindXid names the GPU-Xid kernel-event surface.
	// Hoisted alongside EvidenceKindPodEvent /
	// EvidenceKindNodeCondition / EvidenceKindNCCLFR. The
	// "kernel_event" wire value matches the vocabulary documented
	// on verdict.go's EvidenceRef.Kind comment.
	EvidenceKindXid = "kernel_event"
)
Loading
Loading