Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion module/pkg/patterns/pod_evicted.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,11 @@ func indexNodeConds(recs []NodeRecord) map[string][]NodeRecord {
// with MissingLayers=["node_condition"].
func buildVerdict(ev Record, condIdx map[string][]NodeRecord, window time.Duration) PodEvictedVerdict {
v := PodEvictedVerdict{
PatternID: PatternIDPodEvicted,
PatternID: PatternIDPodEvicted,
PodName: ev.Regarding.Name,
PodNamespace: ev.Regarding.Namespace,
NodeName: ev.ReportingInstance,
EventReason: ev.Reason,
EvidenceTrail: []EvidenceRef{
{
Kind: EvidenceKindPodEvent,
Expand Down
8 changes: 8 additions & 0 deletions module/pkg/patterns/pod_evicted_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ func TestPodEvictedDetector_CanonicalDiskPressure(t *testing.T) {
require.Equal(t, patterns.EvidenceKindPodEvent, v.EvidenceTrail[0].Kind)
require.Equal(t, patterns.EvidenceKindNodeCondition, v.EvidenceTrail[1].Kind)
require.Equal(t, "evict-1", v.EvidenceTrail[0].UID)

// Operator-facing scalars hoisted onto the verdict for issue
// #270 — the patterndetectorprocessor promotes them to top-level
// OTLP log-record attributes.
require.Equal(t, "job-rank-3", v.PodName)
require.Equal(t, "training", v.PodNamespace)
require.Equal(t, "gpu-node-0001", v.NodeName)
require.Equal(t, "Evicted", v.EventReason)
}

// TestPodEvictedDetector_PartialNoNodeCondition asserts the rubric's
Expand Down
16 changes: 16 additions & 0 deletions module/pkg/patterns/testdata/verdict.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,22 @@
"type": "string",
"enum": ["node_condition"]
}
},
"k8s.pod.name": {
"type": "string",
"description": "Evicted Pod's metadata.name. Promoted to a top-level OTLP attribute by the patterndetectorprocessor (issue #270)."
},
"k8s.pod.namespace": {
"type": "string",
"description": "Evicted Pod's metadata.namespace. See k8s.pod.name."
},
"k8s.node.name": {
"type": "string",
"description": "Node the pod was evicted from. See k8s.pod.name."
},
"k8s.event.reason": {
"type": "string",
"description": "Upstream Kubernetes Event Reason (e.g. \"Evicted\"). See k8s.pod.name."
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@
"minLength": 1,
"description": "namespace/name of the evicted pod that correlated with the Xid."
},
"k8s.pod.name": {
"type": "string",
"description": "Evicted Pod's metadata.name (split from evicted_pod). Promoted to a top-level OTLP attribute by the patterndetectorprocessor (issue #270)."
},
"k8s.pod.namespace": {
"type": "string",
"description": "Evicted Pod's metadata.namespace. See k8s.pod.name."
},
"evidence_trail": {
"type": "array",
"minItems": 2,
Expand Down
19 changes: 19 additions & 0 deletions module/pkg/patterns/verdict.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,25 @@ type PodEvictedVerdict struct {
Confidence Confidence `json:"confidence"`
EvidenceTrail []EvidenceRef `json:"evidence_trail"`

// PodName is the evicted Pod's metadata.name. Hoisted onto the
// verdict (issue #270) so the patterndetectorprocessor can promote
// it to a top-level OTLP log-record attribute (k8s.pod.name) —
// dashboards table-aggregate by pod without server-side parsing of
// pattern.verdict_json.
PodName string `json:"k8s.pod.name,omitempty"`

// PodNamespace is the evicted Pod's metadata.namespace. See PodName
// comment.
PodNamespace string `json:"k8s.pod.namespace,omitempty"`

// NodeName is the kubelet's reporting_instance — the node the pod
// was evicted from. See PodName comment.
NodeName string `json:"k8s.node.name,omitempty"`

// EventReason is the upstream Kubernetes Event Reason ("Evicted").
// See PodName comment.
EventReason string `json:"k8s.event.reason,omitempty"`

// MissingLayers names the evidence layers that did not join.
// Empty when Confidence==Full. Populated in causal order:
// "node_condition" appears if no node-pressure transition was
Expand Down
24 changes: 18 additions & 6 deletions module/pkg/patterns/xid_correlation.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ type XidCorrelationVerdict struct {
// verdict emits per evicted pod (not per Xid) so alert routing
// can fan out to per-pod owners.
EvictedPod string `json:"evicted_pod"`

// PodName is the evicted Pod's metadata.name (split from
// EvictedPod for downstream dashboard convenience). Hoisted onto
// the verdict (issue #270) so the patterndetectorprocessor can
// promote it to a top-level OTLP attribute (k8s.pod.name).
PodName string `json:"k8s.pod.name,omitempty"`

// PodNamespace is the evicted Pod's metadata.namespace. See
// PodName comment.
PodNamespace string `json:"k8s.pod.namespace,omitempty"`
}

// XidCorrelationDetector is the xid_correlation pattern detector
Expand Down Expand Up @@ -184,12 +194,14 @@ func buildXidCorrelationVerdict(xid XidRecord, ev Record) XidCorrelationVerdict
pod := displayPodName(ev)
dt := ev.EventTime.Sub(xid.Timestamp)
return XidCorrelationVerdict{
PatternID: PatternIDXidCorrelation,
XidCode: xid.Code,
Node: xid.Node,
EvictedPod: pod,
Headline: xidCorrelationHeadline(xid, pod, dt),
Remediation: xidCorrelationRemediation(xid, pod),
PatternID: PatternIDXidCorrelation,
XidCode: xid.Code,
Node: xid.Node,
EvictedPod: pod,
PodName: ev.Regarding.Name,
PodNamespace: ev.Regarding.Namespace,
Headline: xidCorrelationHeadline(xid, pod, dt),
Remediation: xidCorrelationRemediation(xid, pod),
EvidenceTrail: []EvidenceRef{
{
Kind: EvidenceKindXid,
Expand Down
6 changes: 6 additions & 0 deletions module/pkg/patterns/xid_correlation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ func TestXidCorrelationDetector_PositiveXid79ThenEviction(t *testing.T) {
require.Equal(t, patterns.EvidenceKindXid, v.EvidenceTrail[0].Kind, "Xid surfaces first (causal order)")
require.Equal(t, patterns.EvidenceKindPodEvent, v.EvidenceTrail[1].Kind)
require.Equal(t, "evict-1", v.EvidenceTrail[1].UID)

// Operator-facing scalars (issue #270): pod name/namespace split
// from EvictedPod so the patterndetectorprocessor can promote
// them to top-level OTLP log-record attributes.
require.Equal(t, "job-rank-3", v.PodName)
require.Equal(t, "training", v.PodNamespace)
}

// TestXidCorrelationDetector_NegativeXidNoEviction pins the
Expand Down
4 changes: 4 additions & 0 deletions module/pkg/replay/pod_evicted/canonical/golden.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
"headline": "Pod training/job-rank-3 evicted at 2026-05-18T10:00:00Z due to disk pressure",
"remediation": "On node gpu-node-0001: Free imagefs or relocate the training write path to NVMe; tighten kubelet --eviction-hard nodefs.available.",
"confidence": "full",
"k8s.pod.name": "job-rank-3",
"k8s.pod.namespace": "training",
"k8s.node.name": "gpu-node-0001",
"k8s.event.reason": "Evicted",
"evidence_trail": [
{
"kind": "pod_event",
Expand Down
68 changes: 68 additions & 0 deletions module/processor/patterndetectorprocessor/patterndetector.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,51 @@ const VerdictAttrRemediation = "pattern.remediation"
// alongside the broken-out scalar fields above.
const VerdictAttrVerdictJSON = "pattern.verdict_json"

// Scalar attribute keys stamped directly on verdict log records
// (issue #270), so dashboards and LogQL queries can aggregate and
// filter without parsing pattern.verdict_json on the server side.
// Key names follow the OTel semantic conventions (k8s.*) and the
// recipe-canonical kernelevents.xid / nccl.fr.* vocabulary, meaning
// a query uses the same attribute name on a verdict as on the input
// record it was derived from.
const (
	// --- Kubernetes identity (k8s.*) ---

	// VerdictAttrK8sPodNamespace carries metadata.namespace of the
	// evicted/affected Pod. Set on pod_evicted and xid_correlation
	// verdicts, next to VerdictAttrK8sPodName.
	VerdictAttrK8sPodNamespace = "k8s.pod.namespace"

	// VerdictAttrK8sPodName carries metadata.name of the
	// evicted/affected Pod. Set on pod_evicted and xid_correlation
	// verdicts.
	VerdictAttrK8sPodName = "k8s.pod.name"

	// VerdictAttrK8sNodeName carries the node the eviction or Xid
	// was attributed to. Set on pod_evicted and xid_correlation
	// verdicts.
	VerdictAttrK8sNodeName = "k8s.node.name"

	// VerdictAttrK8sEventReason carries the upstream Kubernetes
	// Event Reason (for example "Evicted"). Set on pod_evicted
	// verdicts only.
	VerdictAttrK8sEventReason = "k8s.event.reason"

	// --- NVIDIA driver events (kernelevents.*) ---

	// VerdictAttrKernelXid carries the NVIDIA driver Xid event code
	// behind the correlation. Set on xid_correlation verdicts.
	VerdictAttrKernelXid = "kernelevents.xid"

	// --- NCCL flight recorder (nccl.fr.*) ---

	// VerdictAttrNCCLPgID carries the id of the NCCL process group
	// the hang is scoped to. Set on nccl_hang verdicts.
	VerdictAttrNCCLPgID = "nccl.fr.pg_id"

	// VerdictAttrNCCLCollectiveSeqID carries the collective_seq_id
	// on which the hanging ranks are blocked. Set on nccl_hang
	// verdicts.
	VerdictAttrNCCLCollectiveSeqID = "nccl.fr.collective_seq_id"

	// VerdictAttrNCCLHangingRanksCount carries the cohort size,
	// i.e. len(HangingRanks). Exposed as a plain number so hang
	// severity can be graphed without decoding the rank list.
	VerdictAttrNCCLHangingRanksCount = "nccl.fr.hanging_ranks_count"
)

// patterndetectorProcessor implements processor.Logs. Pure data-path
// component — the underlying detector library is stateless across
// calls.
Expand Down Expand Up @@ -382,13 +427,29 @@ func appendVerdict(ld plog.Logs, v patterns.PodEvictedVerdict, logger *zap.Logge
lr.Attributes().PutStr(VerdictAttrConfidence, string(v.Confidence))
lr.Attributes().PutStr(VerdictAttrHeadline, v.Headline)
lr.Attributes().PutStr(VerdictAttrRemediation, v.Remediation)
putStrIfSet(lr.Attributes(), VerdictAttrK8sPodName, v.PodName)
putStrIfSet(lr.Attributes(), VerdictAttrK8sPodNamespace, v.PodNamespace)
putStrIfSet(lr.Attributes(), VerdictAttrK8sNodeName, v.NodeName)
putStrIfSet(lr.Attributes(), VerdictAttrK8sEventReason, v.EventReason)
if b, err := json.Marshal(v); err == nil {
lr.Attributes().PutStr(VerdictAttrVerdictJSON, string(b))
} else if logger != nil {
logger.Warn("patterndetector: failed to marshal verdict JSON; broken-out attrs still emit", zap.Error(err))
}
}

// putStrIfSet stores value under key in attrs, but only when value
// is non-empty. An optional scalar that was never populated (say, a
// malformed event whose evicted pod carries no namespace) therefore
// leaves the attribute absent instead of present-but-empty.
// Dashboards filter on attribute presence, and an empty-string
// attribute would silently match empty-filter queries.
func putStrIfSet(attrs pcommon.Map, key, value string) {
	if value != "" {
		attrs.PutStr(key, value)
	}
}

func evidenceTimestamp(v patterns.PodEvictedVerdict) time.Time {
if len(v.EvidenceTrail) > 0 {
return v.EvidenceTrail[0].Timestamp
Expand Down Expand Up @@ -416,6 +477,9 @@ func appendNCCLHangVerdict(ld plog.Logs, v patterns.NCCLHangVerdict, logger *zap
lr.Attributes().PutStr(VerdictAttrPatternID, v.PatternID)
lr.Attributes().PutStr(VerdictAttrHeadline, v.Headline)
lr.Attributes().PutStr(VerdictAttrRemediation, v.Remediation)
lr.Attributes().PutInt(VerdictAttrNCCLPgID, v.PgID)
lr.Attributes().PutInt(VerdictAttrNCCLCollectiveSeqID, v.CollectiveSeqID)
lr.Attributes().PutInt(VerdictAttrNCCLHangingRanksCount, int64(len(v.HangingRanks)))
if b, err := json.Marshal(v); err == nil {
lr.Attributes().PutStr(VerdictAttrVerdictJSON, string(b))
} else if logger != nil {
Expand Down Expand Up @@ -451,6 +515,10 @@ func appendXidCorrelationVerdict(ld plog.Logs, v patterns.XidCorrelationVerdict,
lr.Attributes().PutStr(VerdictAttrPatternID, v.PatternID)
lr.Attributes().PutStr(VerdictAttrHeadline, v.Headline)
lr.Attributes().PutStr(VerdictAttrRemediation, v.Remediation)
lr.Attributes().PutInt(VerdictAttrKernelXid, int64(v.XidCode))
putStrIfSet(lr.Attributes(), VerdictAttrK8sNodeName, v.Node)
putStrIfSet(lr.Attributes(), VerdictAttrK8sPodName, v.PodName)
putStrIfSet(lr.Attributes(), VerdictAttrK8sPodNamespace, v.PodNamespace)
if b, err := json.Marshal(v); err == nil {
lr.Attributes().PutStr(VerdictAttrVerdictJSON, string(b))
} else if logger != nil {
Expand Down
Loading