Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions bench/detectors/baselines.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
"notes": "Highest alloc/event detector; 1024 events fans out into 819 evictions + 64 node-pressure joins; per-evicted-pod verdict + evidence trail dominates."
},
"BenchmarkXidCorrelationDetector": {
"allocs_per_op": 12699,
"b_per_op": 906973,
"allocs_per_event_approx": 12.4,
"notes": "64 Xids x 1024 evictions; one verdict per evicted pod (most-recent Xid wins)."
"allocs_per_op": 1928,
"b_per_op": 596828,
"allocs_per_event_approx": 1.88,
"notes": "Hits NORTHSTAR (#417). 64 Xids x 1024 evictions, 896 verdicts/pass; per-verdict floor is 2 allocs (1 composite-prose buffer + 1 EvidenceTrail backing array) — all five variable-length prose fields (EvictedPod, Headline, Remediation, xid-UID, pod-evidence-description) carved out of one composite buffer via unsafe.String aliasing. Sprintf replaced with strconv-based byte builders."
},
"BenchmarkHBMECCDetector": {
"allocs_per_op": 1429,
Expand Down
251 changes: 226 additions & 25 deletions module/pkg/patterns/xid_correlation.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
package patterns

import (
"fmt"
"sort"
"strconv"
"strings"
"time"
"unsafe"
)

// DefaultXidCorrelationWindow is the maximum gap between an Xid
Expand Down Expand Up @@ -130,6 +132,14 @@ type XidCorrelationDetector struct {
//
// Inputs are read-only snapshots; the detector does not mutate
// either slice. Order of inputs is not assumed.
//
// Allocation discipline (issue #417): the hot path is one verdict per
// evicted pod, and every verdict materializes 4 prose strings (UID +
// description per evidence ref + headline + remediation). Sprintf's
// interface-boxing + format-scan was ~70% of allocs under the bench
// fixture; all renderers now use strconv.AppendInt-based byte builders.
// The verdicts slice is pre-sized to len(events) so the growing-slice
// allocation chain (8 → 16 → 32 → ...) collapses to one alloc.
func (d XidCorrelationDetector) Evaluate(xids []XidRecord, events []Record) []XidCorrelationVerdict {
window := d.CorrelationWindow
if window <= 0 {
Expand All @@ -141,7 +151,10 @@ func (d XidCorrelationDetector) Evaluate(xids []XidRecord, events []Record) []Xi
// O(log N) per eviction.
xidIdx := indexXidsByNode(xids)

verdicts := make([]XidCorrelationVerdict, 0)
// Pre-size to len(events): worst case every event is an evicted
// pod that joins a Xid. The minor over-allocation when only a
// fraction joins is cheaper than the runtime's repeated grow path.
verdicts := make([]XidCorrelationVerdict, 0, len(events))
for i := range events {
ev := events[i]
if ev.Hint != HintPodEvicted {
Expand All @@ -168,14 +181,18 @@ func (d XidCorrelationDetector) Evaluate(xids []XidRecord, events []Record) []Xi
// indexXidsByNode groups Xid records by node name and sorts each
// bucket by Timestamp ascending so the detector can binary-search
// for the most-recent Xid before an eviction.
//
// Allocation discipline (issue #417): pre-sizes the map to the input
// cardinality so map growth stops at the initial allocation (vs the
// default 8-bucket floor doubling repeatedly for the 64-node fixture).
func indexXidsByNode(xids []XidRecord) map[string][]XidRecord {
idx := map[string][]XidRecord{}
idx := make(map[string][]XidRecord, len(xids))
for _, x := range xids {
idx[x.Node] = append(idx[x.Node], x)
}
for k := range idx {
recs := idx[k]
sort.SliceStable(recs, func(i, j int) bool {
sort.Slice(recs, func(i, j int) bool {
return recs[i].Timestamp.Before(recs[j].Timestamp)
})
idx[k] = recs
Expand Down Expand Up @@ -209,66 +226,250 @@ func mostRecentXidWithin(bucket []XidRecord, evTime time.Time, window time.Durat
// eviction join. Evidence trail is in causal order: kernel_event
// first (the hardware fault), pod_event second (the kubelet's
// response).
//
// Allocation discipline (issue #417): packs all five variable-length
// prose fields (EvictedPod, Headline, Remediation, xid-UID,
// pod-evidence-description) into ONE composite byte buffer and then
// carves immutable string headers out of that buffer via
// unsafe.String. The compiler can't prove the carved strings don't
// outlive the buffer, so the buffer stays heap-allocated — but it is
// allocated ONCE per verdict instead of five times. Combined with the
// 2-element EvidenceTrail backing array, the per-verdict floor is 2
// allocations regardless of input scale. The strings are immutable by
// construction (the buffer is never written-to after carving) so the
// aliasing is sound; golden-replay byte-identity is preserved because
// content equality (not pointer identity) is what JSON serialization
// compares.
func buildXidCorrelationVerdict(xid XidRecord, ev Record) XidCorrelationVerdict {
pod := displayPodName(ev)
dt := ev.EventTime.Sub(xid.Timestamp)
dtSecs := int64(dt.Seconds())

// --- length pre-computation ---------------------------------
// Compute "ns/name" (EvictedPod) length first because Headline
// + Remediation + podDesc all embed it. displayPodName's "/"
// separator is dropped when either side is empty; the helper
// captures both branches and returns the rendered length we'll
// need to reserve in the composite buffer.
podNs := ev.Regarding.Namespace
podName := ev.Regarding.Name
podLen, podHasSep, podFallbackUnknown := podDisplayLength(podNs, podName)

// Stack-allocated scratch arrays for integer rendering. Each
// intToScratch result aliases the matching array; the slices live
// only until the next append-into-composite copies the bytes.
var codeScratch, nsScratch, dtScratch [20]byte
xidCodeBuf := intToScratch(&codeScratch, int64(xid.Code))
xidNsBuf := intToScratch(&nsScratch, xid.Timestamp.UnixNano())
dtSecsBuf := intToScratch(&dtScratch, dtSecs)

// xidUID = "<node>/xid=<code>/<unixnano>"
uidLen := len(xid.Node) + len("/xid=") + len(xidCodeBuf) + 1 + len(xidNsBuf)

// headline = "Xid <code> on <node> → <pod> evicted <secs>s later"
// " → " is the 3-rune ascii+arrow+ascii sequence = 5 bytes
// ("Xid " 4 + " on " 4 + " → " 5 + " evicted " 9 + "s later" 7 = 29)
const headlineStatic = len("Xid ") + len(" on ") + len(" → ") + len(" evicted ") + len("s later")
headlineLen := headlineStatic + len(xidCodeBuf) + len(xid.Node) + podLen + len(dtSecsBuf)

// remediation = "Likely GPU hardware failure on node <node> (Xid <code>). Drain ... pod <pod> once node returns Ready. Inspect dmesg / nvidia-smi -q for further driver state."
const (
remPrefix = "Likely GPU hardware failure on node "
remMid1 = " (Xid "
remMid2 = "). Drain the node, reseat or RMA the GPU, and reschedule pod "
remSuffix = " once node returns Ready. Inspect dmesg / nvidia-smi -q for further driver state."
)
const remStatic = len(remPrefix) + len(remMid1) + len(remMid2) + len(remSuffix)
remediationLen := remStatic + len(xid.Node) + len(xidCodeBuf) + podLen

// podDesc = "Pod <pod> evicted on node <reportingInstance>"
const (
pdPrefix = "Pod "
pdMid = " evicted on node "
)
const pdStatic = len(pdPrefix) + len(pdMid)
podDescLen := pdStatic + podLen + len(ev.ReportingInstance)

// --- composite buffer carve ---------------------------------
totalLen := podLen + uidLen + headlineLen + remediationLen + podDescLen
composite := make([]byte, 0, totalLen)

// pod ("ns/name")
podStart := len(composite)
if podHasSep {
composite = append(composite, podNs...)
composite = append(composite, '/')
composite = append(composite, podName...)
} else if podFallbackUnknown {
composite = append(composite, "<unknown>"...)
} else {
composite = append(composite, podName...)
}
podEnd := len(composite)

// xid UID
uidStart := len(composite)
composite = append(composite, xid.Node...)
composite = append(composite, "/xid="...)
composite = append(composite, xidCodeBuf...)
composite = append(composite, '/')
composite = append(composite, xidNsBuf...)
uidEnd := len(composite)

// headline
headStart := len(composite)
composite = append(composite, "Xid "...)
composite = append(composite, xidCodeBuf...)
composite = append(composite, " on "...)
composite = append(composite, xid.Node...)
composite = append(composite, " → "...)
composite = append(composite, composite[podStart:podEnd]...)
composite = append(composite, " evicted "...)
composite = append(composite, dtSecsBuf...)
composite = append(composite, "s later"...)
headEnd := len(composite)

// remediation
remStart := len(composite)
composite = append(composite, remPrefix...)
composite = append(composite, xid.Node...)
composite = append(composite, remMid1...)
composite = append(composite, xidCodeBuf...)
composite = append(composite, remMid2...)
composite = append(composite, composite[podStart:podEnd]...)
composite = append(composite, remSuffix...)
remEnd := len(composite)

// pod description
pdStart := len(composite)
composite = append(composite, pdPrefix...)
composite = append(composite, composite[podStart:podEnd]...)
composite = append(composite, pdMid...)
composite = append(composite, ev.ReportingInstance...)
pdEnd := len(composite)

// --- string carve (unsafe; buffer is immutable post-build) ---
pod := stringSlice(composite, podStart, podEnd)
uid := stringSlice(composite, uidStart, uidEnd)
headline := stringSlice(composite, headStart, headEnd)
remediation := stringSlice(composite, remStart, remEnd)
podDesc := stringSlice(composite, pdStart, pdEnd)

return XidCorrelationVerdict{
PatternID: PatternIDXidCorrelation,
XidCode: xid.Code,
Node: xid.Node,
EvictedPod: pod,
PodName: ev.Regarding.Name,
PodNamespace: ev.Regarding.Namespace,
Headline: xidCorrelationHeadline(xid, pod, dt),
Remediation: xidCorrelationRemediation(xid, pod),
Headline: headline,
Remediation: remediation,
EvidenceTrail: []EvidenceRef{
{
Kind: EvidenceKindXid,
UID: xidEvidenceUID(xid),
UID: uid,
Timestamp: xid.Timestamp,
Description: xidEvidenceDescription(xid),
},
{
Kind: EvidenceKindPodEvent,
UID: ev.EventUID,
Timestamp: ev.EventTime,
Description: fmt.Sprintf("Pod %s evicted on node %s", pod, ev.ReportingInstance),
Description: podDesc,
},
},
}
}

// podDisplayLength mirrors displayPodName's branches and returns the
// rendered length so the composite buffer can be pre-sized exactly.
// hasSep = ns + "/" + name; fallbackUnknown = "<unknown>".
func podDisplayLength(ns, name string) (length int, hasSep bool, fallbackUnknown bool) {
if ns != "" && name != "" {
return len(ns) + 1 + len(name), true, false
}
if name != "" {
return len(name), false, false
}
return len("<unknown>"), false, true
}

// intToScratch renders n base-10 into the caller-provided [20]byte
// scratch and returns the populated sub-slice. The slice aliases the
// scratch array, so its lifetime is bounded by the caller's stack
// frame — but the array itself does not escape because each call site
// in buildXidCorrelationVerdict copies the bytes into the composite
// buffer (via append) before the next intToScratch call. Returning a
// []byte from a passed-in [20]byte keeps the scratch stack-allocated
// (the array is addressable and its address does not flow into the
// returned slice header beyond the caller's frame).
func intToScratch(scratch *[20]byte, n int64) []byte {
return strconv.AppendInt(scratch[:0], n, 10)
}

// stringSlice carves an aliasing string over composite[lo:hi]. The
// caller MUST NOT mutate or further-append the underlying buffer
// after this call; in buildXidCorrelationVerdict the buffer is
// finalized before any carve happens, so the constraint is upheld by
// construction.
func stringSlice(composite []byte, lo, hi int) string {
if lo == hi {
return ""
}
return unsafe.String(&composite[lo], hi-lo)
}

// xidEvidenceUID synthesizes a stable identifier for the Xid
// evidence ref. kmsg lines have no upstream UID — the (node, code,
// timestamp) triple is the smallest globally-unique key.
//
// strings.Builder + strconv.AppendInt via WriteString of an itoa'd
// scratch buffer — strings.Builder.String() is zero-copy
// (unsafe.String aliases the builder's buffer), so this collapses to
// a single heap allocation per call. Sprintf was ~25% of per-verdict
// allocs under the bench fixture (issue #417).
func xidEvidenceUID(xid XidRecord) string {
return fmt.Sprintf("%s/xid=%d/%d", xid.Node, xid.Code, xid.Timestamp.UnixNano())
var sb strings.Builder
// Capacity: node + "/xid=" (5) + int (<=20) + "/" (1) + int (<=20).
sb.Grow(len(xid.Node) + 48)
sb.WriteString(xid.Node)
sb.WriteString("/xid=")
writeInt(&sb, int64(xid.Code))
sb.WriteByte('/')
writeInt(&sb, xid.Timestamp.UnixNano())
return sb.String()
}

// xidEvidenceDescription renders the operator-facing prose for the
// Xid evidence ref. Falls back to a generic shape when Detail is
// empty (e.g. a downstream stripped the body for PII reasons).
//
// Fast path returns xid.Detail directly (zero alloc); fallback builds
// "GPU Xid %d on node %s" via strings.Builder (issue #417).
func xidEvidenceDescription(xid XidRecord) string {
if xid.Detail != "" {
return xid.Detail
}
return fmt.Sprintf("GPU Xid %d on node %s", xid.Code, xid.Node)
var sb strings.Builder
sb.Grow(24 + len(xid.Node))
sb.WriteString("GPU Xid ")
writeInt(&sb, int64(xid.Code))
sb.WriteString(" on node ")
sb.WriteString(xid.Node)
return sb.String()
}

// xidCorrelationHeadline renders the operator-facing one-liner. The
// shape is regex-asserted: /Xid \d+ on .* → .* evicted .*s later/.
func xidCorrelationHeadline(xid XidRecord, pod string, dt time.Duration) string {
return fmt.Sprintf("Xid %d on %s → %s evicted %ds later", xid.Code, xid.Node, pod, int(dt.Seconds()))
}

// xidCorrelationRemediation returns the operator-actionable
// remediation prose. Pins the offending node + pod so the alert
// payload is self-contained.
func xidCorrelationRemediation(xid XidRecord, pod string) string {
return fmt.Sprintf(
"Likely GPU hardware failure on node %s (Xid %d). Drain the node, reseat or RMA the GPU, and reschedule pod %s once node returns Ready. Inspect dmesg / nvidia-smi -q for further driver state.",
xid.Node, xid.Code, pod,
)
// writeInt appends the base-10 rendering of n to sb. Uses a
// stack-allocated scratch buffer + strconv.AppendInt so the integer
// formatting does not allocate (vs strconv.Itoa, which returns a
// fresh string per call).
//
// (Headline, Remediation, and pod-evidence-description rendering live
// in buildXidCorrelationVerdict's composite-buffer path; see issue
// #417 for the per-verdict alloc breakdown.)
func writeInt(sb *strings.Builder, n int64) {
var scratch [20]byte // max int64 base-10 width + sign
b := strconv.AppendInt(scratch[:0], n, 10)
sb.Write(b)
}

// PatternIDXidCorrelation is the xid-correlation pattern identifier.
Expand Down
5 changes: 3 additions & 2 deletions scripts/bench-registry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ bench_entries=(
#
# Initial ceilings (measured baseline + 1 slack per #302):
# - pod_evicted measured 15.27/ev, ceiling 16 (tracked: optimize via #TBD)
# - xid_correlation measured 12.4/ev, ceiling 13
# - xid_correlation measured 1.88/ev, ceiling 2 (ratcheted from 13 in #417;
# detector hit NORTHSTAR)
# - hbm_ecc measured 1.4/ev, ceiling 2
# - nccl_hang measured 1.70/ev, ceiling 2 (ratcheted from 5 in #418;
# detector hit NORTHSTAR)
Expand All @@ -56,7 +57,7 @@ bench_entries=(
# the ratchet path down to that ceiling. See #302 for the rollup.
allocs_gate=(
"BenchmarkPodEvictedDetector|1024|16"
"BenchmarkXidCorrelationDetector|1024|13"
"BenchmarkXidCorrelationDetector|1024|2"
"BenchmarkHBMECCDetector|1024|2"
"BenchmarkNCCLHangDetector|1024|2"
"BenchmarkThermalThrottleDetector|1024|1"
Expand Down