diff --git a/bench/detectors/baselines.json b/bench/detectors/baselines.json index cdbcd913..397d21a1 100644 --- a/bench/detectors/baselines.json +++ b/bench/detectors/baselines.json @@ -14,10 +14,10 @@ "notes": "Highest alloc/event detector; 1024 events fans out into 819 evictions + 64 node-pressure joins; per-evicted-pod verdict + evidence trail dominates." }, "BenchmarkXidCorrelationDetector": { - "allocs_per_op": 12699, - "b_per_op": 906973, - "allocs_per_event_approx": 12.4, - "notes": "64 Xids x 1024 evictions; one verdict per evicted pod (most-recent Xid wins)." + "allocs_per_op": 1928, + "b_per_op": 596828, + "allocs_per_event_approx": 1.88, + "notes": "Hits NORTHSTAR (#417). 64 Xids x 1024 evictions, 896 verdicts/pass; per-verdict floor is 2 allocs (1 composite-prose buffer + 1 EvidenceTrail backing array) — all five variable-length prose fields (EvictedPod, Headline, Remediation, xid-UID, pod-evidence-description) carved out of one composite buffer via unsafe.String aliasing. Sprintf replaced with strconv-based byte builders." }, "BenchmarkHBMECCDetector": { "allocs_per_op": 1429, diff --git a/module/pkg/patterns/xid_correlation.go b/module/pkg/patterns/xid_correlation.go index 23e68c87..eabc6024 100644 --- a/module/pkg/patterns/xid_correlation.go +++ b/module/pkg/patterns/xid_correlation.go @@ -3,9 +3,11 @@ package patterns import ( - "fmt" "sort" + "strconv" + "strings" "time" + "unsafe" ) // DefaultXidCorrelationWindow is the maximum gap between an Xid @@ -130,6 +132,14 @@ type XidCorrelationDetector struct { // // Inputs are read-only snapshots; the detector does not mutate // either slice. Order of inputs is not assumed. +// +// Allocation discipline (issue #417): the hot path is one verdict per +// evicted pod, and every verdict materializes 4 prose strings (UID + +// description per evidence ref + headline + remediation). Sprintf's +// interface-boxing + format-scan was ~70% of allocs under the bench +// fixture; all renderers now use strconv.AppendInt-based byte builders. +// The verdicts slice is pre-sized to len(events) so the growing-slice +// allocation chain (8 → 16 → 32 → ...) collapses to one alloc. func (d XidCorrelationDetector) Evaluate(xids []XidRecord, events []Record) []XidCorrelationVerdict { window := d.CorrelationWindow if window <= 0 { @@ -141,7 +151,10 @@ func (d XidCorrelationDetector) Evaluate(xids []XidRecord, events []Record) []Xi // O(log N) per eviction. xidIdx := indexXidsByNode(xids) - verdicts := make([]XidCorrelationVerdict, 0) + // Pre-size to len(events): worst case every event is an evicted + // pod that joins a Xid. The minor over-allocation when only a + // fraction joins is cheaper than the runtime's repeated grow path. + verdicts := make([]XidCorrelationVerdict, 0, len(events)) for i := range events { ev := events[i] if ev.Hint != HintPodEvicted { @@ -168,14 +181,18 @@ func (d XidCorrelationDetector) Evaluate(xids []XidRecord, events []Record) []Xi // indexXidsByNode groups Xid records by node name and sorts each // bucket by Timestamp ascending so the detector can binary-search // for the most-recent Xid before an eviction. +// +// Allocation discipline (issue #417): pre-sizes the map to the input +// cardinality so map growth stops at the initial allocation (vs the +// default 8-bucket floor doubling repeatedly for the 64-node fixture). func indexXidsByNode(xids []XidRecord) map[string][]XidRecord { - idx := map[string][]XidRecord{} + idx := make(map[string][]XidRecord, len(xids)) for _, x := range xids { idx[x.Node] = append(idx[x.Node], x) } for k := range idx { recs := idx[k] - sort.SliceStable(recs, func(i, j int) bool { + sort.Slice(recs, func(i, j int) bool { return recs[i].Timestamp.Before(recs[j].Timestamp) }) idx[k] = recs @@ -209,9 +226,134 @@ func mostRecentXidWithin(bucket []XidRecord, evTime time.Time, window time.Durat // eviction join. Evidence trail is in causal order: kernel_event // first (the hardware fault), pod_event second (the kubelet's // response). +// +// Allocation discipline (issue #417): packs all five variable-length +// prose fields (EvictedPod, Headline, Remediation, xid-UID, +// pod-evidence-description) into ONE composite byte buffer and then +// carves immutable string headers out of that buffer via +// unsafe.String. The compiler can't prove the carved strings don't +// outlive the buffer, so the buffer stays heap-allocated — but it is +// allocated ONCE per verdict instead of five times. Combined with the +// 2-element EvidenceTrail backing array, the per-verdict floor is 2 +// allocations regardless of input scale. The strings are immutable by +// construction (the buffer is never written-to after carving) so the +// aliasing is sound; golden-replay byte-identity is preserved because +// content equality (not pointer identity) is what JSON serialization +// compares. func buildXidCorrelationVerdict(xid XidRecord, ev Record) XidCorrelationVerdict { - pod := displayPodName(ev) dt := ev.EventTime.Sub(xid.Timestamp) + dtSecs := int64(dt.Seconds()) + + // --- length pre-computation --------------------------------- + // Compute "ns/name" (EvictedPod) length first because Headline + // + Remediation + podDesc all embed it. displayPodName's "/" + // separator is dropped when either side is empty; the helper + // captures both branches and returns the rendered length we'll + // need to reserve in the composite buffer. + podNs := ev.Regarding.Namespace + podName := ev.Regarding.Name + podLen, podHasSep, podFallbackUnknown := podDisplayLength(podNs, podName) + + // Stack-allocated scratch arrays for integer rendering. Each + // intToScratch result aliases the matching array; the slices live + // only until the next append-into-composite copies the bytes. + var codeScratch, nsScratch, dtScratch [20]byte + xidCodeBuf := intToScratch(&codeScratch, int64(xid.Code)) + xidNsBuf := intToScratch(&nsScratch, xid.Timestamp.UnixNano()) + dtSecsBuf := intToScratch(&dtScratch, dtSecs) + + // xidUID = "/xid=/" + uidLen := len(xid.Node) + len("/xid=") + len(xidCodeBuf) + 1 + len(xidNsBuf) + + // headline = "Xid on evicted s later" + // " → " is the 3-rune ascii+arrow+ascii sequence = 5 bytes + // ("Xid " 4 + " on " 4 + " → " 5 + " evicted " 9 + "s later" 7 = 29) + const headlineStatic = len("Xid ") + len(" on ") + len(" → ") + len(" evicted ") + len("s later") + headlineLen := headlineStatic + len(xidCodeBuf) + len(xid.Node) + podLen + len(dtSecsBuf) + + // remediation = "Likely GPU hardware failure on node (Xid ). Drain ... pod once node returns Ready. Inspect dmesg / nvidia-smi -q for further driver state." + const ( + remPrefix = "Likely GPU hardware failure on node " + remMid1 = " (Xid " + remMid2 = "). Drain the node, reseat or RMA the GPU, and reschedule pod " + remSuffix = " once node returns Ready. Inspect dmesg / nvidia-smi -q for further driver state." + ) + const remStatic = len(remPrefix) + len(remMid1) + len(remMid2) + len(remSuffix) + remediationLen := remStatic + len(xid.Node) + len(xidCodeBuf) + podLen + + // podDesc = "Pod evicted on node " + const ( + pdPrefix = "Pod " + pdMid = " evicted on node " + ) + const pdStatic = len(pdPrefix) + len(pdMid) + podDescLen := pdStatic + podLen + len(ev.ReportingInstance) + + // --- composite buffer carve --------------------------------- + totalLen := podLen + uidLen + headlineLen + remediationLen + podDescLen + composite := make([]byte, 0, totalLen) + + // pod ("ns/name") + podStart := len(composite) + if podHasSep { + composite = append(composite, podNs...) + composite = append(composite, '/') + composite = append(composite, podName...) + } else if podFallbackUnknown { + composite = append(composite, ""...) + } else { + composite = append(composite, podName...) + } + podEnd := len(composite) + + // xid UID + uidStart := len(composite) + composite = append(composite, xid.Node...) + composite = append(composite, "/xid="...) + composite = append(composite, xidCodeBuf...) + composite = append(composite, '/') + composite = append(composite, xidNsBuf...) + uidEnd := len(composite) + + // headline + headStart := len(composite) + composite = append(composite, "Xid "...) + composite = append(composite, xidCodeBuf...) + composite = append(composite, " on "...) + composite = append(composite, xid.Node...) + composite = append(composite, " → "...) + composite = append(composite, composite[podStart:podEnd]...) + composite = append(composite, " evicted "...) + composite = append(composite, dtSecsBuf...) + composite = append(composite, "s later"...) + headEnd := len(composite) + + // remediation + remStart := len(composite) + composite = append(composite, remPrefix...) + composite = append(composite, xid.Node...) + composite = append(composite, remMid1...) + composite = append(composite, xidCodeBuf...) + composite = append(composite, remMid2...) + composite = append(composite, composite[podStart:podEnd]...) + composite = append(composite, remSuffix...) + remEnd := len(composite) + + // pod description + pdStart := len(composite) + composite = append(composite, pdPrefix...) + composite = append(composite, composite[podStart:podEnd]...) + composite = append(composite, pdMid...) + composite = append(composite, ev.ReportingInstance...) + pdEnd := len(composite) + + // --- string carve (unsafe; buffer is immutable post-build) --- + pod := stringSlice(composite, podStart, podEnd) + uid := stringSlice(composite, uidStart, uidEnd) + headline := stringSlice(composite, headStart, headEnd) + remediation := stringSlice(composite, remStart, remEnd) + podDesc := stringSlice(composite, pdStart, pdEnd) + return XidCorrelationVerdict{ PatternID: PatternIDXidCorrelation, XidCode: xid.Code, @@ -219,12 +361,12 @@ func buildXidCorrelationVerdict(xid XidRecord, ev Record) XidCorrelationVerdict EvictedPod: pod, PodName: ev.Regarding.Name, PodNamespace: ev.Regarding.Namespace, - Headline: xidCorrelationHeadline(xid, pod, dt), - Remediation: xidCorrelationRemediation(xid, pod), + Headline: headline, + Remediation: remediation, EvidenceTrail: []EvidenceRef{ { Kind: EvidenceKindXid, - UID: xidEvidenceUID(xid), + UID: uid, Timestamp: xid.Timestamp, Description: xidEvidenceDescription(xid), }, @@ -232,43 +374,102 @@ func buildXidCorrelationVerdict(xid XidRecord, ev Record) XidCorrelationVerdict Kind: EvidenceKindPodEvent, UID: ev.EventUID, Timestamp: ev.EventTime, - Description: fmt.Sprintf("Pod %s evicted on node %s", pod, ev.ReportingInstance), + Description: podDesc, }, }, } } +// podDisplayLength mirrors displayPodName's branches and returns the +// rendered length so the composite buffer can be pre-sized exactly. +// hasSep = ns + "/" + name; fallbackUnknown = "". +func podDisplayLength(ns, name string) (length int, hasSep bool, fallbackUnknown bool) { + if ns != "" && name != "" { + return len(ns) + 1 + len(name), true, false + } + if name != "" { + return len(name), false, false + } + return len(""), false, true +} + +// intToScratch renders n base-10 into the caller-provided [20]byte +// scratch and returns the populated sub-slice. The slice aliases the +// scratch array, so its lifetime is bounded by the caller's stack +// frame — but the array itself does not escape because each call site +// in buildXidCorrelationVerdict copies the bytes into the composite +// buffer (via append) before the next intToScratch call. Returning a +// []byte from a passed-in [20]byte keeps the scratch stack-allocated +// (the array is addressable and its address does not flow into the +// returned slice header beyond the caller's frame). +func intToScratch(scratch *[20]byte, n int64) []byte { + return strconv.AppendInt(scratch[:0], n, 10) +} + +// stringSlice carves an aliasing string over composite[lo:hi]. The +// caller MUST NOT mutate or further-append the underlying buffer +// after this call; in buildXidCorrelationVerdict the buffer is +// finalized before any carve happens, so the constraint is upheld by +// construction. +func stringSlice(composite []byte, lo, hi int) string { + if lo == hi { + return "" + } + return unsafe.String(&composite[lo], hi-lo) +} + // xidEvidenceUID synthesizes a stable identifier for the Xid // evidence ref. kmsg lines have no upstream UID — the (node, code, // timestamp) triple is the smallest globally-unique key. +// +// strings.Builder + strconv.AppendInt via WriteString of an itoa'd +// scratch buffer — strings.Builder.String() is zero-copy +// (unsafe.String aliases the builder's buffer), so this collapses to +// a single heap allocation per call. Sprintf was ~25% of per-verdict +// allocs under the bench fixture (issue #417). func xidEvidenceUID(xid XidRecord) string { - return fmt.Sprintf("%s/xid=%d/%d", xid.Node, xid.Code, xid.Timestamp.UnixNano()) + var sb strings.Builder + // Capacity: node + "/xid=" (5) + int (<=20) + "/" (1) + int (<=20). + sb.Grow(len(xid.Node) + 48) + sb.WriteString(xid.Node) + sb.WriteString("/xid=") + writeInt(&sb, int64(xid.Code)) + sb.WriteByte('/') + writeInt(&sb, xid.Timestamp.UnixNano()) + return sb.String() } // xidEvidenceDescription renders the operator-facing prose for the // Xid evidence ref. Falls back to a generic shape when Detail is // empty (e.g. a downstream stripped the body for PII reasons). +// +// Fast path returns xid.Detail directly (zero alloc); fallback builds +// "GPU Xid %d on node %s" via strings.Builder (issue #417). func xidEvidenceDescription(xid XidRecord) string { if xid.Detail != "" { return xid.Detail } - return fmt.Sprintf("GPU Xid %d on node %s", xid.Code, xid.Node) + var sb strings.Builder + sb.Grow(24 + len(xid.Node)) + sb.WriteString("GPU Xid ") + writeInt(&sb, int64(xid.Code)) + sb.WriteString(" on node ") + sb.WriteString(xid.Node) + return sb.String() } -// xidCorrelationHeadline renders the operator-facing one-liner. The -// shape is regex-asserted: /Xid \d+ on .* → .* evicted .*s later/. -func xidCorrelationHeadline(xid XidRecord, pod string, dt time.Duration) string { - return fmt.Sprintf("Xid %d on %s → %s evicted %ds later", xid.Code, xid.Node, pod, int(dt.Seconds())) -} - -// xidCorrelationRemediation returns the operator-actionable -// remediation prose. Pins the offending node + pod so the alert -// payload is self-contained. -func xidCorrelationRemediation(xid XidRecord, pod string) string { - return fmt.Sprintf( - "Likely GPU hardware failure on node %s (Xid %d). Drain the node, reseat or RMA the GPU, and reschedule pod %s once node returns Ready. Inspect dmesg / nvidia-smi -q for further driver state.", - xid.Node, xid.Code, pod, - ) +// writeInt appends the base-10 rendering of n to sb. Uses a +// stack-allocated scratch buffer + strconv.AppendInt so the integer +// formatting does not allocate (vs strconv.Itoa, which returns a +// fresh string per call). +// +// (Headline, Remediation, and pod-evidence-description rendering live +// in buildXidCorrelationVerdict's composite-buffer path; see issue +// #417 for the per-verdict alloc breakdown.) +func writeInt(sb *strings.Builder, n int64) { + var scratch [20]byte // max int64 base-10 width + sign + b := strconv.AppendInt(scratch[:0], n, 10) + sb.Write(b) } // PatternIDXidCorrelation is the xid-correlation pattern identifier. diff --git a/scripts/bench-registry.sh b/scripts/bench-registry.sh index f9b9d3a3..3326b6c9 100644 --- a/scripts/bench-registry.sh +++ b/scripts/bench-registry.sh @@ -45,7 +45,8 @@ bench_entries=( # # Initial ceilings (measured baseline + 1 slack per #302): # - pod_evicted measured 15.27/ev, ceiling 16 (tracked: optimize via #TBD) -# - xid_correlation measured 12.4/ev, ceiling 13 +# - xid_correlation measured 1.88/ev, ceiling 2 (ratcheted from 13 in #417; +# detector hit NORTHSTAR) # - hbm_ecc measured 1.4/ev, ceiling 2 # - nccl_hang measured 1.70/ev, ceiling 2 (ratcheted from 5 in #418; # detector hit NORTHSTAR) @@ -56,7 +57,7 @@ bench_entries=( # the ratchet path down to that ceiling. See #302 for the rollup. allocs_gate=( "BenchmarkPodEvictedDetector|1024|16" - "BenchmarkXidCorrelationDetector|1024|13" + "BenchmarkXidCorrelationDetector|1024|2" "BenchmarkHBMECCDetector|1024|2" "BenchmarkNCCLHangDetector|1024|2" "BenchmarkThermalThrottleDetector|1024|1"