diff --git a/bench/detectors/baselines.json b/bench/detectors/baselines.json index 397d21a1..46a5d007 100644 --- a/bench/detectors/baselines.json +++ b/bench/detectors/baselines.json @@ -8,10 +8,10 @@ "$threshold_pct": 10, "detectors": { "BenchmarkPodEvictedDetector": { - "allocs_per_op": 15635, - "b_per_op": 970020, - "allocs_per_event_approx": 15.27, - "notes": "Highest alloc/event detector; 1024 events fans out into 819 evictions + 64 node-pressure joins; per-evicted-pod verdict + evidence trail dominates." + "allocs_per_op": 1943, + "b_per_op": 473400, + "allocs_per_event_approx": 1.9, + "notes": "Hits NORTHSTAR (#434). Per-verdict headline + pod_event description are the irreducible floor (1 alloc each, 2 per verdict). Achieved via: (1) strconv byte-builders replacing fmt.Sprintf; (2) per-Evaluate scratch buffer shared across builders to eliminate `make([]byte)` escape; (3) pre-rendered indexed node-condition cache (uid + description + remediation) re-used across all full-path joins; (4) partial-path remediation cache keyed by (node, pressure); (5) shared evidence-trail backing array sliced cap=2 per verdict; (6) slices.SortStableFunc replacing reflection-based sort.SliceStable; (7) zero-alloc containsFold scan replacing strings.ToLower." }, "BenchmarkXidCorrelationDetector": { "allocs_per_op": 1928, diff --git a/module/pkg/patterns/pod_evicted.go b/module/pkg/patterns/pod_evicted.go index 97f258f5..6add3760 100644 --- a/module/pkg/patterns/pod_evicted.go +++ b/module/pkg/patterns/pod_evicted.go @@ -3,9 +3,9 @@ package patterns import ( - "fmt" + "slices" "sort" - "strings" + "strconv" "time" ) @@ -34,6 +34,23 @@ type PodEvictedDetector struct { JoinWindow time.Duration } +// partialRemediationKey indexes the partial-path remediation cache — +// see Evaluate's partialRemediations comment. (node, pressure) is the +// caching grain because annotateRemediationWithNode's output is a +// deterministic function of those two inputs (plus the static +// remediationFor(pressure) prose). +type partialRemediationKey struct { + node string + pressure NodePressureKind +} + +// missingLayersNodeCondition is the shared MissingLayers value stamped +// on partial verdicts (no node-condition joined). Reused across +// verdicts to avoid one `[]string{...}` alloc per evicted pod — the +// slice is read-only by JSON-marshal convention and pattern callers +// never mutate it. +var missingLayersNodeCondition = []string{EvidenceKindNodeCondition} + // Evaluate scans events for Pod-Evicted records and joins each // against the most recent matching node-condition record within the // JoinWindow. Returns one PodEvictedVerdict per evicted pod, in @@ -48,46 +65,118 @@ type PodEvictedDetector struct { // at the Hint check and never appear in the output. This satisfies // the rubric's negative-fixture gate: Killing/Preempted/FailedScheduling // map to non-pod_evicted Hints and so emit zero verdicts. +// +// Allocation-tuned for ≤2/event NORTHSTAR; recipe per #434. +// +// Byte-builders + strconv used in render functions to avoid fmt allocs (#434). func (d PodEvictedDetector) Evaluate(events []Record, nodeConds []NodeRecord) []PodEvictedVerdict { window := d.JoinWindow if window <= 0 { window = DefaultJoinWindow } + // Per-Evaluate scratch byte buffer reused across every verdict's + // string-builder call. Each builder resets to `scratch[:0]`, + // appends its content, and returns `string(scratch)` — the make + // happens once for the Evaluate pass, not once per builder per + // verdict. Sized to the largest known headline (~256 bytes) so no + // grow occurs in the steady state. + scratch := make([]byte, 0, 256) + // Index node conditions by node name and sort each bucket by // TransitionAt ascending. The detector uses sort.Search to find // the most-recent transition <= EventTime in O(log N). - condIdx := indexNodeConds(nodeConds) + // indexNodeConds also pre-renders each condition's emit strings + // once (uid, description, remediation) — see indexedNodeCond + // comment. + condIdx := indexNodeConds(nodeConds, &scratch) + + // Pre-size verdicts to len(events) — worst case one verdict per + // event (all evictions, no negative-hint filtering). One alloc + // for the slice header replaces ~7 grow-by-doubling appends on + // the 819-eviction bench fixture. + verdicts := make([]PodEvictedVerdict, 0, len(events)) + + // Pre-allocate ONE contiguous backing array for every verdict's + // EvidenceTrail (worst case 2 entries per verdict — pod_event + + // optional node_condition). Each verdict's trail is a sub-slice + // into this shared backing with len=0 cap=2, eliminating the + // per-verdict `make([]EvidenceRef, 1, 2)` alloc. The cap=2 bound + // prevents any later `append(v.EvidenceTrail, ...)` from + // clobbering the next verdict's window (cap-overflow re-allocates + // fresh memory per Go's slice-grow semantics). + trailBacking := make([]EvidenceRef, 2*len(events)) + trailCursor := 0 + + // Partial-path remediation cache: (node, pressure) -> rendered + // "On node X: " string. Real-world burst evictions on the + // same node share the same Note pressure-anchor and the same node + // name, so this cache turns N partial-path allocs into roughly + // distinct-node-count allocs. Lazily allocated inside buildVerdict + // on first partial path so zero-overhead when every eviction + // joins a node-condition (the common production case). + var partialRemediations map[partialRemediationKey]string - verdicts := make([]PodEvictedVerdict, 0) for i := range events { ev := events[i] if ev.Hint != HintPodEvicted { continue } - verdicts = append(verdicts, buildVerdict(ev, condIdx, window)) + trail := trailBacking[trailCursor : trailCursor : trailCursor+2] + trailCursor += 2 + verdicts = append(verdicts, buildVerdict(ev, condIdx, window, trail, &scratch, &partialRemediations)) } - sort.SliceStable(verdicts, func(i, j int) bool { - return evidenceTime(verdicts[i]).Before(evidenceTime(verdicts[j])) + // slices.SortStableFunc replaces sort.SliceStable — generics, no + // reflection-based swapper allocation. + slices.SortStableFunc(verdicts, func(a, b PodEvictedVerdict) int { + return evidenceTime(a).Compare(evidenceTime(b)) }) return verdicts } +// indexedNodeCond is a NodeRecord plus three pre-rendered strings the +// detector emits when this condition wins a join. Pre-rendering at +// index time amortizes the alloc cost across every eviction that +// joins this condition — the 64-node bench fixture has 820 evictions +// joining onto 64 conditions (≈13:1), so caching turns 820×3 = 2460 +// per-eviction string allocs into 64×3 = 192 once. +type indexedNodeCond struct { + rec NodeRecord + uid string + description string + remediation string +} + // indexNodeConds groups records by node name and sorts each bucket // by TransitionAt ascending so the detector can binary-search for -// the most-recent transition before an eviction. -func indexNodeConds(recs []NodeRecord) map[string][]NodeRecord { - idx := map[string][]NodeRecord{} +// the most-recent transition before an eviction. Returns a map of +// indexedNodeCond (with pre-rendered emit strings — see +// indexedNodeCond comment). +func indexNodeConds(recs []NodeRecord, scratch *[]byte) map[string][]indexedNodeCond { + // Pre-size to len(recs) — worst case is one bucket per record + // (every record on a distinct node), best case the map stays + // small. Either way the hint avoids the default 8-bucket + // grow-by-doubling cost on the 64-node bench fixture. + idx := make(map[string][]indexedNodeCond, len(recs)) for _, r := range recs { - idx[r.NodeName] = append(idx[r.NodeName], r) + idx[r.NodeName] = append(idx[r.NodeName], indexedNodeCond{ + rec: r, + uid: nodeConditionUID(r, scratch), + description: nodeConditionDescription(r, scratch), + remediation: annotateRemediationWithNode(remediationFor(r.Pressure), r.NodeName, scratch), + }) } - for k := range idx { - recs := idx[k] - sort.SliceStable(recs, func(i, j int) bool { - return recs[i].TransitionAt.Before(recs[j].TransitionAt) + for k, bucket := range idx { + if len(bucket) <= 1 { + continue // trivially sorted; skip the sort call entirely + } + // slices.SortStableFunc uses generics — no reflection, no + // per-call swapper allocation (sort.SliceStable's hidden cost). + slices.SortStableFunc(bucket, func(a, b indexedNodeCond) int { + return a.rec.TransitionAt.Compare(b.rec.TransitionAt) }) - idx[k] = recs + idx[k] = bucket } return idx } @@ -96,99 +185,208 @@ func indexNodeConds(recs []NodeRecord) map[string][]NodeRecord { // condition index and produces the verdict. Confidence is Full iff a // node-condition record was found within the window; Partial otherwise // with MissingLayers=["node_condition"]. -func buildVerdict(ev Record, condIdx map[string][]NodeRecord, window time.Duration) PodEvictedVerdict { +// buildVerdict's `trail` parameter is a len=0 cap=2 window into the +// caller-owned trail backing — see Evaluate's trailBacking comment. +// buildVerdict appends 1 (partial) or 2 (full) entries; cap=2 means +// the append never escapes to a fresh allocation. +func buildVerdict(ev Record, condIdx map[string][]indexedNodeCond, window time.Duration, trail []EvidenceRef, scratch *[]byte, partialCache *map[partialRemediationKey]string) PodEvictedVerdict { + node := ev.ReportingInstance + cond, joined := mostRecentConditionWithin(condIdx[node], ev.EventTime, window) + + trail = append(trail, EvidenceRef{ + Kind: EvidenceKindPodEvent, + UID: ev.EventUID, + Timestamp: ev.EventTime, + Description: podEventDescription(ev, scratch), + }) + v := PodEvictedVerdict{ - PatternID: PatternIDPodEvicted, - PodName: ev.Regarding.Name, - PodNamespace: ev.Regarding.Namespace, - NodeName: ev.ReportingInstance, - EventReason: ev.Reason, - EvidenceTrail: []EvidenceRef{ - { - Kind: EvidenceKindPodEvent, - UID: ev.EventUID, - Timestamp: ev.EventTime, - Description: fmt.Sprintf("Pod %s/%s evicted (reason=%s) on node %s", ev.Regarding.Namespace, ev.Regarding.Name, ev.Reason, ev.ReportingInstance), - }, - }, + PatternID: PatternIDPodEvicted, + PodName: ev.Regarding.Name, + PodNamespace: ev.Regarding.Namespace, + NodeName: node, + EventReason: ev.Reason, + EvidenceTrail: trail, } - node := ev.ReportingInstance - cond, ok := mostRecentConditionWithin(condIdx[node], ev.EventTime, window) - if !ok { + if !joined { + pressure := pressureFromNote(ev.Note) v.Confidence = ConfidencePartial - v.MissingLayers = []string{EvidenceKindNodeCondition} - v.Headline = fmt.Sprintf("Pod %s evicted at %s due to %s", - displayPodName(ev), formatTimestamp(ev.EventTime), - pressureFromNote(ev.Note)) - v.Remediation = annotateRemediationWithNode(remediationFor(pressureFromNote(ev.Note)), node) + // Shared package-level slice — see missingLayersNodeCondition. + v.MissingLayers = missingLayersNodeCondition + v.Headline = renderHeadline(ev, pressure, false, scratch) + // Cache check FIRST so a hit skips the alloc entirely. + key := partialRemediationKey{node: node, pressure: pressure} + if cached, ok := (*partialCache)[key]; ok { + v.Remediation = cached + } else { + rem := annotateRemediationWithNode(remediationFor(pressure), node, scratch) + if *partialCache == nil { + *partialCache = make(map[partialRemediationKey]string, 8) + } + (*partialCache)[key] = rem + v.Remediation = rem + } return v } v.Confidence = ConfidenceFull + // uid, description, remediation are pre-rendered once per + // condition at index time (see indexedNodeCond) — re-using the + // strings here saves 3 allocs per full-join eviction. v.EvidenceTrail = append(v.EvidenceTrail, EvidenceRef{ Kind: EvidenceKindNodeCondition, - UID: fmt.Sprintf("%s/%s/%d", cond.NodeUID, cond.Pressure, cond.TransitionAt.UnixNano()), - Timestamp: cond.TransitionAt, - Description: nodeConditionDescription(cond), + UID: cond.uid, + Timestamp: cond.rec.TransitionAt, + Description: cond.description, }) - v.Headline = fmt.Sprintf("Pod %s evicted at %s due to %s pressure", - displayPodName(ev), formatTimestamp(ev.EventTime), cond.Pressure) - v.Remediation = annotateRemediationWithNode(remediationFor(cond.Pressure), cond.NodeName) + v.Headline = renderHeadline(ev, cond.rec.Pressure, true, scratch) + v.Remediation = cond.remediation return v } +// podEventDescription renders the pod_event evidence-trail line. +// Output: "Pod {ns}/{name} evicted (reason={reason}) on node {node}". +func podEventDescription(ev Record, scratch *[]byte) string { + ns := ev.Regarding.Namespace + name := ev.Regarding.Name + reason := ev.Reason + node := ev.ReportingInstance + buf := (*scratch)[:0] + buf = append(buf, "Pod "...) + buf = append(buf, ns...) + buf = append(buf, '/') + buf = append(buf, name...) + buf = append(buf, " evicted (reason="...) + buf = append(buf, reason...) + buf = append(buf, ") on node "...) + buf = append(buf, node...) + *scratch = buf + return string(buf) +} + +// nodeConditionUID renders the synthetic NodeRecord UID +// "{node-uid}/{pressure-kind}/{transition-unix-nano}". +func nodeConditionUID(cond NodeRecord, scratch *[]byte) string { + buf := (*scratch)[:0] + buf = append(buf, cond.NodeUID...) + buf = append(buf, '/') + buf = append(buf, cond.Pressure...) + buf = append(buf, '/') + buf = strconv.AppendInt(buf, cond.TransitionAt.UnixNano(), 10) + *scratch = buf + return string(buf) +} + +// renderHeadline renders the operator-facing headline. Inlines +// displayPodName to avoid the per-call intermediate-string alloc +// (Go's `+` concat allocates a fresh backing array). Byte-identical +// to the prior two-function shape: +// - withPressureSuffix=true: "Pod %s evicted at %s due to %s pressure" +// - withPressureSuffix=false: "Pod %s evicted at %s due to %s" +// +// The partial path passes false because pressureFromNote can return +// "unknown" — appending " pressure" would read "unknown pressure" +// which is wrong; preserved verbatim with prior behavior. +func renderHeadline(ev Record, pressure NodePressureKind, withPressureSuffix bool, scratch *[]byte) string { + ns := ev.Regarding.Namespace + name := ev.Regarding.Name + buf := (*scratch)[:0] + buf = append(buf, "Pod "...) + // Inline displayPodName: avoids the intermediate-string alloc. + switch { + case ns != "" && name != "": + buf = append(buf, ns...) + buf = append(buf, '/') + buf = append(buf, name...) + case name != "": + buf = append(buf, name...) + default: + buf = append(buf, ""...) + } + buf = append(buf, " evicted at "...) + buf = appendTimestamp(buf, ev.EventTime) + buf = append(buf, " due to "...) + buf = append(buf, pressure...) + if withPressureSuffix { + buf = append(buf, " pressure"...) + } + *scratch = buf + return string(buf) +} + // annotateRemediationWithNode prefixes operator-actionable prose with // the offending node so an alert payload is self-contained — the // operator doesn't have to cross-walk into the evidence trail to // learn which node to log into. Empty nodeName (e.g., the partial // path with no ReportingInstance) skips the prefix to avoid an // awkward "On node :" rendering. -func annotateRemediationWithNode(remediation, nodeName string) string { +func annotateRemediationWithNode(remediation, nodeName string, scratch *[]byte) string { if nodeName == "" { return remediation } - return fmt.Sprintf("On node %s: %s", nodeName, remediation) + buf := (*scratch)[:0] + buf = append(buf, "On node "...) + buf = append(buf, nodeName...) + buf = append(buf, ": "...) + buf = append(buf, remediation...) + *scratch = buf + return string(buf) } -// mostRecentConditionWithin returns the most recent NodeRecord in -// the per-node-sorted slice whose TransitionAt is <= evTime and +// mostRecentConditionWithin returns the most recent indexedNodeCond +// in the per-node-sorted slice whose TransitionAt is <= evTime and // >= evTime-window. Bucket is sorted ascending; binary-search finds // the rightmost element <= evTime, then we verify the window bound. -func mostRecentConditionWithin(bucket []NodeRecord, evTime time.Time, window time.Duration) (NodeRecord, bool) { +func mostRecentConditionWithin(bucket []indexedNodeCond, evTime time.Time, window time.Duration) (indexedNodeCond, bool) { if len(bucket) == 0 { - return NodeRecord{}, false + return indexedNodeCond{}, false } i := sort.Search(len(bucket), func(i int) bool { - return bucket[i].TransitionAt.After(evTime) + return bucket[i].rec.TransitionAt.After(evTime) }) if i == 0 { - return NodeRecord{}, false + return indexedNodeCond{}, false } candidate := bucket[i-1] - if evTime.Sub(candidate.TransitionAt) > window { - return NodeRecord{}, false + if evTime.Sub(candidate.rec.TransitionAt) > window { + return indexedNodeCond{}, false } return candidate, true } // nodeConditionDescription renders the evidence-trail prose for a -// joined NodeRecord. Omits the ": " suffix when the -// upstream Message is empty so the description never ends with a -// dangling colon. -func nodeConditionDescription(cond NodeRecord) string { - base := fmt.Sprintf("Node %s entered %s pressure", cond.NodeName, cond.Pressure) - if cond.Message == "" { - return base +// joined NodeRecord: "Node {name} entered {pressure} pressure" +// (+ optional ": {message}" suffix). Omits the ": " suffix +// when the upstream Message is empty so the description never ends +// with a dangling colon. +func nodeConditionDescription(cond NodeRecord, scratch *[]byte) string { + buf := (*scratch)[:0] + buf = append(buf, "Node "...) + buf = append(buf, cond.NodeName...) + buf = append(buf, " entered "...) + buf = append(buf, cond.Pressure...) + buf = append(buf, " pressure"...) + if cond.Message != "" { + buf = append(buf, ": "...) + buf = append(buf, cond.Message...) } - return base + ": " + cond.Message + *scratch = buf + return string(buf) } // displayPodName returns "namespace/name" for a Pod-shaped Record; -// falls back to just the name (or "") otherwise. +// falls back to just the name (or "") otherwise. Retained +// as a package-level helper (external callers, e.g. xid_correlation, +// may use it); pod_evicted's hot path inlines this logic into +// renderHeadline. func displayPodName(ev Record) string { if ev.Regarding.Namespace != "" && ev.Regarding.Name != "" { - return ev.Regarding.Namespace + "/" + ev.Regarding.Name + buf := make([]byte, 0, len(ev.Regarding.Namespace)+len(ev.Regarding.Name)+1) + buf = append(buf, ev.Regarding.Namespace...) + buf = append(buf, '/') + buf = append(buf, ev.Regarding.Name...) + return string(buf) } if ev.Regarding.Name != "" { return ev.Regarding.Name @@ -196,9 +394,23 @@ func displayPodName(ev Record) string { return "" } -// formatTimestamp renders an EventTime in RFC3339 UTC. Zero time -// renders as "" so the headline regex still matches the -// /at .*/ shape. +// appendTimestamp appends an EventTime in RFC3339 UTC to buf in +// place. Zero time renders as "" so the headline regex +// still matches the /at .*/ shape. +// +// Uses time.Time.AppendFormat (zero-alloc into our shared buffer) +// instead of time.Time.Format (allocates an intermediate string). +// Profile contribution: ~3% of pre-fix allocs. +func appendTimestamp(buf []byte, t time.Time) []byte { + if t.IsZero() { + return append(buf, ""...) + } + return t.UTC().AppendFormat(buf, time.RFC3339) +} + +// formatTimestamp renders an EventTime in RFC3339 UTC. Retained for +// external callers (test helpers); hot-path callers in this file use +// appendTimestamp directly to share a buffer. func formatTimestamp(t time.Time) string { if t.IsZero() { return "" @@ -212,11 +424,15 @@ func formatTimestamp(t time.Time) string { // vocabulary verbatim (signals + nodeConditionMessageFmt + the // ephemeral-storage / EmptyDir paths). Non-kubelet drivers // (descheduler, custom controllers) intentionally land in "unknown". +// +// Uses a zero-alloc case-insensitive substring scan (containsFold) +// in place of `strings.ToLower(note)` + `strings.Contains`, which +// allocated a lowercased copy of the note (one alloc per evicted +// pod under the bench fixture). func pressureFromNote(note string) NodePressureKind { - low := strings.ToLower(note) for _, m := range pressureMatchers { for _, anchor := range m.anchors { - if strings.Contains(low, anchor) { + if containsFold(note, anchor) { return m.kind } } @@ -224,6 +440,40 @@ func pressureFromNote(note string) NodePressureKind { return "unknown" } +// containsFold reports whether substr occurs in s under ASCII +// case-insensitive comparison. Zero-alloc — scans s in place without +// copying. +// +// Safe only for ASCII source + ASCII-lowercase substrs. All +// pressureMatchers anchors are ASCII; kubelet eviction messages are +// English-only. +func containsFold(s, substr string) bool { + if len(substr) == 0 { + return true + } + if len(substr) > len(s) { + return false + } + last := len(s) - len(substr) + for i := 0; i <= last; i++ { + match := true + for j := 0; j < len(substr); j++ { + c := s[i+j] + if c >= 'A' && c <= 'Z' { + c += 'a' - 'A' + } + if c != substr[j] { + match = false + break + } + } + if match { + return true + } + } + return false +} + // pressureMatchers is the table-driven mapping from anchor substring // to pressure kind. Order is load-bearing — Disk first so e.g. // "ephemeral storage" beats a hypothetical message that mentions diff --git a/scripts/bench-registry.sh b/scripts/bench-registry.sh index 3326b6c9..507f224b 100644 --- a/scripts/bench-registry.sh +++ b/scripts/bench-registry.sh @@ -44,7 +44,8 @@ bench_entries=( # in the bench file requires updating these entries. # # Initial ceilings (measured baseline + 1 slack per #302): -# - pod_evicted measured 15.27/ev, ceiling 16 (tracked: optimize via #TBD) +# - pod_evicted measured 1.90/ev, ceiling 2 (ratcheted from 16 in #434; +# detector hit NORTHSTAR) # - xid_correlation measured 1.88/ev, ceiling 2 (ratcheted from 13 in #417; # detector hit NORTHSTAR) # - hbm_ecc measured 1.4/ev, ceiling 2 @@ -56,7 +57,7 @@ bench_entries=( # NORTHSTAR is 2 allocs/event; the per-detector tracking issues drive # the ratchet path down to that ceiling. See #302 for the rollup. allocs_gate=( - "BenchmarkPodEvictedDetector|1024|16" + "BenchmarkPodEvictedDetector|1024|2" "BenchmarkXidCorrelationDetector|1024|2" "BenchmarkHBMECCDetector|1024|2" "BenchmarkNCCLHangDetector|1024|2"