From 777eee332c6ecbb86ea2da7756e2a35d0a146ab3 Mon Sep 17 00:00:00 2001 From: Tri Lam Date: Mon, 1 Jun 2026 18:58:09 -0700 Subject: [PATCH] feat(integrations/examples): pattern-10 CUDA OOM filelog OTTL stanza MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the `transform/cuda_oom` processor to `docs/integrations/examples/filelog-container.yaml` that projects PyTorch's canonical `RuntimeError: CUDA out of memory. Tried to allocate X.YY . GPU N has a total capacity of ...` stderr line onto the customer-stable `cuda_oom.tried_alloc_bytes` (Int, bytes; unit-normalized KiB/MiB/GiB/TiB) and `cuda_oom.gpu_index` (Int) attributes that pattern #10's detector (PR #338) reads via `projectCUDAOOMLogRecord`. Closes the load-bearing filelog→detector integration gap flagged in issue #303 follow-ups. Per-unit-branch shape (one stanza per KiB/MiB/GiB/TiB prefix) because OTTL has no capture-group-conditional dispatch — the multiplier must be a literal int64 per stanza. Uses OTTL Math Expressions (`Int(whole)*UNIT + Int(frac)*(UNIT/100)`) to handle PyTorch's `%.2f` `format_size` output; precision loss capped at <1% of the unit base, three orders of magnitude under the detector's 5% fragmentation threshold. `gpu.id` (PCI BDF per RFC-0013 §3) is NOT stamped by this transform — the CUDA-runtime ordinal `cuda_oom.gpu_index` is not a PCI BDF. Two operator-configurable paths documented in the recipe markdown: (a) k8sattributesprocessor + `nvidia.com/gpu-PCIDeviceBusID` device- plugin annotation, or (b) DCGM BDF-lookup transform indexed by `cuda_oom.gpu_index`. The detector's resource-attr fallback reads `gpu.id` from the log resource either way. 
Tests (TDD red→green): three new recipe-parity tests under `module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go`: - `TestRecipe_CUDAOOM_StanzaPinsWireContract`: pins the 7 load- bearing tokens (`cuda_oom.tried_alloc_bytes`, `cuda_oom.gpu_index`, KiB/MiB/GiB/TiB unit prefixes, `transform/cuda_oom`) + pipeline- wiring against the live projector. Mirrors PR #393's IB-flap shape. - `TestRecipe_CUDAOOM_RoundTripFiresVerdict`: end-to-end gate — log records carrying the exact attribute shape the recipe stamps flow through CUDAOOMDetector and emit a kind=fragmentation verdict with the expected scalar-promotion contract. - `TestRecipe_CUDAOOM_RegexCoversCanonicalPyTorchMessages`: 5 canonical positive PyTorch OOM messages (KiB/MiB/GiB/TiB/fractional) + 3 negative messages (DataLoader worker killed, NCCL watchdog, illegal memory access). Exceeds the >=3-positive A-tier acceptance criterion. Validates clean: - `tracecore validate docs/integrations/examples/filelog-container.yaml` exits 0. - `make validator-recipe` covers this file (tested-against: tracecore). - `make doc-check` resolves the new pattern-10 cross-link. - Full `make ci-fast` green. Cross-links: - Pattern doc: `docs/patterns/10-cuda-oom-deceptive.md` §"Signal sources" now references this recipe and resolves Open Question #2 ("filelogreceiver OTTL stanza for the OOM regex"). - Recipe markdown: new §`cuda_oom.*` attribute stanza (pattern #10) in `docs/integrations/filelog-container.md` with the unit- normalization arithmetic table, the two `gpu.id` source paths, and a Failure-modes row. Closes #436. Refs #338, #303, #337. 
Signed-off-by: Tri Lam --- .../examples/filelog-container.yaml | 77 ++++- docs/integrations/filelog-container.md | 64 ++++ docs/patterns/10-cuda-oom-deceptive.md | 4 +- .../cuda_oom_recipe_test.go | 324 ++++++++++++++++++ 4 files changed, 465 insertions(+), 4 deletions(-) create mode 100644 module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go diff --git a/docs/integrations/examples/filelog-container.yaml b/docs/integrations/examples/filelog-container.yaml index 03c98be4..00dc5af5 100644 --- a/docs/integrations/examples/filelog-container.yaml +++ b/docs/integrations/examples/filelog-container.yaml @@ -133,6 +133,75 @@ processors: # directly). Catch-all for the "transport died" runbook # branch. - 'set(attributes["dataloader.error_class"], "Connection reset by peer") where IsMatch(body, "Connection reset by peer") and attributes["dataloader.error_class"] == nil' + + # Project PyTorch's `RuntimeError: CUDA out of memory. Tried to + # allocate X.YY . GPU N has a total capacity of ...` stderr + # line onto the customer-stable `cuda_oom.tried_alloc_bytes` (Int, + # bytes) + `cuda_oom.gpu_index` (Int) attributes that pattern #10's + # detector (module/processor/patterndetectorprocessor/cuda_oom.go, + # `projectCUDAOOMLogRecord`) consumes. The detector's projection + # gate is BOTH `cuda_oom.tried_alloc_bytes` AND `gpu.id` (PCI BDF + # per RFC-0013 §3); the stanzas below stamp the bytes scalar and + # the human-visible GPU index off the body. `gpu.id` is the + # operator-configurable mapping — `cuda_oom.gpu_index` is the + # CUDA-runtime ordinal that PyTorch's allocator prints, NOT a PCI + # BDF, so the recipe DOES NOT alias it onto `gpu.id`. Two paths + # to populate `gpu.id` are documented in + # docs/integrations/filelog-container.md §"`cuda_oom.*` attribute + # stanza (pattern #10)": + # (a) k8sattributesprocessor + `nvidia.com/gpu` device-plugin + # resource — the pod allocation maps to one PCI BDF, lifted + # onto the log resource as `gpu.id`. 
+ # (b) a sibling DCGM BDF-lookup transform indexed by + # `cuda_oom.gpu_index` — the DCGM exporter ships a per-host + # (index → BDF) table on its scrape endpoint. + # Either path stamps `gpu.id` on the resource; the detector's + # resource-attr fallback (cuda_oom.go:65) reads it from there. + # + # Unit normalization: PyTorch's `format_size` emits `%.2f <unit>` + # with the four IEC binary prefixes below (KiB / MiB / GiB / TiB). + # OTTL Math Expressions support `*` and `+` on int64, so we capture + # `whole` (digits before the dot) + 2-digit `frac` (digits after) + # and compute `Int(whole)*UNIT + Int(frac)*(UNIT/100)`. The integer + # division floors the per-frac-unit step (max precision loss: + # under 1 byte per frac-unit, ~75 bytes at `.99` — negligible next to + # the detector's 5% fragmentation threshold). + # + # Per-unit branches instead of one omnibus regex: OTTL has no + # capture-group-conditional dispatch, so the multiplier must be a + # literal int64 per stanza. The four-row repetition is the smallest + # shape that compiles. The `where IsMatch(...)` guard is tight on + # `CUDA out of memory\. Tried to allocate` so a generic CUDA error + # (illegal memory access, NCCL watchdog) does not trip the stanza. + transform/cuda_oom: + log_statements: + - context: log + statements: + # ---- GPU index extraction (any OOM line) ---- + # PyTorch prints `GPU N has a total capacity of ...` after + # the alloc-size scalar. The index is the CUDA-runtime + # ordinal, NOT a PCI BDF — the detector's `gpu.id` projection + # is satisfied via the k8sattributes / DCGM-lookup paths + # documented above; `cuda_oom.gpu_index` is operator-facing + # context the verdict's evidence trail uses. + - 'set(attributes["cuda_oom.gpu_index"], Int(ExtractPatterns(body, "GPU (?P<idx>\\d+) has a total capacity")["idx"])) where IsMatch(body, "CUDA out of memory\\. Tried to allocate") and IsMatch(body, "GPU \\d+ has a total capacity")' + + # ---- KiB branch ---- + # 1 KiB = 1024 B. 
frac-unit step = 1024/100 = 10 (floor). + - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) KiB")["w"]) * 1024 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) KiB")["f"]) * 10) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} KiB")' + + # ---- MiB branch ---- + # 1 MiB = 1048576 B. frac-unit step = 1048576/100 = 10485 (floor). + - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) MiB")["w"]) * 1048576 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) MiB")["f"]) * 10485) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} MiB")' + + # ---- GiB branch ---- + # 1 GiB = 1073741824 B. frac-unit step = 1073741824/100 = 10737418 (floor). + - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) GiB")["w"]) * 1073741824 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) GiB")["f"]) * 10737418) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} GiB")' + + # ---- TiB branch ---- + # 1 TiB = 1099511627776 B. frac-unit step = 1099511627776/100 = 10995116277 (floor). + - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) TiB")["w"]) * 1099511627776 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) TiB")["f"]) * 10995116277) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} TiB")' + k8sattributes: auth_type: serviceAccount passthrough: false @@ -181,6 +250,10 @@ service: # body strings produced by the container parser and stamp the # customer-stable `dataloader.error_class` / # `dataloader.worker_pid` attributes pattern #7's detector - # consumes. - processors: [k8sattributes, transform/dataloader_errors, batch] + # consumes. 
`transform/cuda_oom` runs alongside dataloader_errors + # (order-insensitive — they gate on disjoint body substrings) + # to stamp `cuda_oom.tried_alloc_bytes` + `cuda_oom.gpu_index` + # off PyTorch's `RuntimeError: CUDA out of memory` line for + # pattern #10's detector. + processors: [k8sattributes, transform/dataloader_errors, transform/cuda_oom, batch] exporters: [otlphttp] diff --git a/docs/integrations/filelog-container.md b/docs/integrations/filelog-container.md index 34c0e0e1..a111a06f 100644 --- a/docs/integrations/filelog-container.md +++ b/docs/integrations/filelog-container.md @@ -13,6 +13,11 @@ projects per-driver PyTorch `DataLoader` error vocabulary (FUSE, S3, Lustre, multiprocessing queue, worker-killed) onto the customer-stable `dataloader.error_class` / `dataloader.worker_pid` attributes that [pattern #7's detector](../patterns/07-dataloader-hang.md) consumes. +A sibling `transform/cuda_oom` stanza projects PyTorch's +`RuntimeError: CUDA out of memory. Tried to allocate X.YY ` line +onto the customer-stable `cuda_oom.tried_alloc_bytes` (Int, bytes; +unit-normalized) + `cuda_oom.gpu_index` (Int) attributes that +[pattern #10's detector](../patterns/10-cuda-oom-deceptive.md) consumes. Replaces the in-tree `containerstdout` receiver scheduled for deletion at v0.2.0 per [RFC-0013 §migration PR-K](../rfcs/0013-distro-first-pivot.md#migration-rollout) @@ -169,6 +174,63 @@ at `module/pkg/patterns/dataloader_hang.go`). > error classes (e.g. a future Ceph-class driver) extend the table > here, not by widening an existing regex. +## `cuda_oom.*` attribute stanza (pattern #10) + +The `transform/cuda_oom` processor projects PyTorch's canonical +out-of-memory stderr line — `RuntimeError: CUDA out of memory. Tried +to allocate 2.00 GiB. 
GPU 0 has a total capacity of 79.18 GiB of +which 16.00 GiB is free.` — onto the customer-stable +[`cuda_oom.tried_alloc_bytes`](../ATTRIBUTES.md) + +[`cuda_oom.gpu_index`](../ATTRIBUTES.md) attributes that +[pattern #10's detector](../patterns/10-cuda-oom-deceptive.md) +(`projectCUDAOOMLogRecord` at +`module/processor/patterndetectorprocessor/cuda_oom.go`) consumes. +The detector's projection gate is BOTH `cuda_oom.tried_alloc_bytes` +AND `gpu.id` (PCI BDF per +[RFC-0013 §3](../rfcs/0013-distro-first-pivot.md#3-customer-stable-telemetry-contracts)); +this stanza stamps the bytes scalar and the human-visible GPU index +off the body. `gpu.id` is **not** stamped here — the CUDA-runtime +ordinal `cuda_oom.gpu_index` is a CUDA enumeration index, not a PCI +BDF. Two operator-configurable paths populate `gpu.id` on the log +resource so the detector's resource-attr fallback reads it: + +| `gpu.id` source path | When to use | +|---|---| +| **k8sattributesprocessor + `nvidia.com/gpu` device-plugin resource** | The trainer pod requests one GPU via `resources.limits.nvidia.com/gpu: 1`. The NVIDIA device plugin annotates the pod with the allocated PCI BDF (`nvidia.com/gpu-PCIDeviceBusID` since device-plugin v0.16). Extend `k8sattributes::extract::annotations` to lift this annotation onto the log resource as `gpu.id`. Cheapest path — already in the cluster's GPU scheduling fabric. | +| **DCGM BDF-lookup transform indexed by `cuda_oom.gpu_index`** | Multi-GPU pods (one container ↔ N GPUs) where the device-plugin annotation is the per-pod list, not the per-OOM GPU. Scrape the DCGM exporter's `DCGM_FI_DEV_PCI_BUSID` series, materialize a per-host `{gpu_index → BDF}` lookup, then add a sibling OTTL stanza that joins `cuda_oom.gpu_index` against the table to stamp `gpu.id`. Sibling to the [pattern-2 / pattern-10 DCGM recipe](prometheus-scrape.md). 
| + +The recipe uses four per-unit-prefix branches (KiB / MiB / GiB / TiB) +because OTTL has no capture-group-conditional dispatch — the +multiplier must be a literal `int64` per stanza. The body match +captures `whole` (digits before the decimal) and `frac` (two digits +after) and computes +`Int(whole) * UNIT + Int(frac) * (UNIT / 100)`. PyTorch's +`format_size` always emits `%.2f`, so the 2-digit `frac` capture is +exhaustive; the integer-divide-by-100 floor loses under 1 byte per +frac-unit (at most ~75 bytes at `.99`, in any unit) — negligible +against the detector's 5% fragmentation threshold. + +| Body shape | Captured | Stamped attributes | +|---|---|---| +| `CUDA out of memory. Tried to allocate \d+\.\d{2} KiB` | `whole`, `frac` (×2 digits) | `cuda_oom.tried_alloc_bytes = whole*1024 + frac*10` | +| `CUDA out of memory. Tried to allocate \d+\.\d{2} MiB` | `whole`, `frac` | `cuda_oom.tried_alloc_bytes = whole*1048576 + frac*10485` | +| `CUDA out of memory. Tried to allocate \d+\.\d{2} GiB` | `whole`, `frac` | `cuda_oom.tried_alloc_bytes = whole*1073741824 + frac*10737418` | +| `CUDA out of memory. Tried to allocate \d+\.\d{2} TiB` | `whole`, `frac` | `cuda_oom.tried_alloc_bytes = whole*1099511627776 + frac*10995116277` | +| `... GPU \d+ has a total capacity` | `idx` | `cuda_oom.gpu_index = idx` | + +The `where IsMatch(body, "CUDA out of memory\. Tried to allocate")` +guard is tight on the OOM-summary line, so generic CUDA errors +(`an illegal memory access was encountered`, NCCL watchdog timeouts, +`DataLoader worker (pid N) is killed`) do not trip the stanza — +keeping the detector quiet on non-OOM stderr noise. + +> **Multi-line tracebacks.** A PyTorch OOM emits the summary line +> followed by a Python traceback (`File "train.py", line 42, in ...`). 
+> The container parser flattens each newline-delimited log line into +> its own log record; only the summary line matches the regex above, +> so the detector sees exactly one stamp per OOM event regardless of +> traceback depth. This is pattern #10 spec Open Q#2's answer. + ## Placeholders | Placeholder | What to fill in | @@ -192,6 +254,8 @@ fails immediately instead of silently dropping logs. | High-cardinality label explosion | The container parser surfaces every label from `app.kubernetes.io/name` plus whatever you add under `extract::labels`. Audit the list against the receiving backend's cardinality budget before adding more. | | Pattern #7 verdict never fires despite known DataLoader stalls | The `transform/dataloader_errors` stanzas gate on substring matches against the container `body`. If your trainer wraps DataLoader errors (e.g. a custom logger that prefixes with JSON), the body shape changes. Confirm via `kubectl logs --container= --previous 2>&1 | grep -E 'DataLoader worker|Transport endpoint|SlowDown|Stale file handle'` and extend the regexes in `transform/dataloader_errors`. | | `dataloader.error_class` empty on a known error line | The OTTL stanza fell through silently — the body substring did not match any branch. Add a row to the table above and a matching `set(attributes["dataloader.error_class"], ...)` statement. The detector's projection gate requires the attribute, so a missing class drops the discriminator. | +| Pattern #10 verdict never fires despite a known CUDA OOM | The `transform/cuda_oom` stanzas gate on substring matches against the container `body`. Confirm via `kubectl logs --container= --previous 2>&1 \| grep -E 'CUDA out of memory\. Tried to allocate'`. If the trainer wraps PyTorch errors (custom logger, JSON envelope), the body shape changes — extend the `IsMatch` predicates to match the wrapper format. 
Also check that `gpu.id` is being stamped onto the log resource via one of the two paths in the `cuda_oom.*` section: a missing `gpu.id` drops the projection at `cuda_oom.go`'s gate and the detector stays quiet. | +| `cuda_oom.tried_alloc_bytes` stamped with a wildly wrong magnitude | A unit-prefix branch was modified without updating its multiplier, or the body shape drifted from `%.2f`. PyTorch's `format_size` has used `%.2f` for the entire CUDA-allocator lifetime; if a customer fork emits `%.0f` or `%.4f` the recipe's `\d{2}` capture misses, and the stanza fails open (no stamp) rather than producing a wrong value. Verify against `pytorch/c10/util/Exception.h`'s formatter. | Upstream component docs: [`receiver/filelogreceiver`](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver), diff --git a/docs/patterns/10-cuda-oom-deceptive.md b/docs/patterns/10-cuda-oom-deceptive.md index 2f5b215e..9ce6d175 100644 --- a/docs/patterns/10-cuda-oom-deceptive.md +++ b/docs/patterns/10-cuda-oom-deceptive.md @@ -16,7 +16,7 @@ Training fails with `RuntimeError: CUDA out of memory. Tried to allocate X MiB. ## Signal sources -- `filelogreceiver` tailing training-container stderr — OTTL stanza on the `RuntimeError: CUDA out of memory` line extracts `cuda_oom.tried_alloc_bytes`, `cuda_oom.total_bytes`, `cuda_oom.free_bytes`, `cuda_oom.gpu_index`. +- `filelogreceiver` tailing training-container stderr — OTTL stanza on the `RuntimeError: CUDA out of memory` line extracts `cuda_oom.tried_alloc_bytes` (unit-normalized KiB/MiB/GiB/TiB → bytes) and `cuda_oom.gpu_index`. 
Recipe: [docs/integrations/filelog-container.md §`cuda_oom.*` attribute stanza (pattern #10)](../integrations/filelog-container.md#cuda_oom-attribute-stanza-pattern-10) (issue [#436](https://github.com/tracecoreai/tracecore/issues/436); sibling to detector PR [#338](https://github.com/tracecoreai/tracecore/pull/338) and metric-side recipe [#337](https://github.com/tracecoreai/tracecore/issues/337)). `gpu.id` (PCI BDF per RFC-0013 §3) is stamped via the sibling k8sattributesprocessor + `nvidia.com/gpu` device-plugin resource OR a DCGM BDF-lookup transform indexed by `cuda_oom.gpu_index` — the recipe documents both paths. - `prometheusreceiver` scraping `dcgm-exporter` — `DCGM_FI_DEV_FB_USED` / `DCGM_FI_DEV_FB_FREE` projected via OTTL transform to `hw.gpu.memory.{used,free}` Gauges (unit `By`) with `gpu.id` resource attr. Per-GPU `hw.gpu.memory.total = used + free` is computed at the metrics-to-logs bridge layer, not at OTTL — `transformprocessor` v0.130 cannot perform cross-series arithmetic on a metrics pipeline. Recipe: [docs/integrations/prometheus-scrape.md §Pattern #10](../integrations/prometheus-scrape.md#pattern-10--cuda-oom-framebuffer); bridge log-shape spec: [§Pattern #10 — `hw.gpu.memory.{free,total}`](../integrations/prometheus-scrape.md#pattern-10--hwgpumemoryfreetotal-issue-337) (issue [#337](https://github.com/tracecoreai/tracecore/issues/337)). - (optional) `torch.cuda.memory_summary()` dump from a faulthandler / SIGUSR2 hook — far richer fragmentation detail; out of v1 scope. @@ -72,6 +72,6 @@ Per issue #303 scalar-promotion checklist: ## Open questions 1. **`DCGM_FI_DEV_FB_*` OTTL recipe extension.** Resolved by issue [#337](https://github.com/tracecoreai/tracecore/issues/337): metric-side projection (`DCGM_FI_DEV_FB_USED` → `hw.gpu.memory.used`, `DCGM_FI_DEV_FB_FREE` → `hw.gpu.memory.free`) ships in [docs/integrations/prometheus-scrape.md §Pattern #10](../integrations/prometheus-scrape.md#pattern-10--cuda-oom-framebuffer). 
The `hw.gpu.memory.total = used + free` derivation + log-record emission belongs to the RFC-0014 PR-B `WithMetrics` bridge; the log-shape spec the bridge MUST honor is pinned in the recipe's [§Pattern #10 — `hw.gpu.memory.{free,total}`](../integrations/prometheus-scrape.md#pattern-10--hwgpumemoryfreetotal-issue-337) section. -2. **filelogreceiver OTTL stanza for the OOM regex.** Sibling to #285. Multi-line `RuntimeError` traceback handling: OTTL recipe stops at the first stanza match or continues into the traceback? +2. **filelogreceiver OTTL stanza for the OOM regex.** Resolved by issue [#436](https://github.com/tracecoreai/tracecore/issues/436): the `transform/cuda_oom` stanza ships in [docs/integrations/filelog-container.md §`cuda_oom.*` attribute stanza (pattern #10)](../integrations/filelog-container.md#cuda_oom-attribute-stanza-pattern-10). The recipe stops at the per-unit `where IsMatch(...) ... Tried to allocate \d+\.\d{2} ` guard — multi-line traceback lines (`File "train.py", line 42, in ...`) do not match the OOM-summary regex and pass through untransformed, so the detector receives one `cuda_oom.tried_alloc_bytes` stamp per OOM event regardless of traceback depth. 3. **Metrics-path on patterndetectorprocessor.** Per ADR-0001 PR-B — the processor today consumes logs only. CUDA-OOM joins a log to a metric. Either the metric is projected to a log via the metrics→logs OTTL bridge (RFC-0014 PR-B, also blocking pattern #3 today), or the processor grows a metrics input. 4. **`cuda_oom.kind` enum namespace.** Should this be `pattern.cuda_oom.kind` or top-level `cuda_oom.kind`? ATTRIBUTES.md prefers `pattern.*` for tracecore-internal verdict scalars, but issue #303 used `cuda_oom.kind` directly. Reconcile. 
diff --git a/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go b/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go new file mode 100644 index 00000000..c5b86a2a --- /dev/null +++ b/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: Apache-2.0 + +package patterndetectorprocessor + +import ( + "context" + "os" + "regexp" + "testing" + "time" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/plog" + + "github.com/tracecoreai/tracecore/module/pkg/patterns" +) + +// TestRecipe_CUDAOOM_StanzaPinsWireContract pins the +// docs/integrations/examples/filelog-container.yaml CUDA-OOM OTTL +// stanza against the live CUDAOOMDetector wire contract (issue #436). +// +// What this gate protects: +// 1. Renaming the customer-stable `cuda_oom.tried_alloc_bytes` / +// `cuda_oom.gpu_index` / `gpu.id` attributes in the recipe +// without updating it here would silently disconnect filelog +// stderr from the detector. +// 2. Dropping any of the four unit-prefix branches (KiB / MiB / +// GiB / TiB) would fail the projector at `projectCUDAOOMLogRecord` +// on real-world PyTorch lines with no downstream signal — the +// detector stays configured-but-quiet for that prefix. +// 3. Wiring the `transform/cuda_oom` processor out of the +// `logs/container` pipeline would compile cleanly but never +// stamp the attributes. +// +// The recipe-side compile gate (scripts/validator-recipe.sh) catches +// OTTL syntax breakage; this test catches semantic drift between the +// recipe's stamps and the projector's reads. 
+func TestRecipe_CUDAOOM_StanzaPinsWireContract(t *testing.T) { + t.Parallel() + + recipePath := findRepoFile(t, "docs/integrations/examples/filelog-container.yaml") + raw, err := os.ReadFile(recipePath) + require.NoError(t, err, "reading recipe example yaml") + recipe := string(raw) + + // Each token is load-bearing for pattern #10's wire contract. + // Drift here without a matching detector / docs / wiring-test edit + // is a defect. Strings are matched as-is against the YAML body — + // they are the exact identifiers the detector reads. + // + // NOT pinned here: `"gpu.id"`. The `gpu.id` PCI BDF (per RFC-0013 + // §3) is NOT stamped by this transform — it arrives via either + // (a) k8sattributesprocessor + the `nvidia.com/gpu` device-plugin + // resource (pod allocation → PCI BDF), or (b) a sibling DCGM + // BDF-lookup transform indexed by `cuda_oom.gpu_index`. The + // projector's resource-attr fallback (cuda_oom.go) reads `gpu.id` + // off the log resource. Pinning `"gpu.id"` here would falsely + // require the transform to stamp it, which it must not — the + // CUDA-runtime ordinal `cuda_oom.gpu_index` is NOT a PCI BDF. + pinned := []string{ + // Two customer-stable attributes the recipe IS responsible + // for: bytes scalar + GPU index. Pattern #10's detector reads + // `cuda_oom.tried_alloc_bytes`; `cuda_oom.gpu_index` is + // operator-facing context the verdict's evidence trail uses. + `"cuda_oom.tried_alloc_bytes"`, + `"cuda_oom.gpu_index"`, + // The four unit-prefix branches PyTorch's format_size emits. + // Capturing %.2f shape means a literal ".\\d{2}" in the regex. + `KiB`, + `MiB`, + `GiB`, + `TiB`, + // The renamed processor identifier — operators grep for this + // to confirm the stanza is in their pipeline. Mirrors the + // `transform/dataloader_errors` shape that #406 established. 
+ `transform/cuda_oom`, + } + for _, p := range pinned { + require.Contains(t, recipe, p, + "recipe example yaml missing pattern-#10 wire-contract token %q; see issue #436", p) + } + + // The processor pipeline must wire transform/cuda_oom into the + // logs/container pipeline. Without this the stanza compiles but + // never executes — the failure mode the recipe-only test cannot + // catch on its own. The k8sattributes stamp must run FIRST so the + // pod identifier reaches the detector alongside cuda_oom.* attrs; + // transform/cuda_oom is order-insensitive vs. dataloader_errors + // (they gate on disjoint body substrings) but MUST run after + // k8sattributes for the same pod-identity reason pattern #7 + // documents. + require.Contains(t, recipe, "transform/cuda_oom, batch", + "recipe example yaml does not wire transform/cuda_oom into logs/container pipeline") +} + +// TestRecipe_CUDAOOM_RoundTripFiresVerdict pins the round-trip: log +// records carrying the exact attribute shape the recipe's OTTL stamps +// onto a stderr line (`cuda_oom.tried_alloc_bytes` int, `gpu.id` str) +// flow through the CUDAOOMDetector and emit a verdict (issue #436). +// +// This is the simulated end-to-end gate: the OTTL stanza projects the +// filelog stderr body, k8sattributes stamps the pod identity, and the +// detector consumes the resulting log record alongside a same-GPU +// FB-memory record. The contract from body-regex → attribute-stamp → +// projector-read is fully pinned by this test against the recipe's +// exact identifiers. +// +// Sibling to TestPatternDetector_CUDAOOMWiringEmitsFragmentationVerdict +// (which uses helper-builder attributes); this test deliberately +// constructs attributes by name so a rename in either the recipe or +// the projector breaks one of the two tests cleanly. 
+func TestRecipe_CUDAOOM_RoundTripFiresVerdict(t *testing.T) { + t.Parallel() + + fbAt := mustParseTime(t, "2026-06-01T10:00:00Z") + oomAt := fbAt.Add(30 * time.Second) + + ld := plog.NewLogs() + + // FB memory log record (metrics→logs OTTL recipe output; sibling + // to #273 / #337). Carries the same `gpu.id` PCI BDF the recipe's + // CUDA-OOM stanza stamps. + fbRL := ld.ResourceLogs().AppendEmpty() + fbRL.Resource().Attributes().PutStr("k8s.node.name", "gpu-node-0042") + fbSL := fbRL.ScopeLogs().AppendEmpty() + fbLR := fbSL.LogRecords().AppendEmpty() + fbLR.SetTimestamp(pcommon.NewTimestampFromTime(fbAt)) + fa := fbLR.Attributes() + fa.PutStr("gpu.id", "PCI:0000:3b:00") + fa.PutInt("hw.gpu.memory.free", 16*1024*1024*1024) // 16 GiB + fa.PutInt("hw.gpu.memory.total", 80*1024*1024*1024) // 80 GiB + + // CUDA OOM log record — EXACT attribute shape the recipe's OTTL + // stamps off the canonical PyTorch stderr body. The body is the + // real PyTorch error string; the recipe extracts the bytes scalar + // + GPU index from it. `gpu.id` is the operator-configurable PCI + // BDF (per RFC-0013 §3) the recipe stamps via a sibling stanza + // (k8sattributes or a DCGM BDF-lookup transform). + oomRL := ld.ResourceLogs().AppendEmpty() + oomRA := oomRL.Resource().Attributes() + oomRA.PutStr("k8s.node.name", "gpu-node-0042") + oomRA.PutStr("k8s.pod.name", "trainer-rank-7") + oomRA.PutStr("k8s.namespace.name", "training") + oomSL := oomRL.ScopeLogs().AppendEmpty() + oomLR := oomSL.LogRecords().AppendEmpty() + oomLR.SetTimestamp(pcommon.NewTimestampFromTime(oomAt)) + oomLR.Body().SetStr("RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB. 
GPU 0 has a total capacity of 79.18 GiB of which 16.00 GiB is free.") + oa := oomLR.Attributes() + oa.PutStr("gpu.id", "PCI:0000:3b:00") + oa.PutInt("cuda_oom.tried_alloc_bytes", 2*1024*1024*1024) + oa.PutInt("cuda_oom.gpu_index", 0) + + sink := newLogsSink() + p := newProcessor(testSettings(), defaultConfig(), sink) + require.NoError(t, p.Start(context.Background(), componenttestHost{})) + t.Cleanup(func() { _ = p.Shutdown(context.Background()) }) + require.NoError(t, p.ConsumeLogs(context.Background(), ld)) + + verdicts := extractCUDAOOMVerdicts(t, sink.at(0)) + require.Len(t, verdicts, 1, + "recipe-shaped CUDA OOM log record did not produce a verdict — "+ + "either the recipe's attribute names drifted from the projector "+ + "(module/processor/patterndetectorprocessor/cuda_oom.go) or the "+ + "detector's gating contract moved without a test update") + v := verdicts[0] + require.Equal(t, patterns.PatternIDCUDAOOM, v.PatternID) + require.Equal(t, "PCI:0000:3b:00", v.GPUID) + require.Equal(t, "gpu-node-0042", v.Node) + require.Equal(t, "trainer-rank-7", v.PodName) + require.Equal(t, "training", v.PodNamespace) + require.Equal(t, patterns.CUDAOOMKindFragmentation, v.Kind) + require.Equal(t, patterns.ConfidenceFull, v.Confidence) + require.Equal(t, int64(2*1024*1024*1024), v.TriedAllocBytes) +} + +// TestRecipe_CUDAOOM_RegexCoversCanonicalPyTorchMessages exercises the +// recipe's OTTL regex against the actual PyTorch error vocabulary the +// recipe must cover. This is the "regex tested against >=3 canonical +// messages" gate from #436 A-tier acceptance. +// +// The Go-side regex below is a literal copy of the OTTL regex in the +// recipe YAML; if the recipe changes its capture groups or unit +// prefixes the parity below breaks. 
The unit-normalization arithmetic +// mirrors what the OTTL `Int(whole)*UNIT + Int(frac)*UNIT/100` +// expression computes (rounded-down floor; the precision loss is +// 0.01 of the unit base, well under the detector's 5% fragmentation +// threshold). +// +// Negative case: a non-OOM stderr line MUST NOT match — the projector +// gate is the `cuda_oom.tried_alloc_bytes` attribute presence, and a +// false stamp on every error line would flood the detector with +// noise. The recipe's `where IsMatch(...)` guard MUST be tight. +func TestRecipe_CUDAOOM_RegexCoversCanonicalPyTorchMessages(t *testing.T) { + t.Parallel() + + // Literal mirror of the OTTL regex set the recipe ships. Each + // unit-prefix branch captures `whole` + 2-digit `frac` groups + // (matches PyTorch's `format_size` %.2f shape). + bodyMatch := regexp.MustCompile(`CUDA out of memory\. Tried to allocate (\d+)\.(\d{2}) (KiB|MiB|GiB|TiB)`) + gpuIndexMatch := regexp.MustCompile(`GPU (\d+) has a total capacity`) + + unitToBase := map[string]int64{ + "KiB": 1024, + "MiB": 1024 * 1024, + "GiB": 1024 * 1024 * 1024, + "TiB": 1024 * 1024 * 1024 * 1024, + } + + type tc struct { + name string + body string + wantBytes int64 + wantGPU int64 + // allow a small tolerance on the unit-normalization floor — + // matches the precision loss the OTTL `frac * UNIT/100` + // expression incurs (max 1 byte for KiB, max 24 bytes for + // GiB at 99 frac-units). + tolBytes int64 + match bool + } + cases := []tc{ + { + name: "GiB-canonical-RFC0013-pattern10-example", + body: "RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 79.18 GiB of which 16.00 GiB is free.", + wantBytes: 2 * 1024 * 1024 * 1024, + wantGPU: 0, + tolBytes: 0, + match: true, + }, + { + name: "MiB-small-alloc", + body: "RuntimeError: CUDA out of memory. Tried to allocate 256.00 MiB. 
GPU 3 has a total capacity of 79.18 GiB.", + wantBytes: 256 * 1024 * 1024, + wantGPU: 3, + tolBytes: 0, + match: true, + }, + { + name: "GiB-fractional", + body: "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB. GPU 7 has a total capacity of 39.59 GiB.", + wantBytes: 1*1024*1024*1024 + 50*(1024*1024*1024/100), // 0.50 GiB = 50 frac-units + wantGPU: 7, + tolBytes: 100, // accommodate floor-rounding within the OTTL Int/100 path + match: true, + }, + { + name: "KiB-tiny-alloc", + body: "RuntimeError: CUDA out of memory. Tried to allocate 768.00 KiB. GPU 0 has a total capacity of 79.18 GiB.", + wantBytes: 768 * 1024, + wantGPU: 0, + tolBytes: 0, + match: true, + }, + { + name: "TiB-large-alloc", + body: "RuntimeError: CUDA out of memory. Tried to allocate 1.00 TiB. GPU 0 has a total capacity of 79.18 GiB.", + wantBytes: 1 * 1024 * 1024 * 1024 * 1024, + wantGPU: 0, + tolBytes: 0, + match: true, + }, + // Negative cases — these MUST NOT match the OOM regex. A false + // positive here would stamp the cuda_oom.* attributes on every + // matching line and flood the detector. 
+ { + name: "negative-dataloader-worker-killed", + body: "RuntimeError: DataLoader worker (pid 1234) is killed by signal: SIGKILL.", + match: false, + }, + { + name: "negative-nccl-timeout", + body: "RuntimeError: [Rank 0]: Watchdog caught collective operation timeout", + match: false, + }, + { + name: "negative-cuda-error-no-alloc-size", + body: "RuntimeError: CUDA error: an illegal memory access was encountered", + match: false, + }, + } + matched := 0 + for _, c := range cases { + c := c + t.Run(c.name, func(t *testing.T) { + bm := bodyMatch.FindStringSubmatch(c.body) + if !c.match { + require.Nil(t, bm, "negative case unexpectedly matched OOM regex: %q", c.body) + return + } + require.NotNil(t, bm, "positive case did not match OOM regex: %q", c.body) + matched++ + whole := atoi(t, bm[1]) + frac := atoi(t, bm[2]) + unit := bm[3] + base, ok := unitToBase[unit] + require.True(t, ok, "unknown unit prefix %q", unit) + // Mirror the OTTL arithmetic: bytes = whole*UNIT + frac*UNIT/100 + gotBytes := whole*base + frac*(base/100) + require.InDelta(t, c.wantBytes, gotBytes, float64(c.tolBytes), + "unit-normalized bytes drift for %s: want %d, got %d", c.name, c.wantBytes, gotBytes) + + gm := gpuIndexMatch.FindStringSubmatch(c.body) + require.NotNil(t, gm, "positive case did not match GPU-index regex: %q", c.body) + require.Equal(t, c.wantGPU, atoi(t, gm[1]), "gpu_index drift for %s", c.name) + }) + } + // Acceptance criterion: at least 3 canonical positive messages + // covered (KiB, MiB, GiB, TiB — 4 distinct unit prefixes). + require.GreaterOrEqual(t, matched, 3, + "regex must cover >=3 canonical PyTorch OOM messages per #436 A-tier acceptance") +} + +// atoi is a tiny test helper for parsing int64 capture groups; pulling +// in strconv at the test scope is overkill for the 4-call site count. 
+func atoi(t *testing.T, s string) int64 { + t.Helper() + var n int64 + for _, c := range []byte(s) { + require.True(t, c >= '0' && c <= '9', "non-digit in numeric capture %q", s) + n = n*10 + int64(c-'0') + } + return n +}