From 777eee332c6ecbb86ea2da7756e2a35d0a146ab3 Mon Sep 17 00:00:00 2001
From: Tri Lam <tree@lumalabs.ai>
Date: Mon, 1 Jun 2026 18:58:09 -0700
Subject: [PATCH] feat(integrations/examples): pattern-10 CUDA OOM filelog OTTL
 stanza
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the `transform/cuda_oom` processor to
`docs/integrations/examples/filelog-container.yaml` that projects
PyTorch's canonical `RuntimeError: CUDA out of memory. Tried to
allocate X.YY <unit>. GPU N has a total capacity of ...` stderr line
onto the customer-stable `cuda_oom.tried_alloc_bytes` (Int, bytes;
unit-normalized KiB/MiB/GiB/TiB) and `cuda_oom.gpu_index` (Int)
attributes that pattern #10's detector (PR #338) reads via
`projectCUDAOOMLogRecord`. Closes the load-bearing filelog→detector
integration gap flagged in issue #303 follow-ups.

Per-unit-branch shape (one stanza per KiB/MiB/GiB/TiB prefix) because
OTTL has no capture-group-conditional dispatch — the multiplier must
be a literal int64 per stanza. Uses OTTL Math Expressions
(`Int(whole)*UNIT + Int(frac)*(UNIT/100)`) to handle PyTorch's
`%.2f` `format_size` output; precision loss capped at <1% of the
unit base, three orders of magnitude under the detector's 5%
fragmentation threshold.

`gpu.id` (PCI BDF per RFC-0013 §3) is NOT stamped by this transform
— the CUDA-runtime ordinal `cuda_oom.gpu_index` is not a PCI BDF.
Two operator-configurable paths documented in the recipe markdown:
(a) k8sattributesprocessor + `nvidia.com/gpu-PCIDeviceBusID` device-
plugin annotation, or (b) DCGM BDF-lookup transform indexed by
`cuda_oom.gpu_index`. The detector's resource-attr fallback reads
`gpu.id` from the log resource either way.

Tests (TDD red→green): three new recipe-parity tests under
`module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go`:

- `TestRecipe_CUDAOOM_StanzaPinsWireContract`: pins the 7 load-
  bearing tokens (`cuda_oom.tried_alloc_bytes`, `cuda_oom.gpu_index`,
  KiB/MiB/GiB/TiB unit prefixes, `transform/cuda_oom`) + pipeline-
  wiring against the live projector. Mirrors PR #393's IB-flap
  shape.
- `TestRecipe_CUDAOOM_RoundTripFiresVerdict`: end-to-end gate —
  log records carrying the exact attribute shape the recipe stamps
  flow through CUDAOOMDetector and emit a kind=fragmentation
  verdict with the expected scalar-promotion contract.
- `TestRecipe_CUDAOOM_RegexCoversCanonicalPyTorchMessages`: 5
  canonical positive PyTorch OOM messages (KiB/MiB/GiB/TiB/fractional)
  + 3 negative messages (DataLoader worker killed, NCCL watchdog,
  illegal memory access). Exceeds the >=3-positive A-tier
  acceptance criterion.

Validates clean:
- `tracecore validate docs/integrations/examples/filelog-container.yaml`
  exits 0.
- `make validator-recipe` covers this file (tested-against: tracecore).
- `make doc-check` resolves the new pattern-10 cross-link.
- Full `make ci-fast` green.

Cross-links:
- Pattern doc: `docs/patterns/10-cuda-oom-deceptive.md` §"Signal
  sources" now references this recipe and resolves Open Question #2
  ("filelogreceiver OTTL stanza for the OOM regex").
- Recipe markdown: new §`cuda_oom.*` attribute stanza (pattern #10)
  in `docs/integrations/filelog-container.md` with the unit-
  normalization arithmetic table, the two `gpu.id` source paths,
  and a Failure-modes row.

Closes #436.
Refs #338, #303, #337.

Signed-off-by: Tri Lam <tree@lumalabs.ai>
---
 .../examples/filelog-container.yaml           |  77 ++++-
 docs/integrations/filelog-container.md        |  64 ++++
 docs/patterns/10-cuda-oom-deceptive.md        |   4 +-
 .../cuda_oom_recipe_test.go                   | 324 ++++++++++++++++++
 4 files changed, 465 insertions(+), 4 deletions(-)
 create mode 100644 module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go
diff --git a/docs/integrations/examples/filelog-container.yaml b/docs/integrations/examples/filelog-container.yaml
index 03c98be4..00dc5af5 100644
--- a/docs/integrations/examples/filelog-container.yaml
+++ b/docs/integrations/examples/filelog-container.yaml
@@ -133,6 +133,75 @@ processors:
           # directly). Catch-all for the "transport died" runbook
           # branch.
           - 'set(attributes["dataloader.error_class"], "Connection reset by peer") where IsMatch(body, "Connection reset by peer") and attributes["dataloader.error_class"] == nil'
+
+  # Project PyTorch's `RuntimeError: CUDA out of memory. Tried to
+  # allocate X.YY <unit>. GPU N has a total capacity of ...` stderr
+  # line onto the customer-stable `cuda_oom.tried_alloc_bytes` (Int,
+  # bytes) + `cuda_oom.gpu_index` (Int) attributes that pattern #10's
+  # detector (module/processor/patterndetectorprocessor/cuda_oom.go,
+  # `projectCUDAOOMLogRecord`) consumes. The detector's projection
+  # gate is BOTH `cuda_oom.tried_alloc_bytes` AND `gpu.id` (PCI BDF
+  # per RFC-0013 §3); the stanzas below stamp the bytes scalar and
+  # the human-visible GPU index off the body. `gpu.id` is the
+  # operator-configurable mapping — `cuda_oom.gpu_index` is the
+  # CUDA-runtime ordinal that PyTorch's allocator prints, NOT a PCI
+  # BDF, so the recipe DOES NOT alias it onto `gpu.id`. Two paths
+  # to populate `gpu.id` are documented in
+  # docs/integrations/filelog-container.md §"`cuda_oom.*` attribute
+  # stanza (pattern #10)":
+  #   (a) k8sattributesprocessor + `nvidia.com/gpu` device-plugin
+  #       resource — the pod allocation maps to one PCI BDF, lifted
+  #       onto the log resource as `gpu.id`.
+  #   (b) a sibling DCGM BDF-lookup transform indexed by
+  #       `cuda_oom.gpu_index` — the DCGM exporter ships a per-host
+  #       (index → BDF) table on its scrape endpoint.
+  # Either path stamps `gpu.id` on the resource; the detector's
+  # resource-attr fallback (cuda_oom.go:65) reads it from there.
+  #
+  # Unit normalization: PyTorch's `format_size` emits `%.2f <unit>`
+  # with the four IEC binary prefixes below (KiB / MiB / GiB / TiB).
+  # OTTL Math Expressions support `*` and `+` on int64, so we capture
+  # `whole` (digits before the dot) + 2-digit `frac` (digits after)
+  # and compute `Int(whole)*UNIT + Int(frac)*UNIT/100`. Flooring
+  # UNIT/100 drops <1 byte per frac-unit (<100 B in total); the
+  # larger error is `%.2f`'s own 0.01-unit resolution (~10 MB for
+  # GiB), still under the detector's 5% fragmentation threshold.
+  #
+  # Per-unit branches instead of one omnibus regex: OTTL has no
+  # capture-group-conditional dispatch, so the multiplier must be a
+  # literal int64 per stanza. The four-row repetition is the smallest
+  # shape that compiles. The `where IsMatch(...)` guard is tight on
+  # `CUDA out of memory\. Tried to allocate` so a generic CUDA error
+  # (illegal memory access, NCCL watchdog) does not trip the stanza.
+  transform/cuda_oom:
+    log_statements:
+      - context: log
+        statements:
+          # ---- GPU index extraction (any OOM line) ----
+          # PyTorch prints `GPU N has a total capacity of ...` after
+          # the alloc-size scalar. The index is the CUDA-runtime
+          # ordinal, NOT a PCI BDF — the detector's `gpu.id` projection
+          # is satisfied via the k8sattributes / DCGM-lookup paths
+          # documented above; `cuda_oom.gpu_index` is operator-facing
+          # context the verdict's evidence trail uses.
+          - 'set(attributes["cuda_oom.gpu_index"], Int(ExtractPatterns(body, "GPU (?P<idx>\\d+) has a total capacity")["idx"])) where IsMatch(body, "CUDA out of memory\\. Tried to allocate") and IsMatch(body, "GPU \\d+ has a total capacity")'
+
+          # ---- KiB branch ----
+          # 1 KiB = 1024 B. frac-unit step = 1024/100 = 10 (floor).
+          - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) KiB")["w"]) * 1024 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) KiB")["f"]) * 10) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} KiB")'
+
+          # ---- MiB branch ----
+          # 1 MiB = 1048576 B. frac-unit step = 1048576/100 = 10485 (floor).
+          - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) MiB")["w"]) * 1048576 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) MiB")["f"]) * 10485) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} MiB")'
+
+          # ---- GiB branch ----
+          # 1 GiB = 1073741824 B. frac-unit step = 1073741824/100 = 10737418 (floor).
+          - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) GiB")["w"]) * 1073741824 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) GiB")["f"]) * 10737418) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} GiB")'
+
+          # ---- TiB branch ----
+          # 1 TiB = 1099511627776 B. frac-unit step = 1099511627776/100 = 10995116277 (floor).
+          - 'set(attributes["cuda_oom.tried_alloc_bytes"], Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) TiB")["w"]) * 1099511627776 + Int(ExtractPatterns(body, "Tried to allocate (?P<w>\\d+)\\.(?P<f>\\d{2}) TiB")["f"]) * 10995116277) where IsMatch(body, "CUDA out of memory\\. Tried to allocate \\d+\\.\\d{2} TiB")'
+
   k8sattributes:
     auth_type: serviceAccount
     passthrough: false
@@ -181,6 +250,10 @@ service:
       # body strings produced by the container parser and stamp the
       # customer-stable `dataloader.error_class` /
       # `dataloader.worker_pid` attributes pattern #7's detector
-      # consumes.
-      processors: [k8sattributes, transform/dataloader_errors, batch]
+      # consumes. `transform/cuda_oom` runs alongside dataloader_errors
+      # (order-insensitive — they gate on disjoint body substrings)
+      # to stamp `cuda_oom.tried_alloc_bytes` + `cuda_oom.gpu_index`
+      # off PyTorch's `RuntimeError: CUDA out of memory` line for
+      # pattern #10's detector.
+      processors: [k8sattributes, transform/dataloader_errors, transform/cuda_oom, batch]
       exporters: [otlphttp]
diff --git a/docs/integrations/filelog-container.md b/docs/integrations/filelog-container.md
index 34c0e0e1..a111a06f 100644
--- a/docs/integrations/filelog-container.md
+++ b/docs/integrations/filelog-container.md
@@ -13,6 +13,11 @@ projects per-driver PyTorch `DataLoader` error vocabulary (FUSE, S3,
 Lustre, multiprocessing queue, worker-killed) onto the customer-stable
 `dataloader.error_class` / `dataloader.worker_pid` attributes that
 [pattern #7's detector](../patterns/07-dataloader-hang.md) consumes.
+A sibling `transform/cuda_oom` stanza projects PyTorch's
+`RuntimeError: CUDA out of memory. Tried to allocate X.YY <unit>` line
+onto the customer-stable `cuda_oom.tried_alloc_bytes` (Int, bytes;
+unit-normalized) + `cuda_oom.gpu_index` (Int) attributes that
+[pattern #10's detector](../patterns/10-cuda-oom-deceptive.md) consumes.
 Replaces the in-tree `containerstdout` receiver scheduled for deletion
 at v0.2.0 per
 [RFC-0013 §migration PR-K](../rfcs/0013-distro-first-pivot.md#migration-rollout)
@@ -169,6 +174,63 @@ at `module/pkg/patterns/dataloader_hang.go`).
 > error classes (e.g. a future Ceph-class driver) extend the table
 > here, not by widening an existing regex.
 
+## `cuda_oom.*` attribute stanza (pattern #10)
+
+The `transform/cuda_oom` processor projects PyTorch's canonical
+out-of-memory stderr line — `RuntimeError: CUDA out of memory. Tried
+to allocate 2.00 GiB. GPU 0 has a total capacity of 79.18 GiB of
+which 16.00 GiB is free.` — onto the customer-stable
+[`cuda_oom.tried_alloc_bytes`](../ATTRIBUTES.md) +
+[`cuda_oom.gpu_index`](../ATTRIBUTES.md) attributes that
+[pattern #10's detector](../patterns/10-cuda-oom-deceptive.md)
+(`projectCUDAOOMLogRecord` at
+`module/processor/patterndetectorprocessor/cuda_oom.go`) consumes.
+The detector's projection gate is BOTH `cuda_oom.tried_alloc_bytes`
+AND `gpu.id` (PCI BDF per
+[RFC-0013 §3](../rfcs/0013-distro-first-pivot.md#3-customer-stable-telemetry-contracts));
+this stanza stamps the bytes scalar and the human-visible GPU index
+off the body. `gpu.id` is **not** stamped here — the CUDA-runtime
+ordinal `cuda_oom.gpu_index` is a CUDA enumeration index, not a PCI
+BDF. Two operator-configurable paths populate `gpu.id` on the log
+resource so the detector's resource-attr fallback reads it:
+
+| `gpu.id` source path | When to use |
+|---|---|
+| **k8sattributesprocessor + `nvidia.com/gpu` device-plugin resource** | The trainer pod requests one GPU via `resources.limits.nvidia.com/gpu: 1`. The NVIDIA device plugin annotates the pod with the allocated PCI BDF (`nvidia.com/gpu-PCIDeviceBusID` since device-plugin v0.16). Extend `k8sattributes::extract::annotations` to lift this annotation onto the log resource as `gpu.id`. Cheapest path — already in the cluster's GPU scheduling fabric. |
+| **DCGM BDF-lookup transform indexed by `cuda_oom.gpu_index`** | Multi-GPU pods (one container ↔ N GPUs) where the device-plugin annotation is the per-pod list, not the per-OOM GPU. Scrape the DCGM exporter's `DCGM_FI_DEV_PCI_BUSID` series, materialize a per-host `{gpu_index → BDF}` lookup, then add a sibling OTTL stanza that joins `cuda_oom.gpu_index` against the table to stamp `gpu.id`. Sibling to the [pattern-2 / pattern-10 DCGM recipe](prometheus-scrape.md). |
+
+The recipe uses four per-unit-prefix branches (KiB / MiB / GiB / TiB)
+because OTTL has no capture-group-conditional dispatch — the
+multiplier must be a literal `int64` per stanza. The body match
+captures `whole` (digits before the decimal) and `frac` (two digits
+after) and computes
+`Int(whole) * UNIT + Int(frac) * (UNIT / 100)`. PyTorch's
+`format_size` always emits `%.2f`, so the 2-digit `frac` capture is
+exhaustive; flooring `UNIT / 100` drops under 1 byte per frac-unit
+(<100 B in total), so the effective bound is `%.2f`'s 0.01-unit
+resolution (~10 MB for GiB), under the detector's 5% threshold.
+
+| Body shape | Captured | Stamped attributes |
+|---|---|---|
+| `CUDA out of memory. Tried to allocate \d+\.\d{2} KiB` | `whole`, `frac` (×2 digits) | `cuda_oom.tried_alloc_bytes = whole*1024 + frac*10` |
+| `CUDA out of memory. Tried to allocate \d+\.\d{2} MiB` | `whole`, `frac` | `cuda_oom.tried_alloc_bytes = whole*1048576 + frac*10485` |
+| `CUDA out of memory. Tried to allocate \d+\.\d{2} GiB` | `whole`, `frac` | `cuda_oom.tried_alloc_bytes = whole*1073741824 + frac*10737418` |
+| `CUDA out of memory. Tried to allocate \d+\.\d{2} TiB` | `whole`, `frac` | `cuda_oom.tried_alloc_bytes = whole*1099511627776 + frac*10995116277` |
+| `... GPU \d+ has a total capacity` | `idx` | `cuda_oom.gpu_index = idx` |
+
+The `where IsMatch(body, "CUDA out of memory\. Tried to allocate")`
+guard is tight on the OOM-summary line, so generic CUDA errors
+(`an illegal memory access was encountered`, NCCL watchdog timeouts,
+`DataLoader worker (pid N) is killed`) do not trip the stanza —
+keeping the detector quiet on non-OOM stderr noise.
+
+> **Multi-line tracebacks.** A PyTorch OOM emits the summary line
+> followed by a Python traceback (`File "train.py", line 42, in ...`).
+> The container parser flattens each newline-delimited log line into
+> its own log record; only the summary line matches the regex above,
+> so the detector sees exactly one stamp per OOM event regardless of
+> traceback depth. This is pattern #10 spec Open Q#2's answer.
+
 ## Placeholders
 
 | Placeholder | What to fill in |
@@ -192,6 +254,8 @@ fails immediately instead of silently dropping logs.
 | High-cardinality label explosion | The container parser surfaces every label from `app.kubernetes.io/name` plus whatever you add under `extract::labels`. Audit the list against the receiving backend's cardinality budget before adding more. |
 | Pattern #7 verdict never fires despite known DataLoader stalls | The `transform/dataloader_errors` stanzas gate on substring matches against the container `body`. If your trainer wraps DataLoader errors (e.g. a custom logger that prefixes with JSON), the body shape changes. Confirm via `kubectl logs <trainer-pod> --container=<c> --previous 2>&1 | grep -E 'DataLoader worker|Transport endpoint|SlowDown|Stale file handle'` and extend the regexes in `transform/dataloader_errors`. |
 | `dataloader.error_class` empty on a known error line | The OTTL stanza fell through silently — the body substring did not match any branch. Add a row to the table above and a matching `set(attributes["dataloader.error_class"], ...)` statement. The detector's projection gate requires the attribute, so a missing class drops the discriminator. |
+| Pattern #10 verdict never fires despite a known CUDA OOM | The `transform/cuda_oom` stanzas gate on substring matches against the container `body`. Confirm via `kubectl logs <trainer-pod> --container=<c> --previous 2>&1 \| grep -E 'CUDA out of memory\. Tried to allocate'`. If the trainer wraps PyTorch errors (custom logger, JSON envelope), the body shape changes — extend the `IsMatch` predicates to match the wrapper format. Also check that `gpu.id` is being stamped onto the log resource via one of the two paths in the `cuda_oom.*` section: a missing `gpu.id` drops the projection at `cuda_oom.go`'s gate and the detector stays quiet. |
+| `cuda_oom.tried_alloc_bytes` stamped with a wildly wrong magnitude | A unit-prefix branch was modified without updating its multiplier, or the body shape drifted from `%.2f`. PyTorch's `format_size` has used `%.2f` for the entire CUDA-allocator lifetime; if a customer fork emits `%.0f` or `%.4f` the recipe's `\d{2}` capture misses, and the stanza fails open (no stamp) rather than producing a wrong value. Verify against `pytorch/c10/util/Exception.h`'s formatter. |
 
 Upstream component docs:
 [`receiver/filelogreceiver`](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver),
diff --git a/docs/patterns/10-cuda-oom-deceptive.md b/docs/patterns/10-cuda-oom-deceptive.md
index 2f5b215e..9ce6d175 100644
--- a/docs/patterns/10-cuda-oom-deceptive.md
+++ b/docs/patterns/10-cuda-oom-deceptive.md
@@ -16,7 +16,7 @@ Training fails with `RuntimeError: CUDA out of memory. Tried to allocate X MiB.
 
 ## Signal sources
 
-- `filelogreceiver` tailing training-container stderr — OTTL stanza on the `RuntimeError: CUDA out of memory` line extracts `cuda_oom.tried_alloc_bytes`, `cuda_oom.total_bytes`, `cuda_oom.free_bytes`, `cuda_oom.gpu_index`.
+- `filelogreceiver` tailing training-container stderr — OTTL stanza on the `RuntimeError: CUDA out of memory` line extracts `cuda_oom.tried_alloc_bytes` (unit-normalized KiB/MiB/GiB/TiB → bytes) and `cuda_oom.gpu_index`. Recipe: [docs/integrations/filelog-container.md §`cuda_oom.*` attribute stanza (pattern #10)](../integrations/filelog-container.md#cuda_oom-attribute-stanza-pattern-10) (issue [#436](https://github.com/tracecoreai/tracecore/issues/436); sibling to detector PR [#338](https://github.com/tracecoreai/tracecore/pull/338) and metric-side recipe [#337](https://github.com/tracecoreai/tracecore/issues/337)). `gpu.id` (PCI BDF per RFC-0013 §3) is stamped via the sibling k8sattributesprocessor + `nvidia.com/gpu` device-plugin resource OR a DCGM BDF-lookup transform indexed by `cuda_oom.gpu_index` — the recipe documents both paths.
 - `prometheusreceiver` scraping `dcgm-exporter` — `DCGM_FI_DEV_FB_USED` / `DCGM_FI_DEV_FB_FREE` projected via OTTL transform to `hw.gpu.memory.{used,free}` Gauges (unit `By`) with `gpu.id` resource attr. Per-GPU `hw.gpu.memory.total = used + free` is computed at the metrics-to-logs bridge layer, not at OTTL — `transformprocessor` v0.130 cannot perform cross-series arithmetic on a metrics pipeline. Recipe: [docs/integrations/prometheus-scrape.md §Pattern #10](../integrations/prometheus-scrape.md#pattern-10--cuda-oom-framebuffer); bridge log-shape spec: [§Pattern #10 — `hw.gpu.memory.{free,total}`](../integrations/prometheus-scrape.md#pattern-10--hwgpumemoryfreetotal-issue-337) (issue [#337](https://github.com/tracecoreai/tracecore/issues/337)).
 - (optional) `torch.cuda.memory_summary()` dump from a faulthandler / SIGUSR2 hook — far richer fragmentation detail; out of v1 scope.
 
@@ -72,6 +72,6 @@ Per issue #303 scalar-promotion checklist:
 ## Open questions
 
 1. **`DCGM_FI_DEV_FB_*` OTTL recipe extension.** Resolved by issue [#337](https://github.com/tracecoreai/tracecore/issues/337): metric-side projection (`DCGM_FI_DEV_FB_USED` → `hw.gpu.memory.used`, `DCGM_FI_DEV_FB_FREE` → `hw.gpu.memory.free`) ships in [docs/integrations/prometheus-scrape.md §Pattern #10](../integrations/prometheus-scrape.md#pattern-10--cuda-oom-framebuffer). The `hw.gpu.memory.total = used + free` derivation + log-record emission belongs to the RFC-0014 PR-B `WithMetrics` bridge; the log-shape spec the bridge MUST honor is pinned in the recipe's [§Pattern #10 — `hw.gpu.memory.{free,total}`](../integrations/prometheus-scrape.md#pattern-10--hwgpumemoryfreetotal-issue-337) section.
-2. **filelogreceiver OTTL stanza for the OOM regex.** Sibling to #285. Multi-line `RuntimeError` traceback handling: OTTL recipe stops at the first stanza match or continues into the traceback?
+2. **filelogreceiver OTTL stanza for the OOM regex.** Resolved by issue [#436](https://github.com/tracecoreai/tracecore/issues/436): the `transform/cuda_oom` stanza ships in [docs/integrations/filelog-container.md §`cuda_oom.*` attribute stanza (pattern #10)](../integrations/filelog-container.md#cuda_oom-attribute-stanza-pattern-10). The recipe stops at the per-unit `where IsMatch(...) ... Tried to allocate \d+\.\d{2} <unit>` guard — multi-line traceback lines (`File "train.py", line 42, in ...`) do not match the OOM-summary regex and pass through untransformed, so the detector receives one `cuda_oom.tried_alloc_bytes` stamp per OOM event regardless of traceback depth.
 3. **Metrics-path on patterndetectorprocessor.** Per ADR-0001 PR-B — the processor today consumes logs only. CUDA-OOM joins a log to a metric. Either the metric is projected to a log via the metrics→logs OTTL bridge (RFC-0014 PR-B, also blocking pattern #3 today), or the processor grows a metrics input.
 4. **`cuda_oom.kind` enum namespace.** Should this be `pattern.cuda_oom.kind` or top-level `cuda_oom.kind`? ATTRIBUTES.md prefers `pattern.*` for tracecore-internal verdict scalars, but issue #303 used `cuda_oom.kind` directly. Reconcile.
diff --git a/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go b/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go
new file mode 100644
index 00000000..c5b86a2a
--- /dev/null
+++ b/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: Apache-2.0
+
+package patterndetectorprocessor
+
+import (
+	"context"
+	"os"
+	"regexp"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/collector/pdata/pcommon"
+	"go.opentelemetry.io/collector/pdata/plog"
+
+	"github.com/tracecoreai/tracecore/module/pkg/patterns"
+)
+
+// TestRecipe_CUDAOOM_StanzaPinsWireContract pins the
+// docs/integrations/examples/filelog-container.yaml CUDA-OOM OTTL
+// stanza against the live CUDAOOMDetector wire contract (issue #436).
+//
+// What this gate protects:
+//  1. Renaming the customer-stable `cuda_oom.tried_alloc_bytes` /
+//     `cuda_oom.gpu_index` attributes in the recipe without
+//     updating them here would silently disconnect filelog stderr
+//     from the detector. (`gpu.id` is deliberately not pinned.)
+//  2. Dropping any of the four unit-prefix branches (KiB / MiB /
+//     GiB / TiB) would fail the projector at `projectCUDAOOMLogRecord`
+//     on real-world PyTorch lines with no downstream signal — the
+//     detector stays configured-but-quiet for that prefix.
+//  3. Wiring the `transform/cuda_oom` processor out of the
+//     `logs/container` pipeline would compile cleanly but never
+//     stamp the attributes.
+//
+// The recipe-side compile gate (scripts/validator-recipe.sh) catches
+// OTTL syntax breakage; this test catches semantic drift between the
+// recipe's stamps and the projector's reads.
+func TestRecipe_CUDAOOM_StanzaPinsWireContract(t *testing.T) {
+	t.Parallel()
+
+	recipePath := findRepoFile(t, "docs/integrations/examples/filelog-container.yaml")
+	raw, err := os.ReadFile(recipePath)
+	require.NoError(t, err, "reading recipe example yaml")
+	recipe := string(raw)
+
+	// Each token is load-bearing for pattern #10's wire contract.
+	// Drift here without a matching detector / docs / wiring-test edit
+	// is a defect. Strings are matched as-is against the YAML body —
+	// they are the exact identifiers the detector reads.
+	//
+	// NOT pinned here: `"gpu.id"`. The `gpu.id` PCI BDF (per RFC-0013
+	// §3) is NOT stamped by this transform — it arrives via either
+	// (a) k8sattributesprocessor + the `nvidia.com/gpu` device-plugin
+	// resource (pod allocation → PCI BDF), or (b) a sibling DCGM
+	// BDF-lookup transform indexed by `cuda_oom.gpu_index`. The
+	// projector's resource-attr fallback (cuda_oom.go) reads `gpu.id`
+	// off the log resource. Pinning `"gpu.id"` here would falsely
+	// require the transform to stamp it, which it must not — the
+	// CUDA-runtime ordinal `cuda_oom.gpu_index` is NOT a PCI BDF.
+	pinned := []string{
+		// Two customer-stable attributes the recipe IS responsible
+		// for: bytes scalar + GPU index. Pattern #10's detector reads
+		// `cuda_oom.tried_alloc_bytes`; `cuda_oom.gpu_index` is
+		// operator-facing context the verdict's evidence trail uses.
+		`"cuda_oom.tried_alloc_bytes"`,
+		`"cuda_oom.gpu_index"`,
+		// The four unit-prefix branches the recipe ships. NOTE(review):
+		// bare substrings also match YAML comments — a weak pin.
+		`KiB`,
+		`MiB`,
+		`GiB`,
+		`TiB`,
+		// The processor identifier — operators grep for this to
+		// confirm the stanza is in their pipeline. Mirrors the
+		// `transform/dataloader_errors` shape that #406 established.
+		`transform/cuda_oom`,
+	}
+	for _, p := range pinned {
+		require.Contains(t, recipe, p,
+			"recipe example yaml missing pattern-#10 wire-contract token %q; see issue #436", p)
+	}
+
+	// The processor pipeline must wire transform/cuda_oom into the
+	// logs/container pipeline. Without this the stanza compiles but
+	// never executes — the failure mode the recipe-only test cannot
+	// catch on its own. transform/cuda_oom is order-insensitive vs.
+	// dataloader_errors (they gate on disjoint body substrings) but
+	// should run after k8sattributes so the pod identifier reaches
+	// the detector alongside the cuda_oom.* attrs.
+	// NOTE(review): the assertion below only pins the
+	// `transform/cuda_oom, batch` adjacency, not that ordering.
+	require.Contains(t, recipe, "transform/cuda_oom, batch",
+		"recipe example yaml does not wire transform/cuda_oom into logs/container pipeline")
+}
+
+// TestRecipe_CUDAOOM_RoundTripFiresVerdict pins the round-trip: log
+// records carrying the attribute shape the recipe's OTTL stamps
+// (`cuda_oom.tried_alloc_bytes`, `cuda_oom.gpu_index` ints) plus a
+// `gpu.id` str flow through the processor and emit a verdict (#436).
+//
+// This is the simulated end-to-end gate: attributes are set by hand
+// in the shape the OTTL stanza and k8sattributes would produce, and
+// the processor consumes the resulting log record alongside a
+// same-GPU FB-memory record. No OTTL executes here, so the test pins
+// attribute-stamp → projector-read against the recipe's exact
+// identifiers; the body-regex step is not exercised.
+//
+// Sibling to TestPatternDetector_CUDAOOMWiringEmitsFragmentationVerdict
+// (which uses helper-builder attributes); this test deliberately
+// constructs attributes by name so a rename in either the recipe or
+// the projector breaks one of the two tests cleanly.
+func TestRecipe_CUDAOOM_RoundTripFiresVerdict(t *testing.T) {
+	t.Parallel()
+
+	fbAt := mustParseTime(t, "2026-06-01T10:00:00Z")
+	oomAt := fbAt.Add(30 * time.Second)
+
+	ld := plog.NewLogs()
+
+	// FB memory log record (metrics→logs OTTL recipe output; sibling
+	// to #273 / #337). Carries the same `gpu.id` value as the CUDA
+	// OOM record built below.
+	fbRL := ld.ResourceLogs().AppendEmpty()
+	fbRL.Resource().Attributes().PutStr("k8s.node.name", "gpu-node-0042")
+	fbSL := fbRL.ScopeLogs().AppendEmpty()
+	fbLR := fbSL.LogRecords().AppendEmpty()
+	fbLR.SetTimestamp(pcommon.NewTimestampFromTime(fbAt))
+	fa := fbLR.Attributes()
+	fa.PutStr("gpu.id", "PCI:0000:3b:00")
+	fa.PutInt("hw.gpu.memory.free", 16*1024*1024*1024)  // 16 GiB
+	fa.PutInt("hw.gpu.memory.total", 80*1024*1024*1024) // 80 GiB
+
+	// CUDA OOM log record — attribute shape the recipe's OTTL stamps
+	// off the canonical PyTorch stderr body. The body is the PyTorch
+	// error string; the recipe extracts the bytes scalar + GPU index
+	// from it. `gpu.id` is NOT stamped by transform/cuda_oom; it is
+	// set here as a record attribute to stand in for the operator's
+	// k8sattributes / DCGM BDF-lookup path.
+	oomRL := ld.ResourceLogs().AppendEmpty()
+	oomRA := oomRL.Resource().Attributes()
+	oomRA.PutStr("k8s.node.name", "gpu-node-0042")
+	oomRA.PutStr("k8s.pod.name", "trainer-rank-7")
+	oomRA.PutStr("k8s.namespace.name", "training")
+	oomSL := oomRL.ScopeLogs().AppendEmpty()
+	oomLR := oomSL.LogRecords().AppendEmpty()
+	oomLR.SetTimestamp(pcommon.NewTimestampFromTime(oomAt))
+	oomLR.Body().SetStr("RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 79.18 GiB of which 16.00 GiB is free.")
+	oa := oomLR.Attributes()
+	oa.PutStr("gpu.id", "PCI:0000:3b:00")
+	oa.PutInt("cuda_oom.tried_alloc_bytes", 2*1024*1024*1024)
+	oa.PutInt("cuda_oom.gpu_index", 0)
+
+	sink := newLogsSink()
+	p := newProcessor(testSettings(), defaultConfig(), sink)
+	require.NoError(t, p.Start(context.Background(), componenttestHost{}))
+	t.Cleanup(func() { _ = p.Shutdown(context.Background()) })
+	require.NoError(t, p.ConsumeLogs(context.Background(), ld))
+
+	verdicts := extractCUDAOOMVerdicts(t, sink.at(0))
+	require.Len(t, verdicts, 1,
+		"recipe-shaped CUDA OOM log record did not produce a verdict — "+
+			"either the recipe's attribute names drifted from the projector "+
+			"(module/processor/patterndetectorprocessor/cuda_oom.go) or the "+
+			"detector's gating contract moved without a test update")
+	v := verdicts[0]
+	require.Equal(t, patterns.PatternIDCUDAOOM, v.PatternID)
+	require.Equal(t, "PCI:0000:3b:00", v.GPUID)
+	require.Equal(t, "gpu-node-0042", v.Node)
+	require.Equal(t, "trainer-rank-7", v.PodName)
+	require.Equal(t, "training", v.PodNamespace)
+	require.Equal(t, patterns.CUDAOOMKindFragmentation, v.Kind)
+	require.Equal(t, patterns.ConfidenceFull, v.Confidence)
+	require.Equal(t, int64(2*1024*1024*1024), v.TriedAllocBytes)
+}
+
+// TestRecipe_CUDAOOM_RegexCoversCanonicalPyTorchMessages exercises the
+// recipe's OTTL regex against the actual PyTorch error vocabulary the
+// recipe must cover. This is the "regex tested against >=3 canonical
+// messages" gate from #436 A-tier acceptance.
+//
+// The Go-side regex below is a literal copy of the OTTL regex in the
+// recipe YAML; if the recipe changes its capture groups or unit
+// prefixes the parity below breaks. The unit-normalization arithmetic
+// mirrors what the OTTL `Int(whole)*UNIT + Int(frac)*UNIT/100`
+// expression computes (rounded-down floor; the precision loss is
+// 0.01 of the unit base, well under the detector's 5% fragmentation
+// threshold).
+//
+// Negative case: a non-OOM stderr line MUST NOT match — the projector
+// gate is the `cuda_oom.tried_alloc_bytes` attribute presence, and a
+// false stamp on every error line would flood the detector with
+// noise. The recipe's `where IsMatch(...)` guard MUST be tight.
+func TestRecipe_CUDAOOM_RegexCoversCanonicalPyTorchMessages(t *testing.T) {
+	t.Parallel()
+
+	// Literal mirror of the OTTL regex set the recipe ships. Each
+	// unit-prefix branch captures `whole` + 2-digit `frac` groups
+	// (matches PyTorch's `format_size` %.2f shape).
+	bodyMatch := regexp.MustCompile(`CUDA out of memory\. Tried to allocate (\d+)\.(\d{2}) (KiB|MiB|GiB|TiB)`)
+	gpuIndexMatch := regexp.MustCompile(`GPU (\d+) has a total capacity`)
+
+	unitToBase := map[string]int64{
+		"KiB": 1024,
+		"MiB": 1024 * 1024,
+		"GiB": 1024 * 1024 * 1024,
+		"TiB": 1024 * 1024 * 1024 * 1024,
+	}
+
+	type tc struct {
+		name      string
+		body      string
+		wantBytes int64
+		wantGPU   int64
+		// tolBytes absorbs the unit-normalization floor: the
+		// `frac * (UNIT/100)` term truncates UNIT/100 first, losing
+		// under 1 byte per frac-unit (at 99 frac-units: ~24 bytes
+		// for KiB/GiB, ~75 bytes for MiB/TiB).
+		tolBytes int64
+		match    bool
+	}
+	cases := []tc{
+		{
+			name:      "GiB-canonical-RFC0013-pattern10-example",
+			body:      "RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 79.18 GiB of which 16.00 GiB is free.",
+			wantBytes: 2 * 1024 * 1024 * 1024,
+			wantGPU:   0,
+			tolBytes:  0,
+			match:     true,
+		},
+		{
+			name:      "MiB-small-alloc",
+			body:      "RuntimeError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 3 has a total capacity of 79.18 GiB.",
+			wantBytes: 256 * 1024 * 1024,
+			wantGPU:   3,
+			tolBytes:  0,
+			match:     true,
+		},
+		{
+			name:      "GiB-fractional",
+			body:      "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB. GPU 7 has a total capacity of 39.59 GiB.",
+			wantBytes: 1*1024*1024*1024 + 50*(1024*1024*1024/100), // 0.50 GiB = 50 frac-units
+			wantGPU:   7,
+			tolBytes:  100, // accommodate floor-rounding within the OTTL Int/100 path
+			match:     true,
+		},
+		{
+			name:      "KiB-tiny-alloc",
+			body:      "RuntimeError: CUDA out of memory. Tried to allocate 768.00 KiB. GPU 0 has a total capacity of 79.18 GiB.",
+			wantBytes: 768 * 1024,
+			wantGPU:   0,
+			tolBytes:  0,
+			match:     true,
+		},
+		{
+			name:      "TiB-large-alloc",
+			body:      "RuntimeError: CUDA out of memory. Tried to allocate 1.00 TiB. GPU 0 has a total capacity of 79.18 GiB.",
+			wantBytes: 1 * 1024 * 1024 * 1024 * 1024,
+			wantGPU:   0,
+			tolBytes:  0,
+			match:     true,
+		},
+		// Negative cases — these MUST NOT match the OOM regex. A false
+		// positive here would stamp the cuda_oom.* attributes on every
+		// matching line and flood the detector.
+		{
+			name:  "negative-dataloader-worker-killed",
+			body:  "RuntimeError: DataLoader worker (pid 1234) is killed by signal: SIGKILL.",
+			match: false,
+		},
+		{
+			name:  "negative-nccl-timeout",
+			body:  "RuntimeError: [Rank 0]: Watchdog caught collective operation timeout",
+			match: false,
+		},
+		{
+			name:  "negative-cuda-error-no-alloc-size",
+			body:  "RuntimeError: CUDA error: an illegal memory access was encountered",
+			match: false,
+		},
+	}
+	matched := 0
+	for _, c := range cases {
+		c := c
+		t.Run(c.name, func(t *testing.T) {
+			bm := bodyMatch.FindStringSubmatch(c.body)
+			if !c.match {
+				require.Nil(t, bm, "negative case unexpectedly matched OOM regex: %q", c.body)
+				return
+			}
+			require.NotNil(t, bm, "positive case did not match OOM regex: %q", c.body)
+			matched++
+			whole := atoi(t, bm[1])
+			frac := atoi(t, bm[2])
+			unit := bm[3]
+			base, ok := unitToBase[unit]
+			require.True(t, ok, "unknown unit prefix %q", unit)
+			// Mirror the OTTL arithmetic: bytes = whole*UNIT + frac*(UNIT/100)
+			gotBytes := whole*base + frac*(base/100)
+			require.InDelta(t, c.wantBytes, gotBytes, float64(c.tolBytes),
+				"unit-normalized bytes drift for %s: want %d, got %d", c.name, c.wantBytes, gotBytes)
+
+			gm := gpuIndexMatch.FindStringSubmatch(c.body)
+			require.NotNil(t, gm, "positive case did not match GPU-index regex: %q", c.body)
+			require.Equal(t, c.wantGPU, atoi(t, gm[1]), "gpu_index drift for %s", c.name)
+		})
+	}
+	// Acceptance criterion: at least 3 canonical positive messages
+	// covered (KiB, MiB, GiB, TiB — 4 distinct unit prefixes).
+	require.GreaterOrEqual(t, matched, 3,
+		"regex must cover >=3 canonical PyTorch OOM messages per #436 A-tier acceptance")
+}
+
+// atoi is a tiny test helper for parsing int64 capture groups; pulling
+// in strconv at the test scope is overkill for the 4-call site count.
+func atoi(t *testing.T, s string) int64 {
+	t.Helper()
+	var n int64
+	for _, c := range []byte(s) {
+		require.True(t, c >= '0' && c <= '9', "non-digit in numeric capture %q", s)
+		n = n*10 + int64(c-'0')
+	}
+	return n
+}