diff --git a/docs/integrations/examples/prometheus-scrape.yaml b/docs/integrations/examples/prometheus-scrape.yaml index e5dc1345..5b9cc9fb 100644 --- a/docs/integrations/examples/prometheus-scrape.yaml +++ b/docs/integrations/examples/prometheus-scrape.yaml @@ -189,6 +189,41 @@ processors: - 'set(metric.unit, "By") where metric.name == "DCGM_FI_DEV_FB_FREE"' - 'set(metric.name, "hw.gpu.memory.free") where metric.name == "DCGM_FI_DEV_FB_FREE"' + # Project node_exporter's InfiniBand collector series into the + # customer-stable `hw.network.ib.*` namespace declared in + # docs/ATTRIBUTES.md §`hw.network.*` (alpha). Pattern #2's + # IBLinkFlapDetector projector reads these three attributes off a + # log record; this transform stamps them on the metric datapoint so + # the future RFC-0014 PR-B metrics→logs bridge emits the contract + # below without re-deriving label names. The source metric + # `node_infiniband_port_state_id` reports the IBA-spec phys_state + # ID as an integer Gauge (`1=Down`, `2=Initialize`, `3=Armed`, + # `4=Active`) — node_exporter reads + # `/sys/class/infiniband/<device>/ports/<port>/phys_state` and exposes the + # int directly; the `node_infiniband_state{state=""}` companion + # series is intentionally NOT used because the detector compares + # against `IBPortState*` integer constants in + # `module/pkg/patterns/ib_link_flap.go`. + transform/ib_to_hw_semconv: + metric_statements: + - context: datapoint + statements: + # ---- Pattern #2 (InfiniBand link flap): per-port Gauge ---- + # Source series: node_infiniband_port_state_id{device, port, + # instance}. The `device` label is the per-NIC IB device name + # (e.g. `mlx5_0`) — lifted as-is to `hw.network.ib.device`. + # The `port` label is a Prometheus string; `Int()` casts it + # to the int-typed `hw.network.ib.port.num` the detector's + # projector reads via `port.Int()`. 
+ - 'set(datapoint.attributes["hw.network.ib.device"], datapoint.attributes["device"]) where metric.name == "node_infiniband_port_state_id" and datapoint.attributes["device"] != nil' + - 'set(datapoint.attributes["hw.network.ib.port.num"], Int(datapoint.attributes["port"])) where metric.name == "node_infiniband_port_state_id" and datapoint.attributes["port"] != nil' + # Rename the metric LAST so the `where metric.name == ...` + # guards above still match the raw exporter name. The Gauge + # value (the IBA phys_state ID) becomes the integer payload + # of `hw.network.ib.port.state`; the future PR-B bridge lifts + # it onto the log record at emit time. + - 'set(metric.name, "hw.network.ib.port.state") where metric.name == "node_infiniband_port_state_id"' + batch: send_batch_size: 8192 timeout: 10s @@ -208,5 +243,5 @@ service: # for DCGM series the next transform doesn't recognize. # `transform/dcgm_to_hw_semconv` runs AFTER so its statements # see the original `DCGM_FI_*` names and per-series labels. - processors: [transform/gpu_vendor, transform/dcgm_to_hw_semconv, batch] + processors: [transform/gpu_vendor, transform/dcgm_to_hw_semconv, transform/ib_to_hw_semconv, batch] exporters: [otlphttp] diff --git a/docs/integrations/prometheus-scrape.md b/docs/integrations/prometheus-scrape.md index c797eab4..d88d7a42 100644 --- a/docs/integrations/prometheus-scrape.md +++ b/docs/integrations/prometheus-scrape.md @@ -24,7 +24,7 @@ NVIDIA `dcgm-exporter`, AMD `ROCm/device-metrics-exporter`, Intel the Kueue scheduler's metrics endpoint. Replaces the in-tree `dcgm` and `kueue` receivers per RFC-0013 §7 (Deletion list — v0.1.0). -Two OTTL `transform` processors run in series over the scraped +Three OTTL `transform` processors run in series over the scraped metrics: 1. 
**`transform/gpu_vendor`** stamps the customer-stable @@ -42,6 +42,15 @@ metrics: `patterndetectorprocessor` with `processor.WithMetrics`; the transform below is the load-bearing wire-format contract that PR-B consumes. +3. **`transform/ib_to_hw_semconv`** projects + `node_exporter --collector.infiniband`'s + `node_infiniband_port_state_id` onto the customer-stable + `hw.network.ib.*` namespace (`docs/ATTRIBUTES.md §hw.network.*`, + alpha) so pattern #2's link-flap detector reads the same + vendor-neutral shape whether the underlying source is + node_exporter, a Mellanox-specific exporter, or + `journald-kernel.md`'s `mlx5_core` stream. Same RFC-0014 PR-B + bridge dependency as the DCGM transform. ## Config @@ -398,7 +407,49 @@ the joined record via `module/processor/patterndetectorprocessor/cuda_oom.go`'s > MIG-safe — it just renames the parent-device series; the > detector decides whether the renamed series is meaningful. -### Metrics-to-logs bridge contract (patterns #3, #4, #5, #10) +### Pattern #2 — InfiniBand link flap + +Source: `node_exporter --collector.infiniband` (the upstream Prometheus +node-exporter [infiniband collector](https://github.com/prometheus/node_exporter/blob/master/collector/infiniband_linux.go) +which reads `/sys/class/infiniband/<device>/ports/<port>/phys_state` and +exposes the IBA-spec phys_state ID as an integer Gauge). Run the +collector under tracecore's `prometheusreceiver` per +[RFC-0013 §2](../rfcs/0013-distro-first-pivot.md#2-adoption-matrix); +the in-tree binary bundles `prometheusreceiver` so no extra +component is required. 
+ +| Raw node_exporter series | OTel metric | Datapoint attributes (added) | +|---|---|---| +| `node_infiniband_port_state_id{device, port}` | `hw.network.ib.port.state` (Gauge, IBA phys_state ID `1=Down` / `2=Init` / `3=Armed` / `4=Active`) | `hw.network.ib.device={device label}`, `hw.network.ib.port.num=Int({port label})` | + +The detector +([`module/processor/patterndetectorprocessor/ib_link_flap.go`](../../module/processor/patterndetectorprocessor/ib_link_flap.go)) +reads these three attributes off a log record via `port.Int()` / +`state.Int()` — the `Int()` cast on the `port` label is load-bearing +because `prometheusreceiver` exposes Prometheus labels as strings +while the projector calls `Int()` on the pdata Value. The companion +series `node_infiniband_state{state=""}` (string label) is +intentionally **not** mapped: the detector compares against the +`patterns.IBPortState*` integer constants, so the string variant +would round-trip wrong. + +The metric rename runs last so the `where metric.name == +"node_infiniband_port_state_id"` guards on the attribute-stamp +statements above still match the raw exporter name when each +statement evaluates. Renaming first would short-circuit the +attribute stamps because the second statement's guard would no +longer find the original name. + +[Pattern #2 doc](../patterns/pattern-2-ib-link-flap.md) consumes the +joined record via +[`projectIBPortStateRecord`](../../module/processor/patterndetectorprocessor/ib_link_flap.go) +(gate: `hw.network.ib.port.state` AND `hw.network.ib.device` AND +`hw.network.ib.port.num` on the same log record, plus +`k8s.node.name` on the resource). The metrics→logs emit half is +RFC-0014 PR-B; the bridge log-record schema is pinned in the +[next section](#pattern-2--hwnetworkibportstate-issue-393). + +### Metrics-to-logs bridge contract (patterns #2, #3, #4, #5, #10) The pattern detectors at `module/processor/patterndetectorprocessor` read **log records** today (`processor.WithLogs`). 
The DCGM scrape @@ -469,6 +520,59 @@ downstream. Layer 1 (journald-kernel AER stanza) is documented in [`journald-kernel.md`](journald-kernel.md) and ships independently of this bridge. +#### Pattern #2 — `hw.network.ib.port.state` (issue [#393](https://github.com/TraceCoreAI/tracecore/issues/393)) + +The InfiniBand link-flap detector +(`module/processor/patterndetectorprocessor/ib_link_flap.go::projectIBPortStateRecord`) +gates on a log record carrying: + +| Attribute | Type | Source | +|---|---|---| +| `hw.network.ib.port.state` | int | last `node_infiniband_port_state_id` Gauge sample for the `(device, port)` tuple at bridge-emit time; IBA phys_state ID (`1=Down`, `2=Init`, `3=Armed`, `4=Active`) | +| `hw.network.ib.device` | string | `device` label on the source series (e.g. `mlx5_0`) | +| `hw.network.ib.port.num` | int | `port` label on the source series, cast via OTTL `Int()` | +| `k8s.node.name` | string (resource) | stamped by `k8sattributesprocessor` on the DaemonSet | + +The metric datapoint attribute set from the `transform/ib_to_hw_semconv` +stanza above already carries `hw.network.ib.device` and +`hw.network.ib.port.num`; the future emitter passes those through +unchanged. The `hw.network.ib.port.state` integer lifts directly from +the renamed Gauge's datapoint value (one log record per `(device, +port, scrape)` — emit-once-per-state-transition is a detector-side +optimization, not a bridge-side gate; the detector's +[`patterns.IBLinkFlapDetector`](../../module/pkg/patterns/ib_link_flap.go) +counts transitions internally). + +##### Log-record schema (verdict-input) + +```yaml +# Bridge-emitted log record consumed by patterndetectorprocessor's +# ib_link_flap detector. One log record per (device, port, scrape) — +# the detector counts transitions across consecutive records. +resource: + attributes: + k8s.node.name: gpu-node-0007 # str — REQUIRED. Flap predicate is per-node. +log_record: + timestamp: 2026-06-01T10:04:30Z # MUST be the scrape timestamp. 
+ body: "" # ignored by the detector. + attributes: + hw.network.ib.port.state: 1 # int — REQUIRED. IBA phys_state ID; detector compares against patterns.IBPortState* constants. + hw.network.ib.device: mlx5_0 # str — REQUIRED. Per-NIC device name; flap predicate is per-device. + hw.network.ib.port.num: 1 # int — REQUIRED. Port index; flap predicate is per-port (a 2-port HCA tracks each port separately). +``` + +##### Detector consumption + +`projectIBPortStateRecord` (at +`module/processor/patterndetectorprocessor/ib_link_flap.go`) extracts +the three scalars and builds a `patterns.IBPortStateRecord`. The +detector emits one verdict per `(k8s.node.name, hw.network.ib.device, +hw.network.ib.port.num)` tuple when transition count within +`ib_link_flap_window` (default 2min) crosses +`ib_link_flap_min_transitions` (default 2). The unit tests +[`TestPatternDetector_IBLinkFlapWiring*`](https://github.com/TraceCoreAI/tracecore/blob/main/module/processor/patterndetectorprocessor/ib_link_flap_test.go) +pin the canonical wire format above against the live detector. + #### Pattern #7 — `tracecore.alert.training_step_stalled.*` (issue [#365](https://github.com/tracecoreai/tracecore/issues/365)) The dataloader_hang detector's Layer 2 input is a **training-step diff --git a/docs/patterns/pattern-2-ib-link-flap.md b/docs/patterns/pattern-2-ib-link-flap.md index 2116d72b..aaa125d1 100644 --- a/docs/patterns/pattern-2-ib-link-flap.md +++ b/docs/patterns/pattern-2-ib-link-flap.md @@ -37,13 +37,14 @@ the integer as `node_infiniband_port_state_id`. A healthy port sits at `4`. A flapping port oscillates `4 → 1 → 4 → 1 …`, each transition visible to one scrape. 
-The bundled OTTL recipe (`prometheus-scrape.md` — IB stanza pending -on the [PR-B integration recipe](../rfcs/0014-metrics-to-logs-pattern-input.md)) -projects each transition into a log record carrying the -`hw.network.ib.*` namespace below, so the detector consumes one -vendor-neutral shape regardless of whether the underlying source is -`node_exporter --collector.infiniband`, a vendor exporter, or the -`mlx5_core` journald stream. +The bundled OTTL recipe +([`prometheus-scrape.md §Pattern #2`](../integrations/prometheus-scrape.md#pattern-2--infiniband-link-flap)) +projects `node_infiniband_port_state_id` into the metric-side +`hw.network.ib.*` shape; once RFC-0014 PR-B's metrics→logs bridge +lands, the same attributes ride a log record for the detector to +consume one vendor-neutral shape regardless of whether the underlying +source is `node_exporter --collector.infiniband`, a vendor exporter, +or the `mlx5_core` journald stream. ## Receiver-emitted signal @@ -214,19 +215,19 @@ scalar-promotion contract): The full attribute set lives in [`docs/ATTRIBUTES.md`](../ATTRIBUTES.md#hwnetwork--ibrdma-semconv-tracecore-ext-alpha). -## Integration gap - -The detector library + processor wiring are both in-tree. The -metrics→logs OTTL recipe that projects -`node_infiniband_port_state_id` onto `hw.network.ib.*` log records -is **not yet shipped** — tracked at -[#393](https://github.com/TraceCoreAI/tracecore/issues/393), sibling -to [#284](https://github.com/TraceCoreAI/tracecore/issues/284) / -[#285](https://github.com/TraceCoreAI/tracecore/issues/285) and -gated on [RFC-0014](../rfcs/0014-metrics-to-logs-pattern-input.md). -Until that recipe lands, this detector is configured-but-quiet on a -real deployment. The red-test path is fully covered (see Replay -above); only the input recipe is missing. 
+## Integration recipe + +The metric-side OTTL projection from +`node_infiniband_port_state_id` (node_exporter) onto the +`hw.network.ib.*` namespace ships at +[`prometheus-scrape.md §Pattern #2`](../integrations/prometheus-scrape.md#pattern-2--infiniband-link-flap) +(closed #393). The detector library + processor wiring are +already in-tree; the metrics→logs bridge emitter that lifts the +projected attrs onto a log record is the broader +[RFC-0014](../rfcs/0014-metrics-to-logs-pattern-input.md) PR-B work +shared with patterns #3 / #4 / #5 / #10 — the bridge log-record +schema for this pattern is pinned at +[`prometheus-scrape.md §Pattern #2 bridge contract`](../integrations/prometheus-scrape.md#pattern-2--hwnetworkibportstate-issue-393). The design-spec contract for this pattern (engineering-facing, distinct from this operator-facing walkthrough) is at diff --git a/module/processor/patterndetectorprocessor/ib_link_flap_recipe_test.go b/module/processor/patterndetectorprocessor/ib_link_flap_recipe_test.go new file mode 100644 index 00000000..0097fcb0 --- /dev/null +++ b/module/processor/patterndetectorprocessor/ib_link_flap_recipe_test.go @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: Apache-2.0 + +package patterndetectorprocessor + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/plog" + + "github.com/tracecoreai/tracecore/module/pkg/patterns" +) + +// TestRecipe_IBLinkFlap_StanzaPinsWireContract pins the +// docs/integrations/examples/prometheus-scrape.yaml IB OTTL stanza +// against the live IBLinkFlapDetector wire contract (issue #393). +// +// What this gate protects: +// 1. Renaming the source metric (`node_infiniband_port_state_id`) in +// the recipe without updating it here would silently disconnect +// node_exporter from the detector. +// 2. 
Dropping any of the three attribute stamps (`hw.network.ib.device` +// / `hw.network.ib.port.num` / `hw.network.ib.port.state`) would +// fail the projector at `projectIBPortStateRecord` with no +// downstream signal — the detector stays configured-but-quiet. +// +// The recipe-side compile gate (scripts/validator-recipe.sh) catches +// OTTL syntax breakage; this test catches semantic drift between the +// recipe's stamps and the projector's reads. +func TestRecipe_IBLinkFlap_StanzaPinsWireContract(t *testing.T) { + t.Parallel() + + // Locate the recipe by walking up from the working directory so + // the test is invariant under `go test ./...` from anywhere. + recipePath := findRepoFile(t, "docs/integrations/examples/prometheus-scrape.yaml") + raw, err := os.ReadFile(recipePath) + require.NoError(t, err, "reading recipe example yaml") + recipe := string(raw) + + // Each token is load-bearing for pattern #2's wire contract. + // Drift here without a matching detector / docs / wiring-test edit + // is a defect. Strings are matched as-is against the YAML body — + // they are the exact identifiers the detector reads. + pinned := []string{ + // Source series the OTTL stanza must consume. + `"node_infiniband_port_state_id"`, + // Two stamped datapoint attributes plus the renamed metric name. + `"hw.network.ib.device"`, + `"hw.network.ib.port.num"`, + `"hw.network.ib.port.state"`, + // Source labels lifted off the Prometheus series. `Int(...)` on + // the port cast is the load-bearing detail — the projector + // calls `port.Int()` on the pdata Value. + `datapoint.attributes["device"]`, + `Int(datapoint.attributes["port"])`, + // The transform processor identifier (mirrors the dcgm transform + // shape — operators grep for this to confirm the stanza is + // in their pipeline). 
+ `transform/ib_to_hw_semconv`, + } + for _, p := range pinned { + require.Contains(t, recipe, p, + "recipe example yaml missing pattern-#2 wire-contract token %q; see issue #393", p) + } + + // The processor pipeline must wire transform/ib_to_hw_semconv into + // the metrics/scrape pipeline; without that the stanza compiles + // but never executes. The literal-substring match below also pins + // the transform to the slot immediately before `batch`. + require.Contains(t, recipe, "transform/ib_to_hw_semconv, batch", + "recipe example yaml does not wire transform/ib_to_hw_semconv into metrics/scrape pipeline") +} + +// TestRecipe_IBLinkFlap_RoundTripFiresVerdict pins the round-trip: +// log records carrying the exact attribute shape the recipe's OTTL +// stamps onto a metric datapoint (`hw.network.ib.port.state` int, +// `hw.network.ib.device` str, `hw.network.ib.port.num` int) flow +// through the IBLinkFlapDetector and emit a verdict (issue #393). +// +// This is the simulated end-to-end gate: the OTTL stanza projects the +// metric datapoint, the future RFC-0014 PR-B bridge lifts those +// attributes onto a log record, and the detector consumes the log +// record. The bridge emitter has not shipped yet, but the contract +// from metric-stamp → log-attribute → projector-read is fully pinned +// by this test using identifiers copied by hand from the recipe. +// +// Sibling to TestPatternDetector_IBLinkFlapWiringEmitsFullVerdict +// (which uses helper-builder attributes); this test deliberately +// constructs attributes by name so a rename in either the recipe or +// the projector breaks one of the two tests cleanly. +func TestRecipe_IBLinkFlap_RoundTripFiresVerdict(t *testing.T) { + t.Parallel() + + now := time.Now().UTC() + // Two transitions within the default 2-min window — the detector's + // MinTransitions floor is 2 (a single observation is meaningless). 
+ tx1 := now.Add(-90 * time.Second) + tx2 := now.Add(-30 * time.Second) + + ld := plog.NewLogs() + addRecipeShapedIBRecord := func(ts time.Time, stateID int64) { + rl := ld.ResourceLogs().AppendEmpty() + rl.Resource().Attributes().PutStr("k8s.node.name", "gpu-node-0007") + sl := rl.ScopeLogs().AppendEmpty() + lr := sl.LogRecords().AppendEmpty() + lr.SetTimestamp(pcommon.NewTimestampFromTime(ts)) + a := lr.Attributes() + // Attribute shape pinned by the bridge-contract section in + // docs/integrations/prometheus-scrape.md: device and port.num + // pass through from the datapoint attributes, while port.state + // is lifted from the renamed Gauge's datapoint value. + a.PutInt("hw.network.ib.port.state", stateID) + a.PutStr("hw.network.ib.device", "mlx5_0") + a.PutInt("hw.network.ib.port.num", 1) + } + // Down then Active; the assertions below expect TransitionCount == 2. + addRecipeShapedIBRecord(tx1, int64(patterns.IBPortStateDown)) + addRecipeShapedIBRecord(tx2, int64(patterns.IBPortStateActive)) + + sink := newLogsSink() + p := newProcessor(testSettings(), defaultConfig(), sink) + require.NoError(t, p.Start(context.Background(), componenttestHost{})) + t.Cleanup(func() { _ = p.Shutdown(context.Background()) }) + require.NoError(t, p.ConsumeLogs(context.Background(), ld)) + + verdicts := extractIBLinkFlapVerdicts(t, sink.at(0)) + require.Len(t, verdicts, 1, + "recipe-shaped IB log records did not produce a flap verdict — "+ + "either the recipe's attribute names drifted from the projector "+ + "(module/processor/patterndetectorprocessor/ib_link_flap.go) or the "+ + "detector's MinTransitions floor moved without a test update") + v := verdicts[0] + require.Equal(t, patterns.PatternIDIBLinkFlap, v.PatternID) + require.Equal(t, "gpu-node-0007", v.Node) + require.Equal(t, "mlx5_0", v.HCADevice) + require.Equal(t, 1, v.Port) + require.Equal(t, 2, v.TransitionCount) +} + +// findRepoFile walks up from the test's working directory (at most +// ten directories, stopping at the 
filesystem root) and 
returns the +// first existing `dir/relPath` join, else fails the test. Cheaper than threading a build flag. +func findRepoFile(t *testing.T, relPath string) string { + t.Helper() + dir, err := os.Getwd() + require.NoError(t, err) + for i := 0; i < 10; i++ { + candidate := filepath.Join(dir, relPath) + if _, err := os.Stat(candidate); err == nil { + return candidate + } + parent := filepath.Dir(dir) + if parent == dir { // filepath.Dir made no progress: at the filesystem root + break + } + dir = parent + } + t.Fatalf("could not locate %s above %s", relPath, mustGetwd(t)) + return "" // not reached at runtime (t.Fatalf stops the test); keeps the compiler satisfied +} + +func mustGetwd(t *testing.T) string { // working directory via os.Getwd; any error fails the test + t.Helper() + wd, err := os.Getwd() + require.NoError(t, err) + return wd +}