diff --git a/.github/workflows/chaos.yml b/.github/workflows/chaos.yml index 015f1f86..073c75a6 100644 --- a/.github/workflows/chaos.yml +++ b/.github/workflows/chaos.yml @@ -4,9 +4,6 @@ name: Chaos # - harness-determinism: same argv + --seed produces byte-identical # output across two runs, matching tools/failure-inject/testdata/ # golden.sha256. -# - chaos-pipeline-test: internal/pipeline/chaos_test.go runs under -# -tags=chaos and proves the receiver-exporter panic-or-error -# pairing leaks no goroutines across ≥100 iterations. # - cpu-steal-mpstat: failure-inject cpu-steal pins a busy-loop and # mpstat reports %steal+%user ≥ 95% on the pinned core for ≥ D-1 # seconds. @@ -14,6 +11,13 @@ name: Chaos # detector test plus pins the pod-evict CLI output SHA so harness # drift and detector drift are caught in the same workflow. # +# The legacy chaos-pipeline-test job ran internal/pipeline/chaos_test.go +# under -tags=chaos to prove the in-tree receiver-exporter panic-or-error +# pairing leaked no goroutines. Deleted in RFC-0013 PR-F.2 along with the +# in-tree pipeline runtime; the equivalent panic-recovery contract now +# rides on upstream `go.opentelemetry.io/collector/service` and is +# covered by upstream's own chaos tests. +# # Matrix-of-patterns rule: per MILESTONES.md §M4b the workflow grows # a row when each pattern lands. M17 / M18 are still open and will # add their own rows when they land. 
@@ -27,13 +31,11 @@ on: - main paths: - "tools/failure-inject/**" - - "internal/pipeline/chaos_test.go" - "internal/synthesis/**" - ".github/workflows/chaos.yml" pull_request: paths: - "tools/failure-inject/**" - - "internal/pipeline/chaos_test.go" - "internal/synthesis/**" - ".github/workflows/chaos.yml" @@ -105,19 +107,6 @@ jobs: echo "OK [$argv]" done < tools/failure-inject/testdata/golden.sha256 - chaos-pipeline-test: - name: chaos-pipeline-test - runs-on: ubuntu-latest - timeout-minutes: 10 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 - with: - go-version-file: go.mod - cache: true - - name: go test -tags=chaos (panic-or-error × goleak) - run: go test -tags=chaos -race -count=1 -run TestChaos ./internal/pipeline/... - cpu-steal-mpstat: name: cpu-steal-mpstat (linux) runs-on: ubuntu-latest diff --git a/.github/workflows/install-bench.yml b/.github/workflows/install-bench.yml index 6d102714..25e75ef1 100644 --- a/.github/workflows/install-bench.yml +++ b/.github/workflows/install-bench.yml @@ -16,9 +16,6 @@ on: - 'components/exporters/otlphttp/**' - 'install/kubernetes/tracecore/**' - 'builder-config.yaml' - - 'internal/pipeline/**' - - 'internal/runtime/**' - - 'internal/selftelemetry/**' - 'go.mod' - 'go.sum' - '.github/workflows/install-bench.yml' @@ -29,9 +26,6 @@ on: - 'components/exporters/otlphttp/**' - 'install/kubernetes/tracecore/**' - 'builder-config.yaml' - - 'internal/pipeline/**' - - 'internal/runtime/**' - - 'internal/selftelemetry/**' - 'go.mod' - 'go.sum' - '.github/workflows/install-bench.yml' diff --git a/.golangci.yml b/.golangci.yml index d8fd7c8b..6ee6ce82 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -113,8 +113,8 @@ linters: # Interface forwarders (Consume*, Component.Shutdown) are designed # to pass errors through unmodified — wrapping would stutter. 
ignore-interface-regexps: - - ^github\.com/tracecoreai/tracecore/internal/consumer\.(Metrics|Traces|Logs)$ - - ^github\.com/tracecoreai/tracecore/internal/pipeline\.Component$ + - ^go\.opentelemetry\.io/collector/consumer\.(Metrics|Traces|Logs)$ + - ^go\.opentelemetry\.io/collector/component\.Component$ revive: rules: - name: var-naming diff --git a/AGENTS.md b/AGENTS.md index ee969c5e..5a9dbf3c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -31,9 +31,14 @@ Concrete implications for any edit you make: `github.com/tracecoreai/tracecore/module`) per RFC-0013 §6, via CONTRIBUTING.md "RFC routing" guidance. - **Self-telemetry internals** (`internal/componentstatus`, - `internal/selftelemetry`, `internal/telemetry`) delete at v0.1.0 - - replaced by upstream `componentstatus` + `service/telemetry` + the - standard `otelcol_*` metric surface. + `internal/selftelemetry`, `internal/telemetry`) deleted at v0.1.0 + (PR-F.1 + PR-F.2) - replaced by upstream `componentstatus` + + `service/telemetry` + the standard `otelcol_*` metric surface. +- **Pipeline / boot-path internals** (`internal/pipeline`, + `internal/pipelinebuilder`, `internal/config`, `internal/consumer`, + `internal/fanout`, `internal/runtime/lifecycle`) deleted at v0.1.0 + (PR-F.2) - replaced by the OCB-generated boot path off + `builder-config.yaml`. - **Release pipeline** rewrites at v0.1.0 to goreleaser + slsa-github-generator + cosign-installer + sbom-action + actions/attest-build-provenance. - **Customer-stable contracts** (`k8s.event.hint` enum, diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a08f49d..770b4270 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,13 @@ Pivot landed across four waves of PRs: - `internal/telemetry/` — was the in-tree `MeterProvider` + probe-server (`/metrics`, `/healthz`, `/readyz`) wrapper. Probes now flow through the upstream `healthcheckextension`; meter-provider is upstream `service.telemetry`. 
Only remaining consumers were `internal/selftelemetry/*_test.go` (deleted together with selftelemetry) and one orphan clockreceiver integration test. - `components/receivers/clockreceiver/errors_integration_test.go` — orphan integration test from #185's PR-B1 clockreceiver port; bootstrapped via the now-deleted `selftelemetry.Receiver` interface but never migrated to the receiver-scoped sibling `selftel.go`. The covered behaviour ("errors_total surfaces on downstream failure") is now exercised through clockreceiver's sibling tests. -PR-F.2 (deferred — pending three open ports): Delete `internal/{componentstatus,pipeline,pipelinebuilder,consumer,fanout,runtime/lifecycle}`. Gated on the last three pipeline+consumer-importing receivers landing — k8sevents (#204), clockreceiver (#205), otlphttp (#207) — all three open as of this entry, all three following the PR-B2 (#201) shape. Once they merge, the entire `internal/*` runtime bundle has zero non-test consumers and drops in a single cut. The `clockreceiver` source deletion stays in PR-K (chart + values-keys deprecation cycle) — PR-F.2 only deletes `internal/*` packages, not the canonical-example receivers themselves. +**PR-F.2 landed: `internal/{pipeline,pipelinebuilder,config,consumer,fanout,componentstatus,runtime/lifecycle}` deleted.** Net deletion of the full in-tree boot-path infrastructure across 56 files / -6,888 LOC. The nine sibling upstream-port PRs (#201 nccl_fr, #202 stdoutexporter, #203 pyspy, #204 k8sevents, #205 clockreceiver, #207 otlphttp, #208 kernelevents, #209 containerstdout, plus PR-A2 boot-path retirement) cleared every external caller of these packages; with PR-F.1's selftel/telemetry deletions and PR-A2's `cmd/tracecore` rewire to the OCB-generated main, none of these packages has a non-test consumer in the tree. 
Replacements: +- `internal/pipeline/` + `internal/pipelinebuilder/` + `internal/consumer/` + `internal/fanout/` — assembly + per-signal consumer chain + fanout cloning are now provided by upstream `go.opentelemetry.io/collector/service`. The OCB-generated `_build/main.go` consumes `builder-config.yaml` and produces an equivalent collector instance. +- `internal/config/` — YAML loader with `file:line:col` errors replaced by upstream `confmap` providers (`file`, `yaml`, `env`). +- `internal/componentstatus/` — replaced by upstream `go.opentelemetry.io/collector/component/componentstatus.ReportStatus` (same free-function shape). +- `internal/runtime/lifecycle/` — the `Lifecycle{Add, Start, Shutdown, PanicCallback}` helper was already ported into each receiver as a package-local sibling during the PR-B1 wave (#184/#185/#186/#187/#194/#196/#197); after the wave-3 PRs landed, the in-tree package had no non-test consumers and is now gone. `kernelevents/lifecycle.go` was inherited from k8sevents (#208). + +`.golangci.yml` `ignore-interface-regexps` now points at upstream `consumer.{Metrics,Traces,Logs}` and upstream `component.Component`. `.github/workflows/chaos.yml` drops the `chaos-pipeline-test` job (the in-tree `internal/pipeline/chaos_test.go` is gone; the equivalent panic-recovery contract is now provided by upstream `service`); harness-determinism + cpu-steal-mpstat + pattern-pod-evicted jobs preserved. `.github/workflows/install-bench.yml` drops the `internal/{pipeline,runtime,selftelemetry}/**` path-filter rows. `docs/FAILURE-MODES.md` Lifecycle / Data-flow / Shutdown-timing / Backend-connectivity tables rewired from in-tree test pointers to upstream-delegated wording matching PR-A2's pattern. `docs/STRATEGY.md` "Stable interfaces in `internal/pipeline/`" graduation row rewritten to point at the upstream surface. `MILESTONES.md` M1 + M4b + M9 rubric details annotated. 
`docs/migration/v0.1-to-v0.2.md` `internal/*` deletion section's status banner flipped from deferred to landed. Build-tag `dcgm` retired (`make build-tags` no longer vets `-tags dcgm`). `make bench-check` loop drops both deleted package rows (dcgm + internal/telemetry). `scripts/register-lint.sh` allowlist emptied (the two `internal/telemetry/{build_info,slo}.go` entries are gone with the package). Chart `receivers.dcgm` toggle + `_helpers.tpl` doc-list + `NOTES.txt` warning retained until PR-K removes them outright (toggle is already inert — operators enabling `receivers.dcgm.enabled=true` have crashed at boot since PR-A2). `internal/runtime/lifecycle/` doc-comment updated. `docs/FAILURE-MODES.md` self-tel-surface rows rewired to upstream-delegated wording. `docs/patterns/{README,pattern-{1,3,4,5}}.md` replay-test pointers updated. @@ -28,7 +34,7 @@ Build-tag `dcgm` retired (`make build-tags` no longer vets `-tags dcgm`). `make **PR-E unblocked.** Original RFC-0013 §migration plan named `telemetrygeneratorreceiver` as the upstream replacement for `clockreceiver`. Verified 2026-05-30: the receiver does not exist in `opentelemetry-collector-contrib` at any tag from v0.95.0 through v0.130.0; two community proposals (contrib issues #41687 and #43657) were closed `not_planned`. Replacement landed on `hostmetricsreceiver` (loadscraper @ 1s) — an upstream OCB-bundled receiver that emits 3 low-cardinality series (`system.cpu.load_average.{1m,5m,15m}`) at the cadence the bench's pass condition needs (first parseable JSON line at the sink — see `bench/install/run.sh`). This PR adds `hostmetricsreceiver` to `builder-config.yaml`, adds a `receivers.hostmetrics` opt-in block to the chart values (default disabled — chart default stays `clockreceiver` this release), and flips `bench/install/tracecore-values.yaml` to enable hostmetrics + disable clockreceiver. RFC-0013 §migration PR-E + §4 + §7 deletion table updated. 
Chart-default flip from `clockreceiver` to `hostmetrics` + source-deletion of `components/receivers/clockreceiver/` are deferred to PR-K (in-tree-receiver deletion wave) so the values-keys migration ships together with `NOTES.txt` deprecation warnings and the coordinated migration of ~92 in-tree test-fixture references in one cut rather than two operator-visible changes. -Remaining v0.1.0 work: PR-F.1 (delete `components/receivers/dcgm/` + `pkg/dcgm/` + `internal/selftelemetry/` + `internal/telemetry/`) landed in this Unreleased section; PR-F.2 (delete `internal/componentstatus/`) deferred until `internal/pipeline` migrates to upstream `componentstatus`. Chart default pipeline still hardwires the to-be-deleted receivers, so the receiver-side deletions (clockreceiver / containerstdout / kernelevents / k8sevents) ride with PR-K alongside the v0.2.0 recipe migration to avoid an interim chart break. +Remaining v0.1.0 work: PR-F.1 (delete `components/receivers/dcgm/` + `pkg/dcgm/` + `internal/selftelemetry/` + `internal/telemetry/`) and PR-F.2 (delete `internal/{pipeline,pipelinebuilder,config,consumer,fanout,componentstatus,runtime/lifecycle}`) both landed in this Unreleased section. Chart default pipeline still hardwires the to-be-deleted receivers, so the receiver-side deletions (clockreceiver / containerstdout / kernelevents / k8sevents) ride with PR-K alongside the v0.2.0 recipe migration to avoid an interim chart break. **RFC-0013 §migration rescoped (doc-only).** Headline: **PR-I is now an in-repo Go submodule at `module/`, not an external `tracecoreai/tracecore-components` repo.** Open-source project — one fork, one CI, one issue tracker, one DCO wins. Go submodule tags give independent version line; OCB `gomod:` + `replaces: ./module` for dev-loop resolves identical to external repo. 
@@ -86,7 +92,7 @@ Files updated in this PR: `docs/rfcs/0013-distro-first-pivot.md` (§1, §6, §mi - [RFC-0004](docs/rfcs/archived/0004-clockreceiver-stdoutexporter.md): clockreceiver + stdoutexporter (Option C scope adoption - `Capabilities()`, fan-out, `ComponentState` mixin, factory-as-package-var). Archived under RFC-0013. - [`docs/STRATEGY.md`](docs/STRATEGY.md): long-term repo posture. The single load-bearing principle: "Tracecore is OpenTelemetry-Collector-compatible by default. Every divergence is deliberate and documented." - [`docs/research/otel-graph-notes.md`](docs/research/otel-graph-notes.md): synthesized findings from reading OTel Collector v0.152.0's `service/internal/graph` + `testcomponents` + `fanoutconsumer` source. -- Receiver-author quickstart in [`internal/pipeline/README.md`](internal/pipeline/README.md). +- Receiver-author quickstart in `internal/pipeline/README.md` (deleted at v0.1.0 with the in-tree pipeline runtime per RFC-0013 PR-F.2; superseded by upstream [`go.opentelemetry.io/collector`](https://pkg.go.dev/go.opentelemetry.io/collector) docs). - `PRINCIPLES.md`: 15 design and engineering principles distilled from the foundation work; the *why* behind every rule in `STYLE.md`. - **`internal/selftelemetry`** - producer-side `Receiver` interface (`IncError`, `IncEmissions`, `ObserveLatency`, `SetDegraded`, `MarkActivity`) that components write to when reporting their own health, plus a noop default. The `/metrics` endpoint that surfaces these to operators is owned by M2; this package lets M8+ receivers wire to self-telemetry from day one without waiting for M2. - **M9 - kernelevents receiver (alpha)** - tails `/dev/kmsg` and the systemd journal, filters by severity / facility / regex, preserves trace context. Two sources behind a common interface, both `//go:build linux` with non-Linux stubs that degrade silently. Subprocess crash → backoff restart (1s/2s/5s, max 3 retries, 60s window). 
Emits records with a stable, dereferenceable SchemaURL pointing at [`docs/schemas/kernelevents/v0.md`](docs/schemas/kernelevents/v0.md); resolves NVIDIA Xid codes to canonical descriptions via the `kernelevents.xid.description` attribute (40 codes in the alpha subset). 16 KiB body cap with `...`-suffix truncation on pathological inputs. **Depends on M2** - receivers acquire their `selftelemetry.Receiver` from `TelemetrySettings.MeterProvider`; the M9 receiver will run without M2 (noop telemetry) but operators won't see `/metrics` / `/healthz` / `/readyz` until M2 is wired in `cmd/tracecore`. See [RFC-0007](docs/rfcs/0007-kernelevents-receiver-scope.md). (Originally numbered 0005; renamed to 0007 in the M8↔M9 merge to resolve a collision with the dcgm RFC that landed on main as RFC-0005.) diff --git a/MILESTONES.md b/MILESTONES.md index 5a6d7b8d..c2df8c80 100644 --- a/MILESTONES.md +++ b/MILESTONES.md @@ -17,7 +17,7 @@ Per [RFC-0013 §4](docs/rfcs/0013-distro-first-pivot.md#4-migration-timeline-rel | Release | Deletions | Adoptions | |---|---|---| -| **v0.1.0** | `clockreceiver`, `dcgm` (cgo stub never shipped real path), `kueue` (never shipped), self-tel internal packages (`internal/componentstatus`, `internal/selftelemetry`, `internal/telemetry`); legacy `release.yml` rewritten onto goreleaser stack (prior workflow in git history) | OCB skeleton + `builder-config.yaml`; upstream `componentstatus` + `service/telemetry`; goreleaser + SLSA stack; `ko` for image build; `telemetrygeneratorreceiver` for heartbeat | +| **v0.1.0** | `clockreceiver`, `dcgm` (cgo stub never shipped real path), `kueue` (never shipped), self-tel internal packages (`internal/componentstatus`, `internal/selftelemetry`, `internal/telemetry`), boot-path internals (`internal/pipeline`, `internal/pipelinebuilder`, `internal/config`, `internal/consumer`, `internal/fanout`, `internal/runtime/lifecycle`); legacy `release.yml` rewritten onto goreleaser stack (prior workflow in git history) | OCB 
skeleton + `builder-config.yaml`; upstream `componentstatus` + `service/telemetry`; upstream `service` / `confmap` for boot path + config loading; goreleaser + SLSA stack; `ko` for image build; `hostmetricsreceiver` loadscraper for heartbeat | | **v0.2.0** | `kernelevents`, `k8sevents`, `kineto` receivers + integration workflows (`.github/workflows/kernelevents-integration.yml`) | Recipes: `filelogreceiver`+container stanza+`file_storage`; `journaldreceiver`+`filelogreceiver`+OTTL transform; `k8sobjectsreceiver`+transform; `prometheusreceiver` (Kueue + dcgm-exporter / ROCm / Intel / Habana); `tracecoreai/tracecore-components` Go module split | | **v0.3.0** | `pyspy` receiver + `python/tracecore_pyspy/` PyPI helper + `tools/pyspy-lint/`; `.github/workflows/{pyspy-integration,python-publish}.yml` | `parca-agent` adoption recipe (deployed via separate chart). Kineto re-evaluated against OTel Profiles GA | @@ -108,9 +108,9 @@ Every milestone, in every lane, satisfies all seven principles below. Depth live ### M1. Pipeline runtime & component contract - **Status:** ☑ delivered (PRs #12 + #13) -- **Status (RFC-0013):** DELETED at v0.1.0 (pipeline boot path) - replaced by OCB-generated `main.go` from `builder-config.yaml`. **PR-A2 landed (#189)**: `cmd/tracecore/` deleted (3,032 LOC across 14 source + 7 test files); the OCB binary at `./_build/tracecore` is the canonical entry point. **PR-F.1 landed (#206)**: `internal/selftelemetry/` + `internal/telemetry/` deleted; every receiver/exporter now travels its own `selftel.go` + `lifecycle.go` siblings (PR-B1-shape sibling ports: #184/#185/#186/#187/#188/#193/#194/#196/#197). **PR-F.2 deferred**: `internal/{componentstatus,pipeline,pipelinebuilder,consumer,fanout,runtime/lifecycle}` drop together once the last three pipeline+consumer-importing receivers land (#204 k8sevents, #205 clockreceiver, #207 otlphttp — all PR-B2-shape ports off canonical #201). `internal/config/` retained (still load-bearing for `tracecore validate`). 
The bundled `components/receivers/clockreceiver/` and `components/exporters/stdoutexporter/` canonical examples are queued for deletion at v0.2.0 (PR-K.2); `clockreceiver` replaced by `hostmetricsreceiver` (loadscraper @ 1s) per PR-E unblocking (#180) — the originally-planned `telemetrygeneratorreceiver` does not exist in opentelemetry-collector-contrib at any tag. +- **Status (RFC-0013):** DELETED at v0.1.0 (pipeline boot path) - replaced by OCB-generated `main.go` from `builder-config.yaml`. **PR-A2 landed (#189)**: `cmd/tracecore/` deleted (3,032 LOC across 14 source + 7 test files); the OCB binary at `./_build/tracecore` is the canonical entry point. **PR-F.1 landed (#206)**: `internal/selftelemetry/` + `internal/telemetry/` deleted; every receiver/exporter now travels its own `selftel.go` + `lifecycle.go` siblings (PR-B1-shape sibling ports: #184/#185/#186/#187/#188/#193/#194/#196/#197). **PR-F.2 landed**: `internal/{componentstatus,pipeline,pipelinebuilder,config,consumer,fanout,runtime/lifecycle}` all deleted (56 files / -6,888 LOC) after the last pipeline+consumer-importing receivers landed (#204 k8sevents, #205 clockreceiver, #207 otlphttp — all PR-B2-shape ports off canonical #201). `internal/config/` deleted too; YAML loading delegated to upstream `confmap` providers. The bundled `components/receivers/clockreceiver/` and `components/exporters/stdoutexporter/` canonical examples are queued for deletion at v0.2.0 (PR-K.2); `clockreceiver` replaced by `hostmetricsreceiver` (loadscraper @ 1s) per PR-E unblocking (#180) — the originally-planned `telemetrygeneratorreceiver` does not exist in opentelemetry-collector-contrib at any tag. - **Depends on:** none (foundational) -- **Reference:** [RFC-0003](docs/rfcs/0003-pipeline-runtime-and-component-contract.md). Contract documented in [`internal/pipeline/README.md`](internal/pipeline/README.md). +- **Reference:** [RFC-0003](docs/rfcs/0003-pipeline-runtime-and-component-contract.md). 
The contract (formerly documented in `internal/pipeline/README.md`) was superseded at v0.1.0 by upstream `go.opentelemetry.io/collector/{component,receiver,processor,exporter,consumer,pipeline}` per RFC-0013. **Functional rubrics:** - ☑ Component / Host / Factory contracts and per-signal factory methods land per RFC-0003 §`Component` interface / §`Host` interface / §Per-signal factory methods; `internal/pipeline` package implements both. @@ -121,7 +121,7 @@ Every milestone, in every lane, satisfies all seven principles below. Depth live - ☑ Operator UX: config errors carry `file:line:col`, empty-pipeline boot, first-data log line, `pipelinetest.New(t)` test fixture, `tracecore validate` subcommand; all per RFC-0003 §Operator UX patterns + §Tests + §CLI integration. **Non-functional rubrics:** -- ☑ Component contract documented in `internal/pipeline/README.md` so receiver authors have a single canonical reference. (per RFC-0003 §Tests) +- ☑ Component contract documented (in the now-deleted `internal/pipeline/README.md`) so receiver authors had a single canonical reference. (per RFC-0003 §Tests). Superseded at v0.1.0 by upstream `go.opentelemetry.io/collector` documentation. ### M2. Self-telemetry surface @@ -258,7 +258,7 @@ M20a/b/c are gates against the same artifact (`bench/install/run.sh`) at progres - **Status:** ☑ partial (PR #30 - xid / cpu-steal / pod-evict shipped; nccl-hang CLI integration is a stub returning `ErrPending` per `tools/failure-inject/ncclhang/`) - **Depends on:** none -- **Landed:** `tools/failure-inject/` (`xid`/`pod-evict`/`cpu-steal`); `internal/pipeline/chaos_test.go`; `.github/workflows/chaos.yml`. +- **Landed:** `tools/failure-inject/` (`xid`/`pod-evict`/`cpu-steal`); `internal/pipeline/chaos_test.go` (deleted at v0.1.0 with the in-tree pipeline runtime per RFC-0013 PR-F.2; upstream `service` covers the equivalent panic-recovery contract); `.github/workflows/chaos.yml`. 
- **Carry-forward:** (1) `nccl-hang` CLI subcommand - wrap `pkg/nccl/fr_parser/synthesize.go` (capability already present from M11); (2) cross-arch SHA-256 equality check - `chaos.yml` builds on amd64 + arm64 but does not cross-compare the seeded outputs; (3) matrix grows as patterns land - when M17 / M18 / M19 ship, add a `pattern:` matrix entry per pattern (per NORTHSTARS O1). **Functional rubrics:** @@ -268,7 +268,7 @@ M20a/b/c are gates against the same artifact (`bench/install/run.sh`) at progres - ⧗ `nccl-hang` pickle stream contains only safe opcodes (dict, list, tuple, int, str, bytes, None, refs); no `REDUCE`, `BUILD`, `GLOBAL`, or `INST` opcodes appear. *(Underlying `pkg/nccl/fr_parser` enforces this default-deny set; the CLI wrapper that exposes it ships with carry-forward item #1.)* (per PRINCIPLES §9) - ☑ `failure-inject pod-evict` creates a real k8s `Event` with `Reason=Evicted` against the in-cluster ServiceAccount or `--kubeconfig`; event is observable via `kubectl get events -o json` within 5 seconds *(unverified - requires live cluster)*. - ☑ `failure-inject cpu-steal` pins a busy-loop to `--core N` for `--duration D`; `mpstat -P N 1` reports `%steal+%user ≥ 95%` for at least `D-1` seconds; process exits 0. (per NORTHSTARS Appendix A pattern #6) -- ☑ `internal/pipeline/chaos_test.go` under `-tags=chaos` pairs panic-or-error receiver with panic-or-error exporter; runtime stays alive for ≥100 iterations without leaking goroutines. (per PRINCIPLES §1) +- ☑ `internal/pipeline/chaos_test.go` under `-tags=chaos` paired panic-or-error receiver with panic-or-error exporter; runtime stayed alive for ≥100 iterations without leaking goroutines. (per PRINCIPLES §1). Deleted at v0.1.0 with the in-tree pipeline runtime per RFC-0013 PR-F.2; upstream `service` provides the equivalent contract. 
- ☑ `.github/workflows/chaos.yml` runs nightly with a matrix entry per landed pattern (M17 / M18 / M19); each entry invokes the relevant `failure-inject` subcommand and asserts the corresponding pattern emits ≥1 match. Matrix grows as patterns land - M4b ships without forward-referencing unbuilt patterns. (per NORTHSTARS O1) **Non-functional rubrics:** @@ -364,7 +364,7 @@ M20a/b/c are gates against the same artifact (`bench/install/run.sh`) at progres **Functional rubrics:** - ☑ Tails `/dev/kmsg` and `journalctl --output=json --follow` behind one config block via the `source` interface. (per `components/receivers/kernelevents/source.go`; RFC-0007 §Design overview) -- ☑ Lifecycle (cancel, WaitGroup, panic recovery, channel ownership) lives in `internal/runtime/lifecycle.Lifecycle`, not inlined per receiver. (per RFC-0007 §Design overview) +- ☑ Lifecycle (cancel, WaitGroup, panic recovery, channel ownership) lives in a package-local `lifecycle` sibling per receiver (originally a shared `internal/runtime/lifecycle.Lifecycle` helper, ported to per-receiver siblings during the PR-B1 wave and inherited by kernelevents from k8sevents per RFC-0013 PR-F.2). (per RFC-0007 §Design overview) - ☑ NVRM-prefixed Xid extraction populates `kernelevents.xid` and `gpu.id` (PCI BDF) on emitted log records. (per RFC-0007 §Design overview) - ☑ RE2 `reason_regex` and source-filter regexes compile at Start (DoS-safe; a bad regex fails Validate with exit 2). (per RFC-0007 §Config schema) - ☑ Trace context propagated from journald `_TRACE_ID` / `_SPAN_ID` onto emitted records. (per RFC-0007 §Design overview) diff --git a/README.md b/README.md index cc9cef24..510522e9 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ Lifecycle logs go to stderr. 
Run `./tracecore --help` for the full flag set and | If you're a … | Start here | |---|---| | **Operator** running tracecore in production | [`docs/getting-started.md`](docs/getting-started.md) → bundled recipes under [`docs/integrations/`](docs/integrations/) → [`docs/FAILURE-MODES.md`](docs/FAILURE-MODES.md) | -| **Contributor** adding a receiver / processor / exporter | [`CONTRIBUTING.md`](CONTRIBUTING.md) → [`PRINCIPLES.md`](PRINCIPLES.md) (the *why*) → [`STYLE.md`](STYLE.md) (the *what*) → [`internal/pipeline/README.md`](internal/pipeline/README.md) | +| **Contributor** adding a receiver / processor / exporter | [`CONTRIBUTING.md`](CONTRIBUTING.md) → [`PRINCIPLES.md`](PRINCIPLES.md) (the *why*) → [`STYLE.md`](STYLE.md) (the *what*) → upstream [`go.opentelemetry.io/collector`](https://pkg.go.dev/go.opentelemetry.io/collector) component/receiver/processor/exporter packages | | **Maintainer** making architectural calls | [`docs/STRATEGY.md`](docs/STRATEGY.md) → [`NORTHSTARS.md`](NORTHSTARS.md) → [`docs/rfcs/`](docs/rfcs/) → [`MILESTONES.md`](MILESTONES.md) → [`docs/FOLLOWUPS.md`](docs/FOLLOWUPS.md) | | **Evaluating** tracecore for your fleet | This README + [`CHANGELOG.md`](CHANGELOG.md) → [`docs/STRATEGY.md`](docs/STRATEGY.md) "single load-bearing principle" | | **Verifying** a published release end-to-end (auditor / supply-chain) | [`docs/reproducibility.md`](docs/reproducibility.md) (rebuild → diffoscope → cosign → SLSA → SBOM) | diff --git a/components/receivers/kernelevents/README.md b/components/receivers/kernelevents/README.md index d31ab5ed..9d278da4 100644 --- a/components/receivers/kernelevents/README.md +++ b/components/receivers/kernelevents/README.md @@ -320,7 +320,7 @@ follow-up. ``` - **Sibling isolation:** receiver-level cancel cascades to each source's lifecycle via `lc.Start(parent, ...)`; one source's Shutdown can't take down a sibling. 
-- **One mechanism for cancel/wg/panic:** both sources + the receiver's run loop use the package-local `lifecycle` sibling (see `lifecycle.go`). Mirrors the PR-B1 pattern adopted across receivers migrating off `internal/runtime/lifecycle` (RFC-0013 PR-F). +- **One mechanism for cancel/wg/panic:** both sources + the receiver's run loop use the package-local `lifecycle` sibling (see `lifecycle.go`). Mirrors the PR-B1 pattern adopted across receivers that migrated off the now-deleted `internal/runtime/lifecycle` (RFC-0013 PR-F.2). - **Fast-path filter** lives in `kmsgSource.prefilter` (priority byte before full parse). Full filter chain (`facilities`, `min_severity`, `regex_filter`) runs in the receiver's run loop after parse - that's where attribute-shape decisions are visible. ## Testing locally diff --git a/docs/FAILURE-MODES.md b/docs/FAILURE-MODES.md index 6dfa3113..d9a8b7a7 100644 --- a/docs/FAILURE-MODES.md +++ b/docs/FAILURE-MODES.md @@ -47,16 +47,16 @@ existing alerts survive the swap from in-tree receiver to upstream + OTTL. | Scenario | Behaviour | Test | |---|---|---| | 🟢 Empty config (no pipelines) | OCB logs `"no enabled receivers"` / `"no enabled pipelines"` at WARN and exits cleanly on signal. | Upstream `service.Settings.Validate`; RFC-0013 PR-A2 retired the legacy in-tree test. | -| 🟢 Bad YAML | `config.Load` returns `LoadError` with file + line number. `tracecore` exits **2** (EX_DATAERR). | `internal/config/load_test.go::TestLoad_UnknownTopLevelField_LineNumberedError` | -| 🟢 Multi-document YAML | Rejected by the loader (silent data loss would surprise operators copying OTel configs). | `internal/config/load_test.go::TestLoad_MultiDocumentYAML_Rejected` | +| 🟢 Bad YAML | OCB's `confmap` returns a file:line-tagged error. `tracecore` exits non-zero. | Upstream `confmap` / `service.Settings.Validate`; RFC-0013 PR-F.2 retired the legacy in-tree `internal/config` test. 
| +| 🟢 Multi-document YAML | Rejected by the upstream YAML provider (silent data loss would surprise operators copying OTel configs). | Upstream `confmap/provider/yamlprovider`; RFC-0013 PR-F.2 retired the legacy in-tree `internal/config` test. | | 🟢 Unknown component type | OCB names the offending YAML key. Exit non-zero. | Upstream `otelcol.Validate`; RFC-0013 PR-A2 retired the legacy in-tree test. | | 🟢 Receiver/exporter `Create*` returns ErrSignalNotSupported | OCB names the pipeline + signal. Exit non-zero. | Upstream `service/pipelines`; RFC-0013 PR-A2 retired the legacy in-tree test. | | 🟢 Multi-instance components | OCB builds independent instances per `/` ID (e.g. `otlphttp/primary` + `otlphttp/secondary`). | Upstream `service/pipelines`; RFC-0013 PR-A2 retired the legacy in-tree test. | -| 🟢 Component `Start` returns error | Runtime cancels Start-loop, calls `Shutdown` on already-started Components, returns the original error. | `internal/pipeline/runtime_test.go::TestRuntime_StartFailure_ShutdownUnwindsOnlyStarted` | -| 🟢 `Start` after `Shutdown` race | Second-to-acquire lifecycleMu returns nil (graceful no-op). No leaked Components. | `internal/pipeline/runtime_test.go::TestRuntime_ConcurrentStartShutdown_NoLostComponents` | -| 🟢 `Shutdown` called twice | Second call is a no-op (slices are nil'd after first). Returns nil. | `internal/pipeline/runtime_test.go::TestRuntime_ShutdownTwice_IsIdempotent` | -| 🟢 `Shutdown` without `Start` | No-op. Returns nil. | `internal/pipeline/runtime_test.go::TestRuntime_ShutdownWithoutStart_IsNoOp` | -| 🟢 `Start` called twice | Second call returns "runtime: Start called twice". | `internal/pipeline/runtime_test.go::TestRuntime_StartTwice_ReturnsError` | +| 🟢 Component `Start` returns error | Upstream runtime cancels Start-loop, calls `Shutdown` on already-started Components, returns the original error. | Upstream `service/pipelines`; RFC-0013 PR-F.2 retired the legacy in-tree `internal/pipeline` runtime + tests. 
| +| 🟢 `Start` after `Shutdown` race | Upstream lifecycle gates Start once Shutdown has fired. No leaked Components. | Upstream `service`; RFC-0013 PR-F.2 retired the legacy in-tree test. | +| 🟢 `Shutdown` called twice | Upstream runtime treats the second call as a no-op. | Upstream `service`; RFC-0013 PR-F.2 retired the legacy in-tree test. | +| 🟢 `Shutdown` without `Start` | Upstream runtime treats it as a no-op. | Upstream `service`; RFC-0013 PR-F.2 retired the legacy in-tree test. | +| 🟢 `Start` called twice | Upstream `Collector.Run` rejects a second start. | Upstream `otelcol.Collector.Run`; RFC-0013 PR-F.2 retired the legacy in-tree test. | | 🔴 `Component.Start` ignores ctx and hangs forever | Runtime waits indefinitely. **No timeout on Start.** Operator must `kill -9`. | - (deliberate; the contract says Start respects ctx) | ## Signals @@ -73,12 +73,12 @@ existing alerts survive the swap from in-tree receiver to upstream + OTTL. | Scenario | Behaviour | Test | |---|---|---| -| 🟢 Component `Consume*` panics | Recovered by `pipeline.WrapSafeX`. Logged at Error with component ID + panic value. Surfaced as ordinary error to caller. Wrapper remains usable for subsequent pushes. | `internal/pipeline/saferun_test.go::TestWrapSafe_PanicsRecovered_PerSignal` | -| 🟢 Component `Consume*` returns error | Propagates up to the receiver via the chain. Fanout joins errors across consumers; no early-exit. | `internal/fanout/fanout_test.go::TestMetrics_ErrorsJoined_AllInvoked` | -| 🟢 Fan-out cloning | Mutating consumers get a fresh clone (CopyTo); the last mutator gets the donated original when safe. Multiple readonly consumers share one payload via `MarkReadOnly`. | `internal/fanout/fanout_test.go::TestFanoutContract_*` | -| 🟢 Nested-payload mutation isolation | Attribute-map writes in a mutating consumer don't leak into a sibling consumer's clone. Verified at every nesting level (ResourceMetrics → ScopeMetrics → DataPoint → pcommon.Map). 
| `internal/fanout/fanout_test.go::TestMetrics_NestedPayload_DeepCloneIsolation` | -| 🟢 First-data instrumentation | Logs `"pipeline first data"` once per pipeline so operators can verify aliveness without external tooling. | `internal/pipeline/firstdata_test.go::TestWrapFirstDataMetrics_LogsOnlyOnce` | -| 🔴 Receiver-internal goroutine panics (before reaching `next.Consume*`) | **Process crashes.** `WrapSafe*` wraps the consumer seam, not the receiver's own goroutine body. Receivers MUST `defer/recover` in their own goroutines - see `internal/pipeline/README.md` Pitfalls. The linked test exercises the *recipe* (it shows what to do); there is no test for the negative case because asserting "the process crashes" from inside that process isn't tractable. | `internal/pipeline/example_receiver_recover_test.go::TestReceiver_GoroutineDeferRecover_KeepsProcessAlive` (recipe, not crash assertion) | +| 🟢 Component `Consume*` panics | Recovered by upstream `service/pipelines`. Logged at Error with component ID + panic value. Surfaced as ordinary error to caller. | Upstream `service/pipelines`; RFC-0013 PR-F.2 retired the legacy in-tree `WrapSafeX` wrapper + tests. | +| 🟢 Component `Consume*` returns error | Propagates up to the receiver via the chain. Upstream fanout joins errors across consumers; no early-exit. | Upstream `internal/fanoutconsumer` in `go.opentelemetry.io/collector`; RFC-0013 PR-F.2 retired the legacy in-tree `internal/fanout` test. | +| 🟢 Fan-out cloning | Mutating consumers get a fresh clone (CopyTo); the last mutator gets the donated original when safe. Multiple readonly consumers share one payload via `MarkReadOnly`. | Upstream `internal/fanoutconsumer`; RFC-0013 PR-F.2 retired the legacy in-tree test. | +| 🟢 Nested-payload mutation isolation | Attribute-map writes in a mutating consumer don't leak into a sibling consumer's clone. Verified at every nesting level (ResourceMetrics → ScopeMetrics → DataPoint → pcommon.Map). 
| Upstream `internal/fanoutconsumer`; RFC-0013 PR-F.2 retired the legacy in-tree test. | +| 🟢 First-data instrumentation | Upstream collector emits the `otelcol_receiver_*_accepted` counter family; operators verify pipeline liveness via the standard metrics. | Upstream `service/telemetry`; RFC-0013 PR-F.2 retired the legacy in-tree `WrapFirstDataMetrics` log line + test. | +| 🔴 Receiver-internal goroutine panics (before reaching `next.Consume*`) | **Process crashes.** The upstream consumer-seam recovery wraps `Consume*`, not the receiver's own goroutine body. Receivers MUST `defer/recover` in their own goroutines - see upstream `receiverhelper` docs. | - (no in-tree test; asserting "the process crashes" from inside that process isn't tractable; the recipe is documented at the receiver layer). | | 🔴 Exporter writes to a closed stdout pipe | If the parent process exits or the pipe is closed mid-write, `debugexporter.ConsumeMetrics` returns the write error; the runtime surfaces it but tracecore does not retry, queue, or drop. Operator sees a stream of "write metrics line" errors until shutdown. Source: upstream `debugexporter`. | - (no test today; first real failure-mode test target for M8+) | | 🔴 Receiver pushes to a closed channel / nil consumer | Standard Go panic, recovered by `WrapSafeX` IF the panic surfaces in the Consume call. If the receiver itself nil-derefs before calling Consume, see above. | - | @@ -86,10 +86,8 @@ existing alerts survive the swap from in-tree receiver to upstream + OTTL. | Scenario | Behaviour | Test | |---|---|---| -| 🟡 Phase-1 (receivers) ≤ `ReceiverShutdownTimeout` (1s) | Receivers shut down in parallel. If any exceed 1s, the runtime **abandons** them and logs `"shutdown phase 1 deadline elapsed"`. In-flight Shutdowns continue in background goroutines until they return or the process exits. 
| - (no dedicated test; the parallel Phase-1 path is exercised indirectly by `TestRuntime_ShutdownOrder_ReceiversFirstThenLIFO`) | -| 🟡 Phase-2 (processors + exporters) ≤ `WithDrainBudget` (default 10s; capped at `HardDrainCeiling` 30s) | Serial-LIFO. If budget elapses, remaining components are **skipped** and logged. | `internal/pipeline/runtime_test.go::TestRuntime_PhaseTwoBudgetElapsed_ReturnsError` | -| 🟡 Operator sets drain budget > 30s | Clamped to 30s with WARN log. Operators who genuinely need longer should fix the exporter, not extend the timeout. | `internal/pipeline/runtime_test.go::TestRuntime_DrainBudgetClamp` | -| 🟢 Component `Shutdown` panics | Recovered by `safeShutdown`. Returned as an error joined into the shutdown error set. Process exits, no crash. | `internal/pipeline/runtime_test.go::TestRuntime_PanickingShutdown_RecoveredAsError` | +| 🟡 Shutdown budget | Upstream `otelcol.Collector` honours `--config`'s graceful shutdown timeout (default 30s); receivers shut down before processors+exporters; the upstream collector logs at WARN when a component exceeds its budget. | Upstream `otelcol.Collector`; RFC-0013 PR-F.2 retired the legacy in-tree two-phase shutdown runtime + tests. The bespoke parallel-Phase-1 / serial-Phase-2 split was an in-tree divergence (documented in STRATEGY.md "Current accepted divergences") that closed when tracecore adopted upstream `service`. | +| 🟢 Component `Shutdown` panics | Recovered by upstream `service`. Returned as an error joined into the shutdown error set. Process exits, no crash. | Upstream `service`; RFC-0013 PR-F.2 retired the legacy in-tree test. | ## Backend and vendor connectivity (M6) @@ -97,7 +95,7 @@ existing alerts survive the swap from in-tree receiver to upstream + OTTL. 
|---|---|---| | 🟢 Exporter unreachable (network error mid-send) | `otlphttp` retries on retryable HTTP status codes (429/502/503/504) and on network errors with exponential backoff; final error propagates to the receiver as a `Permanent` or `Retryable` `kind`, surfaced via `otelcol_exporter_calls_total{outcome="error"}` (post-RFC-0013 naming). | `components/exporters/otlphttp/otlphttp_test.go::TestExporter_RetriesOnNetworkError` | | 🟢 Vendor SDK failure (`dcgm-exporter` unreachable at Start) | `prometheusreceiver` records the scrape failure and emits `up=0`; the pipeline continues without the scrape target's contribution rather than failing the whole binary. Source: upstream `prometheusreceiver` scraping `dcgm-exporter` per the bundled recipe. | recipe-level alert `DCGMReceiverDegraded`; see `tracecore-recipes` chart. | -| 🟢 Config invalid (unknown top-level field) | Loader returns a `file:line:column` error citing the offending key; `tracecore validate` and `tracecore collect` both exit 2 (EX_DATAERR) before any I/O. | `internal/config/load_test.go::TestLoad_UnknownTopLevelField_LineNumberedError` | +| 🟢 Config invalid (unknown top-level field) | Upstream `confmap` returns a path-tagged error citing the offending key; `tracecore validate` exits non-zero before any I/O. | Upstream `confmap`; RFC-0013 PR-F.2 retired the legacy in-tree `internal/config` test. | | 🟢 Config invalid (bad exporter endpoint) | `otlphttp` rejects non-http/https schemes at validate time with `otlphttp: endpoint: scheme must be http or https`; exit 2. | `components/exporters/otlphttp/otlphttp_test.go::TestConfig_Validate_RejectsNonHTTPSchemes` | ## Self-telemetry surface (M2) @@ -107,7 +105,7 @@ existing alerts survive the swap from in-tree receiver to upstream + OTTL. | 🟢 `telemetry.enabled: false` | No HTTP listener bound. Default. | Migrated to upstream `service.telemetry` per RFC-0013; the in-tree `internal/telemetry` Server was deleted in PR-F.1. 
PR-K removes the legacy `telemetry:` top-level chart key. | | 🟢 `service.telemetry.metrics.address` already in use | Upstream `service/telemetry` returns the bind error before pipelines start; the OCB binary exits non-zero with the bind diagnostic on stderr; no partial state. | - (port-conflict triggers upstream's synchronous bind error path; covered by `service/telemetry` upstream tests) | | 🟢 Bad `service.telemetry.metrics.address` (e.g., `notavalidaddress`) | `tracecore validate` rejects with the upstream `service.telemetry.metrics.address` parse error and exits non-zero. | - (delegated to upstream `confmap` + `service/telemetry` validation) | -| 🟢 Non-absolute path (`metrics: m`) | Validate rejects with `telemetry.paths.metrics: must be an absolute path starting with '/'`. Exit 2. | `internal/config/telemetry_test.go::TestTelemetry_RejectsNonAbsolutePath` | +| 🟢 Non-absolute path (`metrics: m`) | Upstream `service/telemetry` rejects non-absolute paths at config validate. Exit non-zero. | Upstream `service/telemetry`; RFC-0013 PR-F.2 retired the legacy in-tree `internal/config` test. | | 🟢 `/healthz` during shutdown | Returns 503 once the upstream `healthcheckextension` flips to shutdown mode. Operator's k8s livenessProbe sees the transition. | - (delegated to upstream `healthcheckextension`; the in-tree `internal/telemetry` Server was deleted in PR-F.1) | | 🟢 Liveness/readiness probes during shutdown | Probes served by the upstream `healthcheckextension` (chart-default port `:13133`). Extension flips to 503 on collector `Shutdown`; the kubelet probe transition mirrors the previous `/healthz` + `/readyz` behaviour. See [`docs/migration/v0.1-to-v0.2.md`](migration/v0.1-to-v0.2.md) "Self-telemetry chart values keys" for the values-key rename. | - (delegated to upstream `healthcheckextension`) | | 🟢 Scrape during high-cardinality emission | The OTel SDK + Prometheus exporter handle thousands of series; receivers MUST keep `kind` labels low-cardinality by contract. 
Source: upstream `service/telemetry` + standard `otelcol_*` metric family (RFC-0013 §Migration PR-B). | - (cardinality contract is documentation-only; no runtime guard) | diff --git a/docs/README.md b/docs/README.md index 8ccbb9b2..768561fb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -73,7 +73,6 @@ Source (receiver-side) recipes — RFC-0013 §migration PR-J replacements for th | [components/receivers/pyspy/README.md](../components/receivers/pyspy/README.md) | 👤 🛠️ | On-demand Python stack-sampling receiver (faulthandler-based). - scheduled for deletion per RFC-0013 §7 | | [components/receivers/pyspy/RUNBOOK.md](../components/receivers/pyspy/RUNBOOK.md) | 👤 | Operator playbook + per-kind triage (RFC-0009 degraded modes). - scheduled for deletion per RFC-0013 §7 | | [components/exporters/otlphttp/README.md](../components/exporters/otlphttp/README.md) | 👤 🛠️ | OTLP/HTTP exporter - production sink to an OTel collector or backend. | -| [internal/pipeline/README.md](../internal/pipeline/README.md) | 🛠️ | Pipeline runtime contract; receiver/processor/exporter author quickstart. | ## What goes where (for contributors) diff --git a/docs/STRATEGY.md b/docs/STRATEGY.md index 946f3390..f40f1728 100644 --- a/docs/STRATEGY.md +++ b/docs/STRATEGY.md @@ -30,14 +30,17 @@ in-house - and propose it upstream when stable. **Post-RFC-0013 status of the M2 divergence list below:** the `Host.ReportStatus` / `CreateSettings` / `TelemetrySettings` / -self-tel-bind / metric-name / `componentstatus`-package rows close -when v0.1.0 adopts upstream `go.opentelemetry.io/collector/component/componentstatus` -+ `service/telemetry` and standard `otelcol_*` metric names (see -RFC-0013 §2 adoption matrix, §4 release boundaries, §7 deletion -list for `internal/componentstatus`, `internal/selftelemetry`, -`internal/telemetry`). The table below is preserved as the -historical decision trail; rows marked **done (M2)** ship as drop-in -upstream code at v0.1.0, not as in-tree packages. 
Rows tied to the +self-tel-bind / metric-name / `componentstatus`-package rows +**closed at v0.1.0** when tracecore adopted upstream +`go.opentelemetry.io/collector/component/componentstatus` + +`service/telemetry` and standard `otelcol_*` metric names. The +`internal/{componentstatus,selftelemetry,telemetry,pipeline, +pipelinebuilder,config,consumer,fanout,runtime/lifecycle}` packages +referenced in some rows below have been **deleted** (PR-F.1 + PR-F.2 +per RFC-0013 §7); see PR-A1 / PR-A2 for the OCB-generated boot path +that replaced them. The table is preserved as the historical +decision trail; rows marked **done (M2)** ship as drop-in upstream +code at v0.1.0, not as in-tree packages. Rows tied to the vendor-SDK receiver shape (cardinality cap, per-receiver scope name, Prometheus-scrape filtering, expfmt) move to the recipe layer - the OTTL transform processor in the bundled Helm chart applies them @@ -194,8 +197,11 @@ Concrete milestones for 1.0: ### What 1.0 commits to (when we get there) -- Stable Component / Factory / Host interfaces in `internal/pipeline/` - (these graduate to `pkg/` at 1.0) +- Stable Component / Factory / Host interfaces — adopted from + upstream `go.opentelemetry.io/collector/{component,receiver, + processor,exporter,consumer,pipeline}` per RFC-0013; tracecore + tracks the upstream stability contract rather than maintaining + a parallel surface. 
- Stable component YAML config schemas (per-component config types freeze their public fields) - Stable pdata version pinning (we move with OTel's pdata major diff --git a/docs/migration/v0.1-to-v0.2.md b/docs/migration/v0.1-to-v0.2.md index f749fad4..9a5f08f1 100644 --- a/docs/migration/v0.1-to-v0.2.md +++ b/docs/migration/v0.1-to-v0.2.md @@ -130,9 +130,7 @@ CI workflows changed path triggers from `cmd/tracecore/**` to `builder-config.ya ## `internal/*` package deletion (PR-F) -> **Status:** PR-F.1 landed (#206) — `internal/selftelemetry/` and `internal/telemetry/` are already gone in current main. PR-F.2 (deletes `internal/{componentstatus,pipeline,pipelinebuilder,consumer,fanout,runtime/lifecycle}`) is gated on three open ports: #204 (k8sevents), #205 (clockreceiver), #207 (otlphttp). Once those land, the remaining `internal/*` runtime packages drop in a single cut before v0.2.0 GA. - -Several internal Go packages were load-bearing only for the deleted `cmd/tracecore` boot path and the in-tree receivers/exporters. Third-party Go importers (unlikely in OSS pre-1.0; the packages live under `internal/` and the Go compiler rejects external imports) lose: +> **Status:** PR-F.1 and PR-F.2 have landed. The packages listed below are **deleted** in v0.2.0 builds. Third-party Go importers (unlikely in OSS pre-1.0; the packages live under `internal/` and the Go compiler rejects external imports) lose: | Package | Public surface | Migration | |---|---|---| @@ -142,7 +140,7 @@ Several internal Go packages were load-bearing only for the deleted `cmd/traceco | `internal/telemetry` | `ServerConfig`, `Paths`, `Server`, `NewServer`, `MeterProvider`, `NewMeterProvider`, `WindowedRate`, `AggregateSLOSource`, `ExporterRegistry`, `SLOSource` | Self-telemetry HTTP server is replaced by `service.telemetry.metrics.address`; `MeterProvider` is replaced by the upstream collector's internal meter provider. 
The probe-server `Server` (paths `/healthz` / `/readyz`) is replaced by `healthcheckextension`. | | `internal/pipeline`, `internal/pipelinebuilder`, `internal/consumer`, `internal/fanout` | Pipeline assembly helpers | Replaced wholesale by upstream `go.opentelemetry.io/collector/service.New(...)` driven by `builder-config.yaml`. | -PR-F will collapse the `internal/` tree to: `safe`, `config`, `synthesis`, `version`, `sli`, `integration` (the OCB scrape test). Everything else listed above goes away. +PR-F.2 collapsed the `internal/` tree to: `safe`, `synthesis`, `version`, `sli`, `integration` (the OCB scrape test). Everything else listed above is gone. ## Reproducibility note diff --git a/docs/rfcs/0013-distro-first-pivot.md b/docs/rfcs/0013-distro-first-pivot.md index f6420646..2aef31e2 100644 --- a/docs/rfcs/0013-distro-first-pivot.md +++ b/docs/rfcs/0013-distro-first-pivot.md @@ -171,8 +171,8 @@ Internal packages deleted: | `internal/componentstatus/` | `go.opentelemetry.io/collector/component/componentstatus` | v0.1.0 | | `internal/selftelemetry/` | `service/telemetry` + standard `otelcol_*` metrics | v0.1.0 | | `internal/telemetry/` | folds into upstream `service/telemetry` | v0.1.0 | -| `internal/pipeline/`, `internal/pipelinebuilder/`, `internal/config/`, `internal/consumer/`, `internal/fanout/` | OCB-generated boot path | v0.1.0 (audit first; keep only if a custom receiver/processor depends on a non-replaceable abstraction) | -| `internal/runtime/lifecycle/` | OCB / upstream `componenttest` | v0.2.0 (delete with last consumer) | +| `internal/pipeline/`, `internal/pipelinebuilder/`, `internal/config/`, `internal/consumer/`, `internal/fanout/` | OCB-generated boot path | v0.1.0 (landed PR-F.2; audit confirmed no custom-receiver dependency) | +| `internal/runtime/lifecycle/` | per-receiver package-local `lifecycle.go` siblings (PR-B1 pattern) | v0.1.0 (landed PR-F.2 — every receiver inherited a local sibling during the wave-3 ports, leaving the in-tree package 
without a non-test consumer) | External artifacts deleted: @@ -235,9 +235,9 @@ PR sequencing within v0.1.0: 5. **PR-C** (landed, #174): Release pipeline switches to goreleaser stack. Old `release.yml` archived under `.github/workflows/archived/`. 6. **PR-D** (landed, #176): Image build moves to `ko`. Chart `image.repository` continues to resolve. 7. **PR-E** (landed, #180): `clockreceiver` → `hostmetricsreceiver` (loadscraper @ 1s) in OCB manifest + bench-install Helm values. The originally-planned `telemetrygeneratorreceiver` does not exist in opentelemetry-collector-contrib at any tag (verified 2026-05-30; contrib issues #41687 and #43657 both closed `not_planned`). hostmetrics' loadscraper emits 3 low-cardinality series (`system.cpu.load_average.{1m,5m,15m}`) and satisfies the bench's "first parseable JSON line at sink" pass condition. Scope deferral: chart default stays `clockreceiver` and the in-tree source survives this PR (~92 references across `cmd/tracecore/*_test.go` + `internal/pipeline` + `internal/selftelemetry` fixtures); chart-default flip + source deletion ship as part of PR-K alongside coordinated test-fixture migration and the values-keys `NOTES.txt` deprecation cycle. -8. **PR-F** (lands after PR-A2 + PR-B1 + the wave-3 sibling-port PRs) splits into PR-F.1 + PR-F.2 to match the actual import-graph state — `internal/componentstatus` cannot be deleted until `internal/pipeline` migrates off it, and `internal/pipeline` is explicitly out of PR-F's scope. +8. **PR-F** (landed after PR-A2 + PR-B1 + the wave-3 sibling-port PRs + the wave-2 upstream-port PRs) split into PR-F.1 + PR-F.2. The original framing held `internal/pipeline` out of PR-F's scope because `internal/componentstatus` couldn't be deleted until `internal/pipeline` migrated off it. 
The wave-2 upstream-port PRs (#201/#202/#203/#204/#205/#207/#208/#209) cleared every external caller of `internal/{pipeline,consumer,runtime/lifecycle}`, so PR-F.2 collapsed the full boot-path infrastructure (`internal/pipeline` + `internal/pipelinebuilder` + `internal/config` + `internal/consumer` + `internal/fanout` + `internal/componentstatus` + `internal/runtime/lifecycle`) in one cut. - **PR-F.1** (landed): Delete `components/receivers/dcgm/` + `pkg/dcgm/` (cgo stub never shipped real code; live ports removed in #188's PR-B2-shaped dcgm sweep; kueue + kineto already deleted in #168). Delete `internal/selftelemetry/` — every consumer (containerstdout, clockreceiver, kernelevents, k8sevents, nccl_fr, dcgm, pyspy, stdoutexporter, otlphttp) ported onto receiver/exporter-scoped sibling `selftel.go` files in wave-3 of the pivot (#184/#185/#186/#187/#188/#193/#194/#196/#197). Delete `internal/telemetry/` — was the in-tree `MeterProvider` + probe-server (`/metrics`, `/healthz`, `/readyz`) wrapper; probes now flow through the upstream `healthcheckextension`, meter-provider via upstream `service.telemetry`; only remaining consumers were `internal/selftelemetry/*_test.go` (deleted in the same cut) and one orphan clockreceiver integration test (`components/receivers/clockreceiver/errors_integration_test.go`, left dangling by #185's PR-B1 clockreceiver port) which is deleted too. Retire the `dcgm` build tag; drop both deleted package rows from `make bench-check`; empty the `register-lint` allowlist (the two `internal/telemetry/{build_info,slo}.go` entries are gone). Chart `receivers.dcgm` toggle + `_helpers.tpl` doc-list + `NOTES.txt` warning retained until PR-K removes them outright (toggle is already inert — operators enabling `receivers.dcgm.enabled=true` have crashed at boot since PR-A2). 
- - **PR-F.2** (deferred — pending wave-3-followup completion): Delete `internal/componentstatus`, `internal/pipeline`, `internal/pipelinebuilder`, `internal/consumer`, `internal/fanout`, `internal/runtime/lifecycle`. Hard-gated on the remaining three receiver/exporter ports landing (clockreceiver / k8sevents / otlphttp pipeline+consumer ports are in flight as separate agents; the PR-B2-shape templates exist as PRs #205 / #204 / #207). Once those land, `internal/{pipeline,consumer,runtime/lifecycle,componentstatus,pipelinebuilder,fanout}` have zero non-test consumers and can drop in a single cut. The `clockreceiver` source deletion stays in PR-K (chart + values-keys deprecation cycle) — PR-F.2 only deletes the `internal/*` packages, not the canonical-example receivers themselves. See PR-E note for the chart-default flip rationale. + - **PR-F.2** (landed): Delete `internal/pipeline/`, `internal/pipelinebuilder/`, `internal/config/`, `internal/consumer/`, `internal/fanout/`, `internal/componentstatus/`, and `internal/runtime/lifecycle/` (56 files / -6,888 LOC). The OCB-generated `_build/main.go` consumes `builder-config.yaml` and produces an equivalent collector instance off upstream `go.opentelemetry.io/collector/service`; upstream `confmap` providers replace the in-tree YAML loader; upstream `componentstatus.ReportStatus` replaces the in-tree shim; the per-receiver lifecycle helpers (already siblings of each receiver from PR-B1) carry the cancel/wg/panic-recovery contract. `.golangci.yml` interface-forwarder regexps repointed at upstream `consumer.{Metrics,Traces,Logs}` + `component.Component`. `.github/workflows/chaos.yml` retires the `chaos-pipeline-test` job (the in-tree `internal/pipeline/chaos_test.go` covered a contract now provided by upstream `service`). `clockreceiver` source deletion remains deferred to PR-K — see PR-E note for rationale. 9. **PR-G** (landed, #182): Supersede RFCs (add status headers + redirects). Move RFC-0004 to `archived/`. 10. 
**PR-H** (landed, #183): Update top-level docs (README, NORTHSTARS, STRATEGY, PRINCIPLES, MILESTONES, CHANGELOG, CONTRIBUTING, AGENTS, docs/README). diff --git a/internal/componentstatus/componentstatus.go b/internal/componentstatus/componentstatus.go deleted file mode 100644 index ffb79a03..00000000 --- a/internal/componentstatus/componentstatus.go +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package componentstatus is tracecore's in-tree analogue of OTel -// collector's `go.opentelemetry.io/collector/component/componentstatus` -// at v0.152. ReportStatus is a free function that delegates to the -// host only when the host opts in via the optional StatusReporter -// interface — the pre-M2 Host.ReportStatus method has been removed. -// -// See docs/STRATEGY.md "Host.ReportStatus" divergence row. -package componentstatus - -import "github.com/tracecoreai/tracecore/internal/pipeline" - -// StatusReporter is the optional interface a pipeline.Host may -// implement to record a Component's status events. Hosts that don't -// care about status events simply omit this method; ReportStatus -// degrades to a no-op. -// -// The method is named ReportComponentStatus (not ReportStatus) so a -// type that satisfies it can't be confused with an old-style host -// that exposed ReportStatus directly. Once the pre-M2 method is -// removed, this is the only way to surface status into a host. -type StatusReporter interface { - ReportComponentStatus(ev pipeline.StatusEvent) -} - -// ReportStatus delivers ev to host if host implements StatusReporter, -// otherwise silently discards it. The "silent discard" is deliberate: -// receivers should not have to know which host implementation they're -// running against, and a binary that wants status logging wires a -// StatusReporter into its host. 
-// -// Mirrors OTel's componentstatus.ReportStatus(host, ev) signature -// (with a tracecore-flavored StatusEvent that already lived in -// pipeline.StatusEvent pre-M2). -func ReportStatus(host pipeline.Host, ev pipeline.StatusEvent) { - if r, ok := host.(StatusReporter); ok { - r.ReportComponentStatus(ev) - } -} diff --git a/internal/componentstatus/componentstatus_test.go b/internal/componentstatus/componentstatus_test.go deleted file mode 100644 index 72594b45..00000000 --- a/internal/componentstatus/componentstatus_test.go +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package componentstatus_test - -import ( - "errors" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/componentstatus" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// recordingHost implements both pipeline.Host and the optional -// componentstatus.StatusReporter interface so the free function can -// delegate to it. -type recordingHost struct { - events []pipeline.StatusEvent -} - -func (h *recordingHost) GetExtensions() map[pipeline.ID]pipeline.Component { - return map[pipeline.ID]pipeline.Component{} -} - -func (h *recordingHost) ReportComponentStatus(ev pipeline.StatusEvent) { - h.events = append(h.events, ev) -} - -// silentHost implements pipeline.Host but NOT StatusReporter — to -// pin the silent-discard contract. -type silentHost struct{} - -func (silentHost) GetExtensions() map[pipeline.ID]pipeline.Component { - return map[pipeline.ID]pipeline.Component{} -} - -// TestReportStatus_DelegatesToStatusReporter verifies that when a -// host implements StatusReporter, ReportStatus passes through. 
-func TestReportStatus_DelegatesToStatusReporter(t *testing.T) { - t.Parallel() - - h := &recordingHost{} - componentstatus.ReportStatus(h, pipeline.StatusEvent{Kind: "starting"}) - componentstatus.ReportStatus(h, pipeline.StatusEvent{Kind: "fault", Err: errors.New("boom")}) - - require.Len(t, h.events, 2) - require.Equal(t, "starting", h.events[0].Kind) - require.Equal(t, "fault", h.events[1].Kind) - require.EqualError(t, h.events[1].Err, "boom") -} - -// TestReportStatus_SilentDiscardWhenHostHasNoReporter pins the -// "no opt-in == no error" contract: a host that doesn't implement -// StatusReporter sees the call discarded without panicking. This -// matches OTel's behavior and lets receivers call ReportStatus -// freely without knowing the host shape. -func TestReportStatus_SilentDiscardWhenHostHasNoReporter(t *testing.T) { - t.Parallel() - - require.NotPanics(t, func() { - componentstatus.ReportStatus(silentHost{}, pipeline.StatusEvent{Kind: "ignored"}) - }) -} diff --git a/internal/config/bench_test.go b/internal/config/bench_test.go deleted file mode 100644 index a4c6d7b1..00000000 --- a/internal/config/bench_test.go +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package config_test - -import ( - "testing" - - "github.com/tracecoreai/tracecore/internal/config" -) - -// BenchmarkParsePipelineID measures the per-call cost of parsing an -// operator-supplied pipeline key (e.g. `metrics/primary`). It's the -// cheapest hot-path parser in the tree; pinning a baseline now lets -// future regex / string-handling changes be assessed quantitatively. 
-func BenchmarkParsePipelineID(b *testing.B) { - for range b.N { - _, _, err := config.ParsePipelineID("metrics/primary") - if err != nil { - b.Fatal(err) - } - } -} diff --git a/internal/config/config.go b/internal/config/config.go deleted file mode 100644 index 514cb645..00000000 --- a/internal/config/config.go +++ /dev/null @@ -1,189 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package config - -import ( - "fmt" - "net" - "strings" - - "gopkg.in/yaml.v3" - - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// netSplitHostPort is a small alias so the validator imports net at -// the package level rather than letting the bare call drift into -// other helpers. -var netSplitHostPort = net.SplitHostPort - -// Config is the in-memory shape of the collector YAML config after -// structural parsing. Per-component config bodies are kept as opaque -// yaml.Nodes; the matching factory decodes them at runtime. -type Config struct { - Receivers map[string]yaml.Node `yaml:"receivers,omitempty"` - Processors map[string]yaml.Node `yaml:"processors,omitempty"` - Exporters map[string]yaml.Node `yaml:"exporters,omitempty"` - Service Service `yaml:"service,omitempty"` - Telemetry Telemetry `yaml:"telemetry,omitempty"` -} - -// Telemetry is the operator-facing block for tracecore's self- -// telemetry surface. Default is OFF — operators opt in by setting -// `telemetry.enabled: true`. When disabled, other fields are not -// validated so operators can keep a full block commented as a -// production template. -type Telemetry struct { - // Enabled gates the surface. Default false. - Enabled bool `yaml:"enabled"` - - // Listen is the bind address for the HTTP server (host:port). - // Default "localhost:8888" — localhost-only is the safer - // default; operators in multi-node setups override to ":8888". - Listen string `yaml:"listen,omitempty"` - - // Paths controls the routes mounted on the single listener. - // Each field defaults to the conventional name. 
- Paths TelemetryPaths `yaml:"paths,omitempty"` -} - -// TelemetryPaths captures the three route knobs operators may need -// to override (e.g., service-mesh probe-path conflicts). Each field -// defaults to the conventional name when zero. -type TelemetryPaths struct { - Metrics string `yaml:"metrics,omitempty"` - Healthz string `yaml:"healthz,omitempty"` - Readyz string `yaml:"readyz,omitempty"` -} - -// telemetryDefaults are applied when a field is the zero-value AND -// the block is enabled. Kept centralized so tests + the loader stay -// in sync on what "default" means. -const ( - defaultTelemetryListen = "localhost:8888" - defaultTelemetryMetricsPath = "/metrics" - defaultTelemetryHealthzPath = "/healthz" - defaultTelemetryReadyzPath = "/readyz" -) - -// applyDefaults fills in zero-value fields with the canonical -// defaults when the block is enabled. No-op when disabled (the -// zero-value fields stay zero so operators inspecting the parsed -// config can tell "default-on" from "explicit-set"). -func (t *Telemetry) applyDefaults() { - if !t.Enabled { - return - } - if t.Listen == "" { - t.Listen = defaultTelemetryListen - } - if t.Paths.Metrics == "" { - t.Paths.Metrics = defaultTelemetryMetricsPath - } - if t.Paths.Healthz == "" { - t.Paths.Healthz = defaultTelemetryHealthzPath - } - if t.Paths.Readyz == "" { - t.Paths.Readyz = defaultTelemetryReadyzPath - } -} - -// validate runs the operator-facing checks. Empty fields after -// applyDefaults are bugs in the loader (not operator errors); this -// catches the operator-error path: malformed listen, non-absolute -// paths. 
-func (t *Telemetry) validate() error { - if !t.Enabled { - return nil - } - if _, _, err := splitHostPort(t.Listen); err != nil { - return fmt.Errorf("telemetry.listen: %w", err) - } - if err := validatePath("metrics", t.Paths.Metrics); err != nil { - return err - } - if err := validatePath("healthz", t.Paths.Healthz); err != nil { - return err - } - return validatePath("readyz", t.Paths.Readyz) -} - -func validatePath(name, p string) error { - if err := ValidateMountPath(p); err != nil { - return fmt.Errorf("telemetry.paths.%s: %w", name, err) - } - return nil -} - -// ValidateMountPath enforces the shared mount-path rules used by both -// the YAML loader (`telemetry.paths.*`) and the HTTP server -// (`ServerConfig.Paths.*`). Stricter than a bare leading-slash check: -// rejects whitespace, control bytes, query strings, and fragments — -// inputs that would let http.ServeMux.Handle panic at registration -// time rather than surface a clean operator-facing error. -func ValidateMountPath(p string) error { - if p == "" || p[0] != '/' { - return fmt.Errorf("must be an absolute path starting with '/' (got %q)", p) - } - if strings.ContainsAny(p, " \t\n\r") { - return fmt.Errorf("must not contain whitespace (got %q)", p) - } - if strings.ContainsAny(p, "?#") { - return fmt.Errorf("must be a bare path with no query or fragment (got %q)", p) - } - return nil -} - -// splitHostPort wraps net.SplitHostPort so the caller doesn't have -// to import net just for the validation path. We keep the dep -// shallow because the surface is small. -func splitHostPort(addr string) (host, port string, err error) { - host, port, err = netSplitHostPort(addr) - if err != nil { - return "", "", fmt.Errorf("invalid host:port %q: %w", addr, err) - } - return host, port, nil -} - -// Service holds the pipeline assembly block. -type Service struct { - Pipelines map[string]Pipeline `yaml:"pipelines,omitempty"` -} - -// Pipeline names the components that compose a single signal flow. 
-// The map key on Service.Pipelines is the pipeline ID in -// `` or `/` form (e.g. `metrics/primary`). -type Pipeline struct { - Receivers []string `yaml:"receivers,omitempty"` - Processors []string `yaml:"processors,omitempty"` - Exporters []string `yaml:"exporters,omitempty"` -} - -// ParsePipelineID converts the YAML pipeline key into a (Signal, name) -// pair. "metrics" → (SignalMetrics, ""). "metrics/primary" → -// (SignalMetrics, "primary"). Returns an error for unknown signals, -// trailing slashes (silent normalization hides operator typos), and -// multi-slash forms (those would slip past load-time validation only -// to fail later at NewID construction). -func ParsePipelineID(s string) (pipeline.Signal, string, error) { - if strings.HasSuffix(s, "/") { - return 0, "", fmt.Errorf("pipeline id %q has trailing slash; omit the slash or supply an instance name", s) - } - signalStr, name, _ := strings.Cut(s, "/") - if strings.Contains(name, "/") { - return 0, "", fmt.Errorf("pipeline id %q has multiple slashes; expected `` or `/`", s) - } - if err := pipeline.ValidateInstanceName(name); err != nil { - return 0, "", fmt.Errorf("pipeline id %q: %w", s, err) - } - switch signalStr { - case "metrics": - return pipeline.SignalMetrics, name, nil - case "traces": - return pipeline.SignalTraces, name, nil - case "logs": - return pipeline.SignalLogs, name, nil - default: - return 0, "", fmt.Errorf("unknown pipeline signal %q: must be one of metrics, traces, logs", signalStr) - } -} diff --git a/internal/config/doc.go b/internal/config/doc.go deleted file mode 100644 index b77d7328..00000000 --- a/internal/config/doc.go +++ /dev/null @@ -1,12 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package config reads and validates the operator-facing YAML config. -// Errors carry file path, line, and (where the parser surfaces it) -// column, so operators can jump straight from a log line to the -// offending source — the first of the M1 Operator-UX patterns. 
-// -// M1 scope: structural parsing only. Per-component config decoding is -// the factory's job and lands once the first receiver does (M8); the -// loader carries each component's body as an opaque yaml.Node until -// then. -package config diff --git a/internal/config/fuzz_test.go b/internal/config/fuzz_test.go deleted file mode 100644 index c2d8f53b..00000000 --- a/internal/config/fuzz_test.go +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package config_test - -import ( - "os" - "path/filepath" - "testing" - - "github.com/tracecoreai/tracecore/internal/config" -) - -// FuzzLoad pushes operator-supplied YAML through config.Load. The -// property: Load must never panic, regardless of input. Malformed, -// empty, multi-document, oversized — all must surface as errors, not -// crashes. -func FuzzLoad(f *testing.F) { - f.Add([]byte("")) - f.Add([]byte("service:\n pipelines:\n metrics/primary:\n receivers: [clockreceiver]\n")) - f.Add([]byte("not-yaml: : :")) - f.Add([]byte("---\ndoc1\n---\ndoc2")) - f.Add([]byte("\x00\xff\x00binary")) - - f.Fuzz(func(t *testing.T, in []byte) { - path := filepath.Join(t.TempDir(), "fuzz.yaml") - if err := os.WriteFile(path, in, 0o600); err != nil { - t.Fatalf("write fuzz input: %v", err) - } - _, _ = config.Load(path) - }) -} - -// FuzzParsePipelineID pushes operator-supplied pipeline keys through -// ParsePipelineID. The property: no panic. 
-func FuzzParsePipelineID(f *testing.F) { - f.Add("metrics") - f.Add("metrics/primary") - f.Add("traces/") - f.Add("//bad") - f.Add("badsignal/x") - f.Add("") - - f.Fuzz(func(t *testing.T, s string) { - _, _, _ = config.ParsePipelineID(s) - }) -} diff --git a/internal/config/load.go b/internal/config/load.go deleted file mode 100644 index bdd6e4ab..00000000 --- a/internal/config/load.go +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package config - -import ( - "errors" - "fmt" - "io" - "os" - "strings" - - "gopkg.in/yaml.v3" -) - -// LoadError wraps a load failure with the offending file path. -// Preserves the underlying error so callers can match with errors.As -// (e.g. against *yaml.TypeError for structured handling) and renders -// operator-readable `:: ...` lines via Error(). -// -// yaml.v3 surfaces line numbers but not columns, so the rendered -// format is `:: ` rather than the -// `::: ` shape gcc/clang use. -type LoadError struct { - Path string - Err error -} - -// Error formats the error as one `:: ` line per -// underlying yaml.TypeError entry, or `: ` for other -// error shapes. -func (e *LoadError) Error() string { - var typeErr *yaml.TypeError - if errors.As(e.Err, &typeErr) { - var b strings.Builder - for i, line := range typeErr.Errors { - if i > 0 { - b.WriteByte('\n') - } - fmt.Fprintf(&b, "%s:%s", e.Path, line) - } - return b.String() - } - return fmt.Sprintf("%s: %s", e.Path, e.Err.Error()) -} - -// Unwrap returns the wrapped error so errors.Is / errors.As work -// against the underlying yaml.v3 error types. -func (e *LoadError) Unwrap() error { return e.Err } - -// Load reads path and decodes the operator-facing YAML config. The -// decoder rejects unknown top-level fields (KnownFields(true)) and -// reports errors as `:[:]: `. -// -// An empty file is valid and yields a zero-valued *Config — the -// "empty-pipeline boot" UX criterion depends on this. 
-func Load(path string) (*Config, error) { - f, err := os.Open(path) //nolint:gosec // operator-supplied config path is the intended use - if err != nil { - return nil, fmt.Errorf("open config: %w", err) - } - defer func() { _ = f.Close() }() - - dec := yaml.NewDecoder(f) - dec.KnownFields(true) - - var cfg Config - if err := dec.Decode(&cfg); err != nil { - if errors.Is(err, io.EOF) { - // Empty file → zero-valued Config. Validates the binary - // can boot before any receivers ship. - return &Config{}, nil - } - return nil, &LoadError{Path: path, Err: err} - } - - // Reject multi-document YAML. yaml.Decode reads only the first - // document; trailing `---` blocks would be silently discarded — - // a real surprise for operators copying OTel collector configs - // that sometimes ship multi-doc. - var extra yaml.Node - if err := dec.Decode(&extra); err == nil { - return nil, &LoadError{Path: path, Err: errors.New("multi-document YAML not supported; the loader reads only the first `---` block")} - } - - cfg.Telemetry.applyDefaults() - - if err := cfg.validate(); err != nil { - return nil, &LoadError{Path: path, Err: err} - } - return &cfg, nil -} - -// validate runs the cross-section checks the YAML decoder alone can't -// catch (pipeline-key format, component-reference resolution, -// telemetry block sanity). The caller wraps with a *LoadError to -// attach the file path. 
-func (c *Config) validate() error { - if err := c.Telemetry.validate(); err != nil { - return err - } - for key, p := range c.Service.Pipelines { - if _, _, err := ParsePipelineID(key); err != nil { - return fmt.Errorf("service.pipelines.%s: %w", key, err) - } - if err := checkRefs(key, "receivers", p.Receivers, c.Receivers); err != nil { - return err - } - if err := checkRefs(key, "processors", p.Processors, c.Processors); err != nil { - return err - } - if err := checkRefs(key, "exporters", p.Exporters, c.Exporters); err != nil { - return err - } - } - return nil -} - -func checkRefs(pipelineKey, role string, refs []string, defs map[string]yaml.Node) error { - for _, name := range refs { - if _, ok := defs[name]; !ok { - return fmt.Errorf("service.pipelines.%s.%s: undefined %s %q", - pipelineKey, role, strings.TrimSuffix(role, "s"), name) - } - } - return nil -} diff --git a/internal/config/load_test.go b/internal/config/load_test.go deleted file mode 100644 index 8f9b39db..00000000 --- a/internal/config/load_test.go +++ /dev/null @@ -1,246 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package config_test - -import ( - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/require" - "gopkg.in/yaml.v3" - - "github.com/tracecoreai/tracecore/internal/config" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -func TestLoad_EmptyFile_ReturnsEmptyConfig(t *testing.T) { - t.Parallel() - - path := writeConfig(t, "") - cfg, err := config.Load(path) - require.NoError(t, err) - require.NotNil(t, cfg) - require.Empty(t, cfg.Receivers) - require.Empty(t, cfg.Processors) - require.Empty(t, cfg.Exporters) - require.Empty(t, cfg.Service.Pipelines) -} - -func TestLoad_MinimalValidConfig(t *testing.T) { - t.Parallel() - - const src = ` -receivers: - dcgm: - endpoint: localhost:5555 -exporters: - otlp: - endpoint: collector:4317 -service: - pipelines: - metrics/primary: - receivers: [dcgm] - exporters: [otlp] -` - path := writeConfig(t, src) - cfg, err 
:= config.Load(path) - require.NoError(t, err) - - require.Contains(t, cfg.Receivers, "dcgm") - require.Contains(t, cfg.Exporters, "otlp") - require.Contains(t, cfg.Service.Pipelines, "metrics/primary") - - p := cfg.Service.Pipelines["metrics/primary"] - require.Equal(t, []string{"dcgm"}, p.Receivers) - require.Equal(t, []string{"otlp"}, p.Exporters) -} - -func TestLoad_FileNotFound(t *testing.T) { - t.Parallel() - - _, err := config.Load("/no/such/path.yaml") - require.Error(t, err) - require.Contains(t, err.Error(), "open config") -} - -func TestLoad_UnknownTopLevelField_LineNumberedError(t *testing.T) { - t.Parallel() - - const src = ` -receivers: {} -exporters: {} -unknown_top: true -` - path := writeConfig(t, src) - _, err := config.Load(path) - require.Error(t, err) - require.Contains(t, err.Error(), path+":") - require.Contains(t, err.Error(), "line 4") // 1-based; the offending key is on line 4 - require.Contains(t, err.Error(), "unknown_top") - - // Underlying *yaml.TypeError must still be reachable via errors.As - // so structured handlers (linters, IDE plugins) can dig in. 
- var typeErr *yaml.TypeError - require.ErrorAs(t, err, &typeErr) - - var loadErr *config.LoadError - require.ErrorAs(t, err, &loadErr) - require.Equal(t, path, loadErr.Path) -} - -func TestLoad_UnknownPipelineSignal_NamesTheOffender(t *testing.T) { - t.Parallel() - - const src = ` -service: - pipelines: - badsignal/primary: {} -` - path := writeConfig(t, src) - _, err := config.Load(path) - require.Error(t, err) - require.Contains(t, err.Error(), "service.pipelines.badsignal/primary") - require.Contains(t, err.Error(), "unknown pipeline signal") -} - -func TestLoad_UndefinedReceiver_ReferenceError(t *testing.T) { - t.Parallel() - - const src = ` -exporters: - otlp: {endpoint: x} -service: - pipelines: - metrics/primary: - receivers: [ghost] - exporters: [otlp] -` - path := writeConfig(t, src) - _, err := config.Load(path) - require.Error(t, err) - require.Contains(t, err.Error(), "undefined receiver") - require.Contains(t, err.Error(), `"ghost"`) -} - -// TestLoad_MultiDocumentYAML_Rejected: yaml.Decoder reads one document -// per call; trailing `---` blocks would silently disappear. Operators -// copy-paste configs from OTel collector docs that occasionally ship -// multi-doc; data loss with no warning is unacceptable. -func TestLoad_MultiDocumentYAML_Rejected(t *testing.T) { - t.Parallel() - - const src = ` -receivers: {} ---- -service: {} -` - path := writeConfig(t, src) - _, err := config.Load(path) - require.Error(t, err) - require.Contains(t, err.Error(), "multi-document YAML") -} - -// TestParsePipelineID_LeadingDigitInstanceName_Rejected: `metrics/1primary` -// would slip past load-time validation and only fail later when the -// builder calls NewID. Surface the error at config load with operator- -// visible line context. 
-func TestParsePipelineID_LeadingDigitInstanceName_Rejected(t *testing.T) { - t.Parallel() - - _, _, err := config.ParsePipelineID("metrics/1primary") - require.Error(t, err) - require.Contains(t, err.Error(), "invalid instance name") -} - -// TestParsePipelineID_LeadingSlash_Rejected: `/primary` is malformed — -// signal portion is empty. Reject with the "unknown pipeline signal" -// branch (signalStr is ""). -func TestParsePipelineID_LeadingSlash_Rejected(t *testing.T) { - t.Parallel() - - _, _, err := config.ParsePipelineID("/primary") - require.Error(t, err) - require.Contains(t, err.Error(), "unknown pipeline signal") -} - -// TestLoad_PathIsDirectory_ReturnsClearError: a config path that points -// at a directory (operator typo: forgot the filename) currently fails -// inside the YAML decoder on read. Pin that the error message reaches -// the operator with the path attached, not a bare read syscall error. -func TestLoad_PathIsDirectory_ReturnsClearError(t *testing.T) { - t.Parallel() - - dir := t.TempDir() - _, err := config.Load(dir) - require.Error(t, err) - // Either branch is acceptable: a clean "is a directory" from os.Open - // or a yaml decode error wrapped in *LoadError. Both should mention - // the path so the operator can pin the cause. - require.Contains(t, err.Error(), dir) -} - -// TestParsePipelineID_TrailingSlash_Rejected: `metrics/` is distinct -// from `metrics` — silent normalization hides operator typos. -func TestParsePipelineID_TrailingSlash_Rejected(t *testing.T) { - t.Parallel() - - _, _, err := config.ParsePipelineID("metrics/") - require.Error(t, err) - require.Contains(t, err.Error(), "trailing slash") -} - -// TestParsePipelineID_MultipleSlashes_Rejected: `metrics/primary/secondary` -// is malformed; reject at parse time so the error surfaces at config -// load with operator-visible line context. 
-func TestParsePipelineID_MultipleSlashes_Rejected(t *testing.T) { - t.Parallel() - - _, _, err := config.ParsePipelineID("metrics/primary/secondary") - require.Error(t, err) - require.Contains(t, err.Error(), "multiple slashes") -} - -func TestParsePipelineID(t *testing.T) { - t.Parallel() - - tests := []struct { - in string - wantSignal pipeline.Signal - wantName string - wantErr bool - }{ - {in: "metrics", wantSignal: pipeline.SignalMetrics, wantName: ""}, - {in: "metrics/primary", wantSignal: pipeline.SignalMetrics, wantName: "primary"}, - {in: "traces", wantSignal: pipeline.SignalTraces}, - {in: "traces/secondary", wantSignal: pipeline.SignalTraces, wantName: "secondary"}, - {in: "logs", wantSignal: pipeline.SignalLogs}, - {in: "foo", wantErr: true}, - {in: "foo/bar", wantErr: true}, - {in: "", wantErr: true}, - } - - for _, tc := range tests { - t.Run(tc.in, func(t *testing.T) { - t.Parallel() - sig, n, err := config.ParsePipelineID(tc.in) - if tc.wantErr { - require.Error(t, err) - return - } - require.NoError(t, err) - require.Equal(t, tc.wantSignal, sig) - require.Equal(t, tc.wantName, n) - }) - } -} - -// writeConfig writes content to a temp file and returns the path. The -// test's Cleanup hook handles removal. -func writeConfig(t *testing.T, content string) string { - t.Helper() - dir := t.TempDir() - path := filepath.Join(dir, "config.yaml") - require.NoError(t, os.WriteFile(path, []byte(content), 0o600)) - return path -} diff --git a/internal/config/telemetry_test.go b/internal/config/telemetry_test.go deleted file mode 100644 index 678db725..00000000 --- a/internal/config/telemetry_test.go +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package config_test - -import ( - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/config" -) - -// writeTmp writes contents to a fresh temp file and returns the path. 
-func writeTmp(t *testing.T, contents string) string { - t.Helper() - p := filepath.Join(t.TempDir(), "config.yaml") - require.NoError(t, os.WriteFile(p, []byte(contents), 0o600)) - return p -} - -// TestTelemetry_DefaultsToDisabled pins the operator-safety contract: -// a config that omits the `telemetry:` block must leave the surface -// OFF. Self-telemetry is opt-in. -func TestTelemetry_DefaultsToDisabled(t *testing.T) { - t.Parallel() - const yaml = ` -receivers: {} -exporters: {} -service: - pipelines: {} -` - cfg, err := config.Load(writeTmp(t, yaml)) - require.NoError(t, err) - require.False(t, cfg.Telemetry.Enabled, "default must be off") -} - -// TestTelemetry_LoadsListenAndPaths exercises a full opt-in block. -func TestTelemetry_LoadsListenAndPaths(t *testing.T) { - t.Parallel() - const yaml = ` -telemetry: - enabled: true - listen: ":8888" - paths: - metrics: /m - healthz: /h - readyz: /r -` - cfg, err := config.Load(writeTmp(t, yaml)) - require.NoError(t, err) - require.True(t, cfg.Telemetry.Enabled) - require.Equal(t, ":8888", cfg.Telemetry.Listen) - require.Equal(t, "/m", cfg.Telemetry.Paths.Metrics) - require.Equal(t, "/h", cfg.Telemetry.Paths.Healthz) - require.Equal(t, "/r", cfg.Telemetry.Paths.Readyz) -} - -// TestTelemetry_AppliesDefaultsOnEmptyBlock pins the contract: an -// `enabled: true` block with no listen + paths gets sensible -// defaults so operators don't have to spell out the whole block. 
-func TestTelemetry_AppliesDefaultsOnEmptyBlock(t *testing.T) { - t.Parallel() - const yaml = ` -telemetry: - enabled: true -` - cfg, err := config.Load(writeTmp(t, yaml)) - require.NoError(t, err) - require.True(t, cfg.Telemetry.Enabled) - require.Equal(t, "localhost:8888", cfg.Telemetry.Listen, "default listen is localhost-only") - require.Equal(t, "/metrics", cfg.Telemetry.Paths.Metrics) - require.Equal(t, "/healthz", cfg.Telemetry.Paths.Healthz) - require.Equal(t, "/readyz", cfg.Telemetry.Paths.Readyz) -} - -// TestTelemetry_RejectsBadListen pins that a malformed listen address -// is caught at validate time rather than at HTTP-Start time, when the -// error would land in the runtime log instead of `validate` output. -func TestTelemetry_RejectsBadListen(t *testing.T) { - t.Parallel() - const yaml = ` -telemetry: - enabled: true - listen: "notavalidaddress" -` - _, err := config.Load(writeTmp(t, yaml)) - require.Error(t, err) - require.Contains(t, err.Error(), "telemetry.listen", "error points at the offending field") -} - -// TestTelemetry_RejectsNonAbsolutePath pins that paths without the -// leading "/" are an obvious operator typo. -func TestTelemetry_RejectsNonAbsolutePath(t *testing.T) { - t.Parallel() - const yaml = ` -telemetry: - enabled: true - paths: - metrics: metrics -` - _, err := config.Load(writeTmp(t, yaml)) - require.Error(t, err) - require.Contains(t, err.Error(), "telemetry.paths.metrics") -} - -// TestTelemetry_DisabledSkipsValidation pins the operator-UX policy: -// fields are only validated when telemetry.enabled is true. A disabled -// block with a malformed listen must still load (so operators can -// keep their full production block commented as a template). 
-func TestTelemetry_DisabledSkipsValidation(t *testing.T) { - t.Parallel() - const yaml = ` -telemetry: - enabled: false - listen: "" - paths: - metrics: nope -` - cfg, err := config.Load(writeTmp(t, yaml)) - require.NoError(t, err, "disabled block must not gate on listen/path validation") - require.False(t, cfg.Telemetry.Enabled) -} diff --git a/internal/consumer/capabilities.go b/internal/consumer/capabilities.go deleted file mode 100644 index f0fd8504..00000000 --- a/internal/consumer/capabilities.go +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package consumer - -// Capabilities describes optional behaviours a consumer may expose to -// the runtime. Fan-out reads MutatesData to decide whether to clone -// the payload before each downstream push. -// -// Mirrors go.opentelemetry.io/collector/consumer.Capabilities at -// v1.58.0 — see docs/research/otel-graph-notes.md §3-4 for the -// cloning-decision rationale. -type Capabilities struct { - // MutatesData reports whether the consumer modifies the data - // passed to its Consume method. Fan-out clones the - // payload for every mutating consumer except (when safe) the - // last in the chain — pure read-only consumers can share a - // single immutable payload. - MutatesData bool -} diff --git a/internal/consumer/doc.go b/internal/consumer/doc.go deleted file mode 100644 index 451da22f..00000000 --- a/internal/consumer/doc.go +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package consumer defines the push interfaces stages of a pipeline -// use to hand data to the next stage. Mirrors -// go.opentelemetry.io/collector/consumer v1.58.0 — receivers, -// processors, and exporters port one-to-one between OTel and tracecore -// without a type-adapter layer. -// -// The package lives separately from internal/pipeline and internal/fanout -// to break what would otherwise be a cycle: fanout depends on the -// consumer interfaces, pipeline depends on both. 
Keeping consumer at -// the bottom of the dependency graph lets every component-side package -// import it without pulling in the runtime. -// -// The package has no executable code — only interfaces and the -// Capabilities struct — so it does not appear in coverage profiles -// and is deliberately excluded from the coverage gate. -// -// pdata values are not safe to share across goroutines. Callees that -// fan out asynchronously MUST clone before doing so. -package consumer diff --git a/internal/consumer/logs.go b/internal/consumer/logs.go deleted file mode 100644 index c40a19f8..00000000 --- a/internal/consumer/logs.go +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package consumer - -import ( - "context" - - "go.opentelemetry.io/collector/pdata/plog" -) - -// Logs is the push interface for the logs signal. -type Logs interface { - ConsumeLogs(ctx context.Context, ld plog.Logs) error - Capabilities() Capabilities -} diff --git a/internal/consumer/metrics.go b/internal/consumer/metrics.go deleted file mode 100644 index 095de1bb..00000000 --- a/internal/consumer/metrics.go +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package consumer - -import ( - "context" - - "go.opentelemetry.io/collector/pdata/pmetric" -) - -// Metrics is the push interface for the metrics signal. -type Metrics interface { - ConsumeMetrics(ctx context.Context, md pmetric.Metrics) error - Capabilities() Capabilities -} diff --git a/internal/consumer/traces.go b/internal/consumer/traces.go deleted file mode 100644 index 5faa6c0b..00000000 --- a/internal/consumer/traces.go +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package consumer - -import ( - "context" - - "go.opentelemetry.io/collector/pdata/ptrace" -) - -// Traces is the push interface for the trace signal. 
-type Traces interface { - ConsumeTraces(ctx context.Context, td ptrace.Traces) error - Capabilities() Capabilities -} diff --git a/internal/fanout/bench_test.go b/internal/fanout/bench_test.go deleted file mode 100644 index b31f9082..00000000 --- a/internal/fanout/bench_test.go +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package fanout_test - -import ( - "context" - "testing" - - "go.opentelemetry.io/collector/pdata/pmetric" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/fanout" -) - -// BenchmarkFanout_NewMetrics measures the per-push cost of -// fanout.NewMetrics across representative readonly+mutable consumer -// mixes. The 1/0 mix exercises the unwrap fast-path; mixes that -// involve cloning hit the slow path. Numbers establish a baseline -// for future cloning-strategy changes. -func BenchmarkFanout_NewMetrics(b *testing.B) { - cases := []struct { - name string - mutable int - readonly int - }{ - {"1readonly-fast-path", 0, 1}, - {"2readonly", 0, 2}, - {"1mutable-1readonly", 1, 1}, - {"5mutable", 5, 0}, - {"2mutable-3readonly", 2, 3}, - } - for _, tc := range cases { - b.Run(tc.name, func(b *testing.B) { - consumers := make([]consumer.Metrics, 0, tc.mutable+tc.readonly) - for range tc.mutable { - consumers = append(consumers, noopBenchMetrics{mutates: true}) - } - for range tc.readonly { - consumers = append(consumers, noopBenchMetrics{}) - } - f := fanout.NewMetrics(consumers) - - md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty() - ctx := b.Context() - - b.ResetTimer() - for range b.N { - if err := f.ConsumeMetrics(ctx, md); err != nil { - b.Fatal(err) - } - } - }) - } -} - -type noopBenchMetrics struct { - mutates bool -} - -func (n noopBenchMetrics) Capabilities() consumer.Capabilities { - return consumer.Capabilities{MutatesData: n.mutates} -} - -func (noopBenchMetrics) ConsumeMetrics(context.Context, pmetric.Metrics) error { return nil } diff --git 
a/internal/fanout/doc.go b/internal/fanout/doc.go deleted file mode 100644 index 34efe76d..00000000 --- a/internal/fanout/doc.go +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package fanout distributes a single pmetric.Metrics / ptrace.Traces / -// plog.Logs to N downstream consumers, cloning only where required by -// the consumers' Capabilities. Mirrors OTel Collector v0.152.0's -// internal/fanoutconsumer (see docs/research/otel-graph-notes.md §3). -// -// Construction is via the per-signal NewMetrics / NewTraces / NewLogs -// functions. Each: -// -// - Fast-path returns the input directly when there is exactly one -// consumer and it does not mutate data. -// - Otherwise partitions consumers into mutable / readonly slices -// up-front so the per-call hot path skips the Capabilities lookup. -// -// Cloning strategy at each Consume: -// -// - All mutating consumers except the last receive a deep clone -// (via .CopyTo on a fresh pdata value). -// - The last mutating consumer receives the original ONLY when -// there are zero readonly consumers AND the payload is not -// already marked read-only. Otherwise it receives a clone. -// - Readonly consumers share the original payload. When there are -// two or more readonly consumers, the original is marked -// read-only first. -// -// Errors from individual consumers are accumulated with errors.Join -// (stdlib) — every consumer is invoked even on partial failure. -// Consumers run strictly serially in the caller's goroutine. -package fanout diff --git a/internal/fanout/fanout.go b/internal/fanout/fanout.go deleted file mode 100644 index 5c5fd585..00000000 --- a/internal/fanout/fanout.go +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package fanout - -import ( - "context" - "errors" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -// pdataOps captures the per-signal pdata operations the generic fanout -// needs. 
T is the pdata payload type (pmetric.Metrics, ptrace.Traces, -// plog.Logs). Each pdata struct ships these methods but they aren't -// reachable through a shared interface in upstream pdata, so we bridge -// here with one-line callbacks. -type pdataOps[T any] struct { - new func() T - copyTo func(src, dst T) - isReadOnly func(T) bool - markReadOnly func(T) -} - -// genericFanout holds the per-signal consumer slices + the per-signal -// pdata operations + a callback that invokes the right Consume* method -// on a consumer C. The cloning algorithm itself is identical across -// signals and lives only in consumeAll. -type genericFanout[T any, C any] struct { - mutable []C - readonly []C - - ops pdataOps[T] - consume func(c C, ctx context.Context, payload T) error -} - -// Capabilities surfaces MutatesData=true only when every downstream -// consumer mutates (no readonly consumers AND at least one mutable -// consumer). In that case our upstream may donate the payload without -// cloning, because we will donate it to the last mutating consumer. -func (g *genericFanout[T, C]) Capabilities() consumer.Capabilities { - return consumer.Capabilities{MutatesData: len(g.mutable) > 0 && len(g.readonly) == 0} -} - -// consumeAll dispatches one payload to every downstream consumer per -// the cloning rules in this package's doc.go. Returns errors.Join of -// all downstream errors — no early-exit; every consumer is invoked. -func (g *genericFanout[T, C]) consumeAll(ctx context.Context, d T) error { - var errs error - - if len(g.mutable) > 0 { - // All mutators except the last get a fresh clone. - for i := range len(g.mutable) - 1 { - clone := g.ops.new() - g.ops.copyTo(d, clone) - errs = errors.Join(errs, g.consume(g.mutable[i], ctx, clone)) - } - // Last mutator gets the original when safe — i.e. nobody else - // needs the unmodified payload (no readonly consumers behind us, - // and the payload isn't already shared via MarkReadOnly upstream). 
- last := g.mutable[len(g.mutable)-1] - if len(g.readonly) == 0 && !g.ops.isReadOnly(d) { - errs = errors.Join(errs, g.consume(last, ctx, d)) - } else { - clone := g.ops.new() - g.ops.copyTo(d, clone) - errs = errors.Join(errs, g.consume(last, ctx, clone)) - } - } - - if len(g.readonly) > 0 { - // Multiple readonly consumers share a single payload; mark it - // read-only so pdata's debug-time checks catch any downstream - // that violates the contract. - if len(g.readonly) > 1 && !g.ops.isReadOnly(d) { - g.ops.markReadOnly(d) - } - for _, c := range g.readonly { - errs = errors.Join(errs, g.consume(c, ctx, d)) - } - } - - return errs -} - -// partition splits consumers by Capabilities.MutatesData. Slices -// preallocate to len(consumers) since one of the two will hold every -// element; the over-allocation on the smaller side is one-time at -// pipeline build, not per-push. -func partition[C any](consumers []C, mutates func(C) bool) (mutable, readonly []C) { - mutable = make([]C, 0, len(consumers)) - readonly = make([]C, 0, len(consumers)) - for _, c := range consumers { - if mutates(c) { - mutable = append(mutable, c) - } else { - readonly = append(readonly, c) - } - } - return mutable, readonly -} diff --git a/internal/fanout/fanout_test.go b/internal/fanout/fanout_test.go deleted file mode 100644 index 6b03f1fe..00000000 --- a/internal/fanout/fanout_test.go +++ /dev/null @@ -1,530 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package fanout_test - -import ( - "context" - "errors" - "strconv" - "sync/atomic" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/pcommon" - "go.opentelemetry.io/collector/pdata/plog" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.opentelemetry.io/collector/pdata/ptrace" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/fanout" -) - -// ========================================================================= -// Metrics fan-out 
tests -// ========================================================================= - -func TestMetrics_FastPath_SingleReadonlyReturnedUnwrapped(t *testing.T) { - t.Parallel() - - c := &metricsRecorder{mutates: false} - got := fanout.NewMetrics([]consumer.Metrics{c}) - require.Same(t, c, got, "single readonly consumer must be returned unwrapped") -} - -func TestMetrics_SingleMutatingWrapped_CapabilitiesPropagate(t *testing.T) { - t.Parallel() - - c := &metricsRecorder{mutates: true} - got := fanout.NewMetrics([]consumer.Metrics{c}) - require.NotSame(t, c, got, "single mutating consumer must be wrapped so Capabilities surface MutatesData") - require.True(t, got.Capabilities().MutatesData, "wrapper reports MutatesData when every downstream mutates") -} - -func TestMetrics_AllReadonly_SharePayload_MarkReadOnlyWhenMoreThanOne(t *testing.T) { - t.Parallel() - - a := &metricsRecorder{mutates: false} - b := &metricsRecorder{mutates: false} - f := fanout.NewMetrics([]consumer.Metrics{a, b}) - - md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty() - require.False(t, md.IsReadOnly()) - - require.NoError(t, f.ConsumeMetrics(t.Context(), md)) - - require.Equal(t, int32(1), a.calls.Load()) - require.Equal(t, int32(1), b.calls.Load()) - require.True(t, md.IsReadOnly(), "≥2 readonly consumers must trigger MarkReadOnly on the shared payload") -} - -func TestMetrics_AllMutating_LastGetsOriginal(t *testing.T) { - t.Parallel() - - a := &metricsRecorder{mutates: true} - b := &metricsRecorder{mutates: true} - last := &metricsRecorder{mutates: true} - f := fanout.NewMetrics([]consumer.Metrics{a, b, last}) - - // Behavioural identity test: mutate the original AFTER fanout - // and observe which recorder sees the change. Recorders that - // got clones are frozen at 1 resource; the recorder that got - // the donated original sees 2. 
- md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty() - require.NoError(t, f.ConsumeMetrics(t.Context(), md)) - md.ResourceMetrics().AppendEmpty() - - require.Equal(t, 1, a.last.ResourceMetrics().Len(), "a got a clone (frozen at 1)") - require.Equal(t, 1, b.last.ResourceMetrics().Len(), "b got a clone (frozen at 1)") - require.Equal(t, 2, last.last.ResourceMetrics().Len(), "last mutator got the donated original (sees the post-fanout mutation)") -} - -func TestMetrics_AllMutating_ReadonlyInput_LastAlsoGetsClone(t *testing.T) { - t.Parallel() - - last := &metricsRecorder{mutates: true} - f := fanout.NewMetrics([]consumer.Metrics{last}) - - md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty() - md.MarkReadOnly() - - require.NoError(t, f.ConsumeMetrics(t.Context(), md)) - // Recorder must have received a clone (not the read-only original). - // pdata's read-only flag is a value-level concept; if the clone path - // fired, last.last is NOT read-only. - require.False(t, last.last.IsReadOnly(), "read-only input forces a clone even for the last mutator") -} - -func TestMetrics_Mixed_LastMutatorGetsClone_ReadonlyGetsOriginal(t *testing.T) { - t.Parallel() - - mut := &metricsRecorder{mutates: true} - ro := &metricsRecorder{mutates: false} - f := fanout.NewMetrics([]consumer.Metrics{mut, ro}) - - // Same behavioural identity test: post-fanout mutation visible - // only to the recorder that got the original. 
- md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty() - require.NoError(t, f.ConsumeMetrics(t.Context(), md)) - md.ResourceMetrics().AppendEmpty() - - require.Equal(t, 1, mut.last.ResourceMetrics().Len(), "mutator (last with readonly behind) got a clone (frozen at 1)") - require.Equal(t, 2, ro.last.ResourceMetrics().Len(), "readonly consumer got the original (sees the mutation)") -} - -func TestMetrics_ErrorsJoined_AllInvoked(t *testing.T) { - t.Parallel() - - errA := errors.New("a failed") - errB := errors.New("b failed") - a := &metricsRecorder{mutates: false, returnErr: errA} - b := &metricsRecorder{mutates: false, returnErr: errB} - c := &metricsRecorder{mutates: false} - f := fanout.NewMetrics([]consumer.Metrics{a, b, c}) - - md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty() - err := f.ConsumeMetrics(t.Context(), md) - - require.Error(t, err) - require.ErrorIs(t, err, errA) - require.ErrorIs(t, err, errB) - require.Equal(t, int32(1), a.calls.Load()) - require.Equal(t, int32(1), b.calls.Load()) - require.Equal(t, int32(1), c.calls.Load(), "c must still be invoked even after a and b errored") -} - -// ========================================================================= -// Traces fan-out tests (representative; sharing logic with metrics) -// ========================================================================= - -func TestTraces_FastPath_SingleReadonlyReturnedUnwrapped(t *testing.T) { - t.Parallel() - c := &tracesRecorder{} - got := fanout.NewTraces([]consumer.Traces{c}) - require.Same(t, c, got) -} - -func TestTraces_AllReadonly_MarkReadOnlyOnSharedPayload(t *testing.T) { - t.Parallel() - - a := &tracesRecorder{} - b := &tracesRecorder{} - f := fanout.NewTraces([]consumer.Traces{a, b}) - - td := ptrace.NewTraces() - td.ResourceSpans().AppendEmpty() - require.NoError(t, f.ConsumeTraces(t.Context(), td)) - - require.True(t, td.IsReadOnly()) - require.Equal(t, int32(1), a.calls.Load()) - require.Equal(t, int32(1), b.calls.Load()) 
-} - -// ========================================================================= -// Logs fan-out tests (representative) -// ========================================================================= - -func TestLogs_FastPath_SingleReadonlyReturnedUnwrapped(t *testing.T) { - t.Parallel() - c := &logsRecorder{} - got := fanout.NewLogs([]consumer.Logs{c}) - require.Same(t, c, got) -} - -func TestLogs_AllReadonly_MarkReadOnlyOnSharedPayload(t *testing.T) { - t.Parallel() - - a := &logsRecorder{} - b := &logsRecorder{} - f := fanout.NewLogs([]consumer.Logs{a, b}) - - ld := plog.NewLogs() - ld.ResourceLogs().AppendEmpty() - require.NoError(t, f.ConsumeLogs(t.Context(), ld)) - - require.True(t, ld.IsReadOnly()) - require.Equal(t, int32(1), a.calls.Load()) - require.Equal(t, int32(1), b.calls.Load()) -} - -// TestMetrics_NestedPayload_DeepCloneIsolation verifies that pdata's -// CopyTo deep-copies every level of a realistic nested payload, so a -// mutating consumer can't leak changes into a sibling consumer's view. -// Structure: 2 ResourceMetrics × 2 ScopeMetrics × 1 Gauge × 2 -// DataPoints with attributes. Each level gets mutated in turn and -// the sibling's clone is asserted to be unaffected. -func TestMetrics_NestedPayload_DeepCloneIsolation(t *testing.T) { - t.Parallel() - - // Recorder A mutates: it appends a new attribute to every - // data point's attribute map. Recorder B observes the post-A - // state of its own (cloned) payload — should NOT see A's - // added attribute. 
- type captured struct { - attrKeys [][]string - } - var aCap, bCap captured - a := &mutateMetricsRecorder{mutates: true, capture: func(md pmetric.Metrics) { - for i := range md.ResourceMetrics().Len() { - rm := md.ResourceMetrics().At(i) - for j := range rm.ScopeMetrics().Len() { - sm := rm.ScopeMetrics().At(j) - for k := range sm.Metrics().Len() { - m := sm.Metrics().At(k) - for p := range m.Gauge().DataPoints().Len() { - dp := m.Gauge().DataPoints().At(p) - dp.Attributes().PutStr("mutated-by-a", "yes") - } - } - } - } - var keys []string - md.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes().Range(func(k string, _ pcommon.Value) bool { - keys = append(keys, k) - return true - }) - aCap.attrKeys = append(aCap.attrKeys, keys) - }} - b := &mutateMetricsRecorder{mutates: false, capture: func(md pmetric.Metrics) { - var keys []string - md.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes().Range(func(k string, _ pcommon.Value) bool { - keys = append(keys, k) - return true - }) - bCap.attrKeys = append(bCap.attrKeys, keys) - }} - - f := fanout.NewMetrics([]consumer.Metrics{a, b}) - - md := buildNestedMetrics(t, 2, 2, 2) - require.NoError(t, f.ConsumeMetrics(t.Context(), md)) - - require.NotEmpty(t, aCap.attrKeys) - require.NotEmpty(t, bCap.attrKeys) - require.Contains(t, aCap.attrKeys[0], "mutated-by-a", - "mutating consumer must see its own mutation in its payload") - require.NotContains(t, bCap.attrKeys[0], "mutated-by-a", - "readonly consumer's clone must be isolated from the mutator's writes — pdata CopyTo deep-clones attribute maps") -} - -// buildNestedMetrics returns a pmetric.Metrics populated with nRMs -// ResourceMetrics, each with nSMs ScopeMetrics, each carrying one -// Gauge metric with nDPs DataPoints. Each DataPoint gets a starter -// attribute "service" so the map is non-empty for the mutation test. 
-func buildNestedMetrics(t *testing.T, nRMs, nSMs, nDPs int) pmetric.Metrics { - t.Helper() - md := pmetric.NewMetrics() - for i := range nRMs { - rm := md.ResourceMetrics().AppendEmpty() - rm.Resource().Attributes().PutStr("rm.index", strconv.Itoa(i)) - for j := range nSMs { - sm := rm.ScopeMetrics().AppendEmpty() - sm.Scope().SetName("test/" + strconv.Itoa(j)) - m := sm.Metrics().AppendEmpty() - m.SetName("test.gauge") - g := m.SetEmptyGauge() - for k := range nDPs { - dp := g.DataPoints().AppendEmpty() - dp.Attributes().PutStr("service", "test") - dp.SetIntValue(int64(k)) - } - } - } - return md -} - -// mutateMetricsRecorder is a recorder that lets the test inject a -// per-call mutation/observation function. It runs the callback on -// every ConsumeMetrics call so the test can both mutate and snapshot -// in one place. -type mutateMetricsRecorder struct { - mutates bool - capture func(pmetric.Metrics) - calls atomic.Int32 -} - -func (r *mutateMetricsRecorder) Capabilities() consumer.Capabilities { - return consumer.Capabilities{MutatesData: r.mutates} -} - -func (r *mutateMetricsRecorder) ConsumeMetrics(_ context.Context, md pmetric.Metrics) error { - r.calls.Add(1) - if r.capture != nil { - r.capture(md) - } - return nil -} - -// ========================================================================= -// Shared cloning-contract table — exercised against all three signals so -// the refactored genericFanout proves identical behaviour per signal. -// Per-signal kits supply the type-specific glue; the scenarios live once. -// ========================================================================= - -// contractCase scenarios are observable through the generic kit: -// recorder call counts, joined error propagation, and the wrapper's -// Capabilities().MutatesData. The MarkReadOnly-on-input side effect -// is asserted by the per-signal tests above, which can read the -// concrete pdata type's IsReadOnly directly. 
-type contractCase struct { - name string - nMutating int - nReadonly int - inputReadOnly bool - wantMutatesData bool // wrapper's Capabilities().MutatesData - wantErrInOutput bool // joined error contains the recorder's err -} - -var contractCases = []contractCase{ - {name: "single-mutating-wrapped", nMutating: 1, wantMutatesData: true}, - {name: "all-readonly-2", nReadonly: 2}, - {name: "all-mutating-3", nMutating: 3, wantMutatesData: true}, - {name: "mixed-1m-1r", nMutating: 1, nReadonly: 1}, - {name: "readonly-input-mutator", nMutating: 1, inputReadOnly: true, wantMutatesData: true}, - {name: "errors-joined-all-invoked", nReadonly: 3, wantErrInOutput: true}, -} - -// fanoutKit binds the per-signal types into the generic test runner. -// T is the pdata payload; C is the consumer interface. -type fanoutKit[T any, C any] struct { - newRecorder func(mutates bool, returnErr error) (recorder C, getCalls func() int32) - newPayload func() T - markReadOnly func(T) - newFanout func([]C) C - consume func(C, context.Context, T) error - capabilities func(C) consumer.Capabilities -} - -func runCloningContract[T any, C any](t *testing.T, kit fanoutKit[T, C]) { - t.Helper() - for _, tc := range contractCases { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - consumers := make([]C, 0, tc.nMutating+tc.nReadonly) - getCalls := make([]func() int32, 0, tc.nMutating+tc.nReadonly) - var injectErr error - if tc.wantErrInOutput { - injectErr = errors.New("recorder-injected") - } - for range tc.nMutating { - c, gc := kit.newRecorder(true, nil) - consumers = append(consumers, c) - getCalls = append(getCalls, gc) - } - for range tc.nReadonly { - c, gc := kit.newRecorder(false, injectErr) - consumers = append(consumers, c) - getCalls = append(getCalls, gc) - } - - f := kit.newFanout(consumers) - require.Equal(t, tc.wantMutatesData, kit.capabilities(f).MutatesData, - "wrapper Capabilities().MutatesData") - - payload := kit.newPayload() - if tc.inputReadOnly { - 
kit.markReadOnly(payload) - } - err := kit.consume(f, t.Context(), payload) - - if tc.wantErrInOutput { - require.ErrorIs(t, err, injectErr) - } else { - require.NoError(t, err) - } - for i, gc := range getCalls { - require.Equal(t, int32(1), gc(), - "recorder %d must be invoked exactly once", i) - } - }) - } -} - -func TestFanoutContract_Metrics(t *testing.T) { - runCloningContract(t, fanoutKit[pmetric.Metrics, consumer.Metrics]{ - newRecorder: func(mut bool, err error) (consumer.Metrics, func() int32) { - r := &metricsRecorder{mutates: mut, returnErr: err} - return r, r.calls.Load - }, - newPayload: func() pmetric.Metrics { - p := pmetric.NewMetrics() - p.ResourceMetrics().AppendEmpty() - return p - }, - markReadOnly: func(p pmetric.Metrics) { p.MarkReadOnly() }, - newFanout: fanout.NewMetrics, - consume: func(c consumer.Metrics, ctx context.Context, p pmetric.Metrics) error { - return c.ConsumeMetrics(ctx, p) - }, - capabilities: func(c consumer.Metrics) consumer.Capabilities { return c.Capabilities() }, - }) -} - -func TestFanoutContract_Traces(t *testing.T) { - runCloningContract(t, fanoutKit[ptrace.Traces, consumer.Traces]{ - newRecorder: func(mut bool, err error) (consumer.Traces, func() int32) { - r := &tracesContractRecorder{mutates: mut, returnErr: err} - return r, r.calls.Load - }, - newPayload: func() ptrace.Traces { - p := ptrace.NewTraces() - p.ResourceSpans().AppendEmpty() - return p - }, - markReadOnly: func(p ptrace.Traces) { p.MarkReadOnly() }, - newFanout: fanout.NewTraces, - consume: func(c consumer.Traces, ctx context.Context, p ptrace.Traces) error { - return c.ConsumeTraces(ctx, p) - }, - capabilities: func(c consumer.Traces) consumer.Capabilities { return c.Capabilities() }, - }) -} - -func TestFanoutContract_Logs(t *testing.T) { - runCloningContract(t, fanoutKit[plog.Logs, consumer.Logs]{ - newRecorder: func(mut bool, err error) (consumer.Logs, func() int32) { - r := &logsContractRecorder{mutates: mut, returnErr: err} - return r, 
r.calls.Load - }, - newPayload: func() plog.Logs { - p := plog.NewLogs() - p.ResourceLogs().AppendEmpty() - return p - }, - markReadOnly: func(p plog.Logs) { p.MarkReadOnly() }, - newFanout: fanout.NewLogs, - consume: func(c consumer.Logs, ctx context.Context, p plog.Logs) error { - return c.ConsumeLogs(ctx, p) - }, - capabilities: func(c consumer.Logs) consumer.Capabilities { return c.Capabilities() }, - }) -} - -// tracesContractRecorder / logsContractRecorder expose the same -// (mutates, returnErr, calls) surface as metricsRecorder so the shared -// table can drive them uniformly. The originals defaulted to -// mutates=false/returnErr=nil; these accept both. - -type tracesContractRecorder struct { - mutates bool - returnErr error - calls atomic.Int32 - last ptrace.Traces -} - -func (r *tracesContractRecorder) Capabilities() consumer.Capabilities { - return consumer.Capabilities{MutatesData: r.mutates} -} - -func (r *tracesContractRecorder) ConsumeTraces(_ context.Context, td ptrace.Traces) error { - r.calls.Add(1) - r.last = td - return r.returnErr -} - -type logsContractRecorder struct { - mutates bool - returnErr error - calls atomic.Int32 - last plog.Logs -} - -func (r *logsContractRecorder) Capabilities() consumer.Capabilities { - return consumer.Capabilities{MutatesData: r.mutates} -} - -func (r *logsContractRecorder) ConsumeLogs(_ context.Context, ld plog.Logs) error { - r.calls.Add(1) - r.last = ld - return r.returnErr -} - -// ========================================================================= -// recorders -// ========================================================================= - -type metricsRecorder struct { - mutates bool - returnErr error - calls atomic.Int32 - last pmetric.Metrics -} - -func (r *metricsRecorder) Capabilities() consumer.Capabilities { - return consumer.Capabilities{MutatesData: r.mutates} -} - -func (r *metricsRecorder) ConsumeMetrics(_ context.Context, md pmetric.Metrics) error { - r.calls.Add(1) - r.last = md - return 
r.returnErr -} - -type tracesRecorder struct { - calls atomic.Int32 - last ptrace.Traces -} - -func (*tracesRecorder) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } -func (r *tracesRecorder) ConsumeTraces(_ context.Context, td ptrace.Traces) error { - r.calls.Add(1) - r.last = td - return nil -} - -type logsRecorder struct { - calls atomic.Int32 - last plog.Logs -} - -func (*logsRecorder) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } -func (r *logsRecorder) ConsumeLogs(_ context.Context, ld plog.Logs) error { - r.calls.Add(1) - r.last = ld - return nil -} diff --git a/internal/fanout/logs.go b/internal/fanout/logs.go deleted file mode 100644 index f0e015a3..00000000 --- a/internal/fanout/logs.go +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package fanout - -import ( - "context" - - "go.opentelemetry.io/collector/pdata/plog" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -var logsOps = pdataOps[plog.Logs]{ - new: plog.NewLogs, - copyTo: func(src, dst plog.Logs) { src.CopyTo(dst) }, - isReadOnly: func(d plog.Logs) bool { return d.IsReadOnly() }, - markReadOnly: func(d plog.Logs) { d.MarkReadOnly() }, -} - -// NewLogs returns a consumer.Logs that fans out to the given consumers, -// cloning only as required by each consumer's Capabilities. See package -// doc for the cloning rules. 
-func NewLogs(consumers []consumer.Logs) consumer.Logs { - if len(consumers) == 1 && !consumers[0].Capabilities().MutatesData { - return consumers[0] - } - mut, ro := partition(consumers, func(c consumer.Logs) bool { return c.Capabilities().MutatesData }) - return &logsFanout{ - genericFanout: genericFanout[plog.Logs, consumer.Logs]{ - mutable: mut, - readonly: ro, - ops: logsOps, - consume: func(c consumer.Logs, ctx context.Context, d plog.Logs) error { - return c.ConsumeLogs(ctx, d) - }, - }, - } -} - -type logsFanout struct { - genericFanout[plog.Logs, consumer.Logs] -} - -func (l *logsFanout) ConsumeLogs(ctx context.Context, ld plog.Logs) error { - return l.consumeAll(ctx, ld) -} diff --git a/internal/fanout/metrics.go b/internal/fanout/metrics.go deleted file mode 100644 index 9a2ee661..00000000 --- a/internal/fanout/metrics.go +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package fanout - -import ( - "context" - - "go.opentelemetry.io/collector/pdata/pmetric" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -var metricsOps = pdataOps[pmetric.Metrics]{ - new: pmetric.NewMetrics, - copyTo: func(src, dst pmetric.Metrics) { src.CopyTo(dst) }, - isReadOnly: func(d pmetric.Metrics) bool { return d.IsReadOnly() }, - markReadOnly: func(d pmetric.Metrics) { d.MarkReadOnly() }, -} - -// NewMetrics returns a consumer.Metrics that fans out to the given -// consumers, cloning only as required by each consumer's Capabilities. -// See package doc for the cloning rules. -func NewMetrics(consumers []consumer.Metrics) consumer.Metrics { - // Fast path: a single read-only consumer needs no wrapping. 
- if len(consumers) == 1 && !consumers[0].Capabilities().MutatesData { - return consumers[0] - } - mut, ro := partition(consumers, func(c consumer.Metrics) bool { return c.Capabilities().MutatesData }) - return &metricsFanout{ - genericFanout: genericFanout[pmetric.Metrics, consumer.Metrics]{ - mutable: mut, - readonly: ro, - ops: metricsOps, - consume: func(c consumer.Metrics, ctx context.Context, d pmetric.Metrics) error { - return c.ConsumeMetrics(ctx, d) - }, - }, - } -} - -type metricsFanout struct { - genericFanout[pmetric.Metrics, consumer.Metrics] -} - -func (m *metricsFanout) ConsumeMetrics(ctx context.Context, md pmetric.Metrics) error { - return m.consumeAll(ctx, md) -} diff --git a/internal/fanout/traces.go b/internal/fanout/traces.go deleted file mode 100644 index ad13223f..00000000 --- a/internal/fanout/traces.go +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package fanout - -import ( - "context" - - "go.opentelemetry.io/collector/pdata/ptrace" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -var tracesOps = pdataOps[ptrace.Traces]{ - new: ptrace.NewTraces, - copyTo: func(src, dst ptrace.Traces) { src.CopyTo(dst) }, - isReadOnly: func(d ptrace.Traces) bool { return d.IsReadOnly() }, - markReadOnly: func(d ptrace.Traces) { d.MarkReadOnly() }, -} - -// NewTraces returns a consumer.Traces that fans out to the given -// consumers, cloning only as required by each consumer's Capabilities. -// See package doc for the cloning rules. 
-func NewTraces(consumers []consumer.Traces) consumer.Traces { - if len(consumers) == 1 && !consumers[0].Capabilities().MutatesData { - return consumers[0] - } - mut, ro := partition(consumers, func(c consumer.Traces) bool { return c.Capabilities().MutatesData }) - return &tracesFanout{ - genericFanout: genericFanout[ptrace.Traces, consumer.Traces]{ - mutable: mut, - readonly: ro, - ops: tracesOps, - consume: func(c consumer.Traces, ctx context.Context, d ptrace.Traces) error { - return c.ConsumeTraces(ctx, d) - }, - }, - } -} - -type tracesFanout struct { - genericFanout[ptrace.Traces, consumer.Traces] -} - -func (t *tracesFanout) ConsumeTraces(ctx context.Context, td ptrace.Traces) error { - return t.consumeAll(ctx, td) -} diff --git a/internal/pipeline/README.md b/internal/pipeline/README.md deleted file mode 100644 index 7000ec5b..00000000 --- a/internal/pipeline/README.md +++ /dev/null @@ -1,305 +0,0 @@ -# `internal/pipeline` - -> **Status (2026-05-22):** Scheduled for deletion at v0.1.0 per RFC-0013 §7. OCB-generated boot path replaces the custom pipeline assembly. Audit any remaining consumer before delete. - - -The runtime contract every receiver, processor, and exporter satisfies, plus -the lifecycle the binary uses to wire them together. - -Design rationale, alternatives considered, and the verified findings that -shaped this code live in -[`docs/rfcs/0003-pipeline-runtime-and-component-contract.md`](../../docs/rfcs/0003-pipeline-runtime-and-component-contract.md). -This README is a quick reference for component authors and reviewers; the -RFC is the source of truth. 
- -## What's here - -``` -internal/pipeline/ - component.go Component, Host, TelemetrySettings, StatusEvent - componentstate.go ComponentState lifecycle mixin (embed in receivers) - factory.go Receiver/Processor/Exporter factory interfaces + - Config marker + ErrSignalNotSupported + CreateSettings - firstdata.go WrapFirstData{Metrics,Traces,Logs} consumers - id.go Type / NewType (regex-validated) + ID - runtime.go Runtime, Pipeline, two-phase Shutdown - pipelinetest/ Stub Host + New(t) returning Fixture for tests -``` - -Sibling packages this layer depends on: - -- [`internal/consumer`](../consumer) — `Metrics` / `Traces` / `Logs` push - interfaces (one per signal) with `Capabilities()`. Wired into factory - methods at construction time. -- [`internal/fanout`](../fanout) — per-signal multi-exporter consumers - with MutatesData-aware cloning. -- [`internal/safe`](../safe) — `safe.Call(ctx, opName, fn)` for cgo / - vendor SDK calls. - -## Architecture at a glance - -A single pipeline assembles bottom-up. Data flows top-down (receivers → -exporters); shutdown ordering is the reverse (Phase 1 parallel -receivers, Phase 2 serial-LIFO drain). - -``` -┌─────────────┐ ┌─────────────┐ ┌─────────────┐ -│ Receiver A │ │ Receiver B │ │ Receiver C │ Phase-1 parallel shutdown -└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ (1s budget) - │ │ │ - └───────────────┴───────────────┘ - ▼ - ┌──────────────────┐ - │ WrapFirstData │ emits "pipeline first data" once - │ (saferun wrap) │ saferun: panic→error - └────────┬─────────┘ - ▼ - ┌──────────────────┐ - │ Processor N │ data-flow order; built last-first - │ (saferun wrap) │ during assembly so each processor's - └────────┬─────────┘ `next` is the already-built downstream - │ - ▼ - ... 
- │ - ▼ - ┌──────────────────┐ - │ Processor 1 │ - │ (saferun wrap) │ - └────────┬─────────┘ - ▼ - ┌──────────────────┐ - │ fanout.New │ always-insert seam; clones for - │ (no wrap) │ mutating consumers, shares for - └─┬──────┬──────┬──┘ read-only ones (donate-to-last) - ▼ ▼ ▼ - ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ Exporter │ │ Exporter │ │ Exporter │ Phase-2 serial-LIFO - │ X │ │ Y │ │ Z │ shutdown (drain budget) - │ (saferun)│ │ (saferun)│ │ (saferun)│ - └──────────┘ └──────────┘ └──────────┘ -``` - -Each labeled seam maps to one file: - -- **WrapFirstData / saferun wrappers** → [`firstdata.go`](firstdata.go), - [`saferun.go`](saferun.go) -- **Per-signal fanout** → [`internal/fanout/fanout.go`](../fanout/fanout.go) - (generic), [`metrics.go`](../fanout/metrics.go) etc. (thin wrappers) -- **Runtime lifecycle** (Start order, two-phase Shutdown, panic recovery, - Start-after-Shutdown gate) → [`runtime.go`](runtime.go) -- **Pipeline assembly** (bottom-up; signal-generic via - `buildSignalPipeline[C]`) → - [`cmd/tracecore/main.go`](../../cmd/tracecore/main.go) - -## The contract in one screen - -```go -type Component interface { - Start(ctx context.Context, host Host) error - Shutdown(ctx context.Context) error -} - -type Host interface { - GetExtensions() map[ID]Component - ReportStatus(event StatusEvent) -} - -type TelemetrySettings struct { - Logger *slog.Logger - Resource pcommon.Resource -} -``` - -Every receiver, processor, and exporter satisfies `Component`. Both -lifecycle methods MUST be idempotent — the runtime calls `Shutdown` on -partially-started graphs when an earlier `Start` failed. - -## Type names - -`Type` is a validated component-kind name. Constructed only through -`NewType` (returns error) or `MustNewType` (panics; reserved for -compile-time constants). The validation regex is the one OTel Collector -v0.152.0 uses, copied verbatim: - -``` -^[a-zA-Z][0-9a-zA-Z_]{0,62}$ -``` - -`ID` pairs a `Type` with an optional instance name. 
`String()` renders -`kind` or `kind/name` to match the YAML form operators write. - -## Factory shape - -One factory per kind, three create methods per factory (metrics, traces, -logs). Receivers and processors take `next consumer.X`; exporters are the -pipeline's leaves and take none. A factory whose Component doesn't support -a signal returns `ErrSignalNotSupported` (a sentinel — match with -`errors.Is`); the runtime surfaces a clear "receiver X does not support -signal Y" message to the operator. - -## Writing tests - -Component unit tests should reach for `pipelinetest.New(t)` rather -than wiring stubs by hand: - -```go -fx := pipelinetest.New(t) -recv, err := factory.CreateMetrics(t.Context(), fx.CreateSettings, cfg, sink) -require.NoError(t, err) -require.NoError(t, recv.Start(t.Context(), fx.Host)) -t.Cleanup(func() { require.NoError(t, recv.Shutdown(t.Context())) }) -``` - -The returned `Fixture` carries a stub `Host` (records `StatusEvent`s), a -discarding logger, and a `CreateSettings` with an ID derived from -`t.Name()` (`/` flattened to `_` so the ID stays valid). Replace -`fx.CreateSettings.Telemetry.Logger` with one writing to a buffer -when a test wants to assert on log output. - -`require.X` is safe from the main test goroutine only. Goroutines spawned -by tests must channel errors back and let the main goroutine call -`require.NoError(t, <-errCh)`. Convention beats clever wrappers. - -## Writing a receiver - -Receiver authors land six files in `components/receivers//` per -[STYLE.md](../../STYLE.md) §Component layout. Mirror the shape of -[`components/receivers/clockreceiver/`](../../components/receivers/clockreceiver) — -that's the canonical example. - -**Six files:** `config.go` (Config + `Validate()`), `factory.go` -(package-var `Factory` + `NewFactory()` for codegen), `.go` -(struct embedding `pipeline.ComponentState`), `_test.go`, -`README.md`, `example_config.yaml`. 
- -**Factory pattern:** package-scope `var Factory pipeline.ReceiverFactory = &factory{}`. -Per-signal `CreateMetrics`/`CreateTraces`/`CreateLogs`. Signals you -don't support return `nil, pipeline.ErrSignalNotSupported`. Add -`func NewFactory() pipeline.ReceiverFactory { return Factory }` — the -codegen entry point that [`tools/components-gen`](../../tools/components-gen) -calls. - -**Lifecycle via embedding:** embed `pipeline.ComponentState` for -`Started()`/`Stopped()` accessors. Override `Start`/`Shutdown` for -goroutine setup/teardown, chaining to the embedded methods so the -flags track lifecycle correctly. See -[`clockreceiver.go`](../../components/receivers/clockreceiver/clockreceiver.go). - -**Capabilities:** `Capabilities() consumer.Capabilities` returns -`{MutatesData: false}` by default. Set `MutatesData: true` only for -processors that rewrite payloads — fan-out clones for every mutating -consumer except (when safe) the last. - -**Testing:** use [`pipelinetest.New(t)`](pipelinetest) — returns a -`Fixture` with a stub Host and a CreateSettings. No hand-rolled stubs. - -**Registration:** add the component to -[`components.yaml`](../../components.yaml); run `make generate`; -commit the regenerated `cmd/tracecore/components.go` in the same commit. - -**Pitfalls:** - -- **No globals** other than `Factory`. Logger from - `TelemetrySettings.Logger`, never `slog.Default()` — see - [`clockreceiver.go:45`](../../components/receivers/clockreceiver/clockreceiver.go). -- **ctx-respecting goroutines** — exit within - `pipeline.ReceiverShutdownTimeout` (1s); see the `select { case <-ctx.Done() }` - at [`clockreceiver.go:111`](../../components/receivers/clockreceiver/clockreceiver.go). -- **Idempotent Shutdown** — `ComponentState.Shutdown` is no-op-safe - when Start never ran; custom overrides must guard with `if r.cancel != nil` - before calling cancel. See [`clockreceiver.go:80`](../../components/receivers/clockreceiver/clockreceiver.go). 
-- **Defer/recover your goroutine.** Runtime-level - [`safeBase`](saferun.go) wraps `ConsumeMetrics`/`Traces`/`Logs` — - panics there are recovered. A panic *inside* a receiver's own - goroutine (e.g. the ticker handler before reaching `next.Consume*`) - is NOT caught and will crash the process. Always: - ```go - go func() { - defer func() { if r := recover(); r != nil { logger.Error("receiver panic", "panic", r) } }() - for { /* loop */ } - }() - ``` - -## Writing a processor - -No real processor ships in M1 yet. The canonical referent is the -contract test's `fakeProcessor` in -[`internal/pipeline/contract_test.go:82`](contract_test.go) — minimal, -read-only, forwards to `next`. - -**Shape:** struct with a `next consumer.` field set at -construction time. Methods: `Start(ctx, Host) error`, `Shutdown(ctx) error`, -`Capabilities() consumer.Capabilities`, `ConsumeMetrics/Traces/Logs(ctx, -payload) error`. No goroutines unless the processor batches or -asynchronously emits. - -**Factory pattern:** same dual-export as receivers — `var Factory -pipeline.ProcessorFactory = &factory{}` plus `func NewFactory() -pipeline.ProcessorFactory { return Factory }`. Per-signal -`CreateMetrics/CreateTraces/CreateLogs` take a `next consumer.X` and -wire it into the returned Processor. - -**Capabilities:** return `{MutatesData: true}` if the processor calls -any mutating method on the payload — anything that writes, resizes, -or reorders. Otherwise return the zero value (`MutatesData: false`). -This drives fan-out cloning decisions; getting it wrong leaks -mutations across siblings. - -**Pitfalls:** - -- **Don't capture `next` at config time.** It's passed to `CreateX` - per pipeline; a processor's instance is per-pipeline-reference, so - store `next` on the struct, not a package global. -- **Forward errors as-is.** Wrap only if the processor adds - recoverable context; the runtime's `safeConsumer` wrap already - catches panics, so processors don't need their own defer/recover. 
-- **No `MarkReadOnly` calls.** The fan-out marks payloads as it sees - fit; processors that downstream into multiple consumers should use - `internal/fanout` rather than rolling their own clone+dispatch. - -## Writing an exporter - -Mirror [`components/exporters/stdoutexporter/`](../../components/exporters/stdoutexporter) -— the canonical example. - -**Six files (same layout as receivers):** `config.go`, `factory.go`, -`.go`, `_test.go`, `README.md`, `example_config.yaml`. - -**Factory pattern:** `var Factory pipeline.ExporterFactory = &factory{}` -plus `func NewFactory() pipeline.ExporterFactory { return Factory }`. -Per-signal `CreateMetrics/CreateTraces/CreateLogs` returns the -exporter struct; unsupported signals return `nil, -pipeline.ErrSignalNotSupported`. See -[`factory.go`](../../components/exporters/stdoutexporter/factory.go). - -**No `next`:** exporters are leaves. The struct implements -`consumer.` directly and ships data out via I/O. See -[`stdoutexporter.go:58`](../../components/exporters/stdoutexporter/stdoutexporter.go). - -**Capabilities:** return `{MutatesData: false}` unless the exporter -mutates the payload before shipping (rare; usually exporters are -read-only). The runtime's fan-out clones for mutating consumers and -shares a single payload across read-only ones — -[`stdoutexporter.go:50`](../../components/exporters/stdoutexporter/stdoutexporter.go) -sets it explicitly to zero. - -**Pitfalls:** - -- **Synchronize stdout/stderr writers.** Concurrent goroutines may - call `ConsumeMetrics` if upstream fans out to a mutating peer; if - the exporter writes to a shared sink, guard with a `sync.Mutex`. -- **Don't block in `Shutdown`.** The runtime's drain budget is a hard - ceiling (10s default, 30s ceiling). Long flushes belong in - `ConsumeMetrics`; Shutdown only closes handles + cancels in-flight - work. -- **Bound retries.** An exporter that retries forever wedges Phase-2 - shutdown. 
Honor the ctx passed to `ConsumeMetrics`; abandon on - `ctx.Done()`. - -## Deferred from M1 - -Tracked in [RFC-0003 §"Deferred"](../../docs/rfcs/0003-pipeline-runtime-and-component-contract.md): -sealed factory interfaces (`mustEmbedDefaultFactory`), per-signal -stability metadata, `Extension`/`Connector` kinds, `GetFactory` on -`Host`. RFC required before adding any of these. diff --git a/internal/pipeline/bench_test.go b/internal/pipeline/bench_test.go deleted file mode 100644 index 7e98e319..00000000 --- a/internal/pipeline/bench_test.go +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "context" - "io" - "log/slog" - "strconv" - "testing" - - "go.opentelemetry.io/collector/pdata/pmetric" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// BenchmarkRuntime_StartShutdown measures the lifecycle overhead of -// the Runtime with N noop components attached to a single pipeline. -// The numbers establish the baseline against which future runtime -// changes are compared. Sub-benchmarks vary N to surface any -// super-linear costs in startup/teardown. -func BenchmarkRuntime_StartShutdown(b *testing.B) { - for _, n := range []int{10, 100} { - b.Run(strconv.Itoa(n), func(b *testing.B) { - components := make([]pipeline.Exporter, n) - for i := range components { - components[i] = noopBenchExporter{} - } - - b.ResetTimer() - for range b.N { - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), "bench"), - Exporters: components, - }}, pipeline.WithLogger(discardBenchLogger())) - if err := rt.Start(b.Context()); err != nil { - b.Fatal(err) - } - if err := rt.Shutdown(b.Context()); err != nil { - b.Fatal(err) - } - } - }) - } -} - -// BenchmarkWrapFirstDataMetrics_ConsumeMetrics measures the per-push -// overhead of the once-only "pipeline first data" log wrapper. 
The -// `Once.Do` fast path dominates after the first call; the benchmark -// runs many iterations through one wrapper so >99% of measurements -// reflect the already-fired path. -func BenchmarkWrapFirstDataMetrics_ConsumeMetrics(b *testing.B) { - next := noopBenchMetrics{} - wrap := pipeline.WrapFirstDataMetrics("metrics/bench", discardBenchLogger(), next) - - md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty() - - ctx := b.Context() - - b.ResetTimer() - for range b.N { - if err := wrap.ConsumeMetrics(ctx, md); err != nil { - b.Fatal(err) - } - } -} - -type noopBenchExporter struct{} - -func (noopBenchExporter) Start(context.Context, pipeline.Host) error { return nil } -func (noopBenchExporter) Shutdown(context.Context) error { return nil } - -type noopBenchMetrics struct{} - -func (noopBenchMetrics) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } -func (noopBenchMetrics) ConsumeMetrics(context.Context, pmetric.Metrics) error { - return nil -} - -func discardBenchLogger() *slog.Logger { - return slog.New(slog.NewTextHandler(io.Discard, nil)) -} diff --git a/internal/pipeline/chaos_test.go b/internal/pipeline/chaos_test.go deleted file mode 100644 index 8d32accd..00000000 --- a/internal/pipeline/chaos_test.go +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -//go:build chaos - -package pipeline_test - -import ( - "context" - "errors" - "io" - "log/slog" - "math/rand/v2" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.uber.org/goleak" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/pipeline" - "github.com/tracecoreai/tracecore/internal/runtime/lifecycle" -) - -// TestChaos_PanicReceiverPanicExporter is the M4b chaos rubric -// falsifier: pair a panic-or-error receiver goroutine (via -// lifecycle.Lifecycle's recover-then-onPanic seam) with a -// panic-or-error exporter 
(via pipeline.WrapSafeMetrics), run ≥100 -// iterations, and assert goleak sees zero leaked goroutines. -// -// Built only under -tags=chaos so it doesn't slow the default -// `make ci` budget; the chaos.yml workflow runs it nightly. -func TestChaos_PanicReceiverPanicExporter(t *testing.T) { - defer goleak.VerifyNone(t) // goleak tolerates internal test-harness goroutines but flags - // any user goroutine that outlives the test. No allow-lists - // here — a leak is a real bug to fix. - - const iterations = 100 - logger := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError})) - var ( - receiverPanics atomic.Int64 - exporterPanics atomic.Int64 - consumeErrors atomic.Int64 - ) - - for i := range iterations { - seed := uint64(i + 1) - rng := rand.New(rand.NewPCG(seed, seed^0x9E3779B97F4A7C15)) - - exporter := &chaosMetricsConsumer{rng: rng, panicCounter: &exporterPanics, errCounter: &consumeErrors} - safe := pipeline.WrapSafeMetrics("chaos/exporter", logger, exporter) - - lc := lifecycle.New(logger, func(_ any) { receiverPanics.Add(1) }) - err := lc.Start(context.Background(), func(ctx context.Context) { - chaosReceiverRun(ctx, rng, safe) - }) - require.NoError(t, err) - - // Let the goroutine do real work, then shutdown. - time.Sleep(5 * time.Millisecond) - shutdownCtx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) - require.NoError(t, lc.Shutdown(shutdownCtx)) - cancel() - } - - // The bookkeeping is informational — if all panics are caught - // upstream, we expect non-zero values to log; if either is zero, - // the chaos RNG didn't roll one in 100 iterations — that's a - // signal to widen the chaos probability, not a test failure. 
- t.Logf("chaos iterations=%d receiverPanics=%d exporterPanics=%d consumeErrors=%d", - iterations, receiverPanics.Load(), exporterPanics.Load(), consumeErrors.Load()) - require.Positive(t, receiverPanics.Load()+exporterPanics.Load()+consumeErrors.Load(), - "chaos test rolled no panics/errors in 100 iterations — the RNG, the probability, or the harness is broken") -} - -// chaosReceiverRun is the panic-or-error receiver goroutine body. -// Pushes metrics into next; randomly panics with some probability so -// the lifecycle helper's onPanic seam exercises every iteration. -func chaosReceiverRun(ctx context.Context, rng *rand.Rand, next consumer.Metrics) { - deadline := time.Now().Add(20 * time.Millisecond) - for time.Now().Before(deadline) { - if ctx.Err() != nil { - return - } - switch rng.IntN(8) { - case 0: - panic("chaos-receiver: random panic") - case 1: - // Skip a tick — exercises the no-push branch. - default: - md := pmetric.NewMetrics() - _ = next.ConsumeMetrics(ctx, md) - } - } -} - -// chaosMetricsConsumer panics, errors, or succeeds on each -// ConsumeMetrics call. The wrapping WrapSafeMetrics layer must -// recover panics into errors without crashing the goroutine. -type chaosMetricsConsumer struct { - rng *rand.Rand - panicCounter *atomic.Int64 - errCounter *atomic.Int64 -} - -func (c *chaosMetricsConsumer) Capabilities() consumer.Capabilities { - return consumer.Capabilities{} -} - -func (c *chaosMetricsConsumer) ConsumeMetrics(_ context.Context, _ pmetric.Metrics) error { - switch c.rng.IntN(4) { - case 0: - c.panicCounter.Add(1) - panic("chaos-exporter: random panic") - case 1: - c.errCounter.Add(1) - return errors.New("chaos-exporter: random error") - default: - return nil - } -} - -// TestChaos_WrapSafeMetrics_RecoversPanics is the targeted falsifier -// for the receiver→exporter panic-recovery seam. 
The 100-iteration -// chaos loop above runs panics through the lifecycle helper's -// recover() at the goroutine boundary, which masks any regression -// in WrapSafeMetrics' own recover. This test pins WrapSafeMetrics -// directly: a panicking consumer must surface as an error, not a -// process crash, with the panic value reachable in the error string. -func TestChaos_WrapSafeMetrics_RecoversPanics(t *testing.T) { - logger := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError})) - panicker := &alwaysPanicMetricsConsumer{} - safe := pipeline.WrapSafeMetrics("chaos/exporter-direct", logger, panicker) - - err := safe.ConsumeMetrics(context.Background(), pmetric.NewMetrics()) - require.Error(t, err, "WrapSafeMetrics must convert panic to error") - require.Contains(t, err.Error(), "chaos/exporter-direct", - "error must name the wrapped component for operator triage") - require.Contains(t, err.Error(), "panic", - "error must mark the failure mode as a panic, not a regular error") - - // Falsifier check: the wrapper must stay usable after a panic. 
- err = safe.ConsumeMetrics(context.Background(), pmetric.NewMetrics()) - require.Error(t, err, "WrapSafeMetrics must remain usable after a recovered panic") -} - -type alwaysPanicMetricsConsumer struct{} - -func (alwaysPanicMetricsConsumer) Capabilities() consumer.Capabilities { - return consumer.Capabilities{} -} - -func (alwaysPanicMetricsConsumer) ConsumeMetrics(context.Context, pmetric.Metrics) error { - panic("chaos-direct: always panic") -} diff --git a/internal/pipeline/component.go b/internal/pipeline/component.go deleted file mode 100644 index f0cea71d..00000000 --- a/internal/pipeline/component.go +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline - -import ( - "context" - "log/slog" - - "go.opentelemetry.io/collector/pdata/pcommon" - "go.opentelemetry.io/otel/metric" -) - -// Component is the runtime lifecycle contract every receiver, processor, -// and exporter satisfies. Both methods MUST be idempotent: the runtime may -// call Shutdown without ever having called Start (e.g. when an earlier -// component's Start failed and shutdown unwinds the partial graph). -// -// Start receives a Host so the component can resolve extensions and report -// status without depending on package-level globals. Shutdown receives a -// context whose deadline reflects the operator's shutdown budget; honour -// it. See docs/rfcs/0003-pipeline-runtime-and-component-contract.md -// §"Two-phase shutdown" for the budget split between receivers and -// exporters. -type Component interface { - Start(ctx context.Context, host Host) error - Shutdown(ctx context.Context) error -} - -// Host is the runtime's downward-facing surface for a Component. The -// shape mirrors go.opentelemetry.io/collector/component.Host at -// v1.55.0 (collector v0.152.0): GetExtensions only. 
-// -// Component status reporting moved to a free function -// `componentstatus.ReportStatus(host, ev)` in M2 — see -// `internal/componentstatus` and docs/STRATEGY.md "Host.ReportStatus" -// divergence row. Hosts that want to record status events implement -// the optional `componentstatus.StatusReporter` interface. -// -// pipelinetest.NewHost() returns a no-op implementation suitable for -// component unit tests. -type Host interface { - // GetExtensions returns the runtime's loaded extensions keyed by ID. - // In M1 the runtime has no extensions; implementations return an - // empty map. Never returns nil. - GetExtensions() map[ID]Component -} - -// StatusEvent is an opaque status report from a Component to the Host. -// The concrete shape is intentionally minimal at M1; M2 will extend it -// once the self-telemetry surface lands. Components should treat it as -// a forward-compatible struct, not an enum to switch on. -type StatusEvent struct { - // Kind is a short string identifying the event ("starting", "ready", - // "stopping", "permanent-error", etc.). Free-form for now; M2 will - // enumerate the canonical values. - Kind string - - // Err carries the underlying error for fault events, nil otherwise. - Err error -} - -// TelemetrySettings carries the per-Component observability handles the -// runtime injects at construction time. The shape mirrors OTel -// component.TelemetrySettings at v1.55.0 minus TracerProvider (deferred -// to post-v1; see docs/STRATEGY.md). Logger stays slog (documented -// divergence from OTel's zap). -type TelemetrySettings struct { - // Logger is scoped to the Component (typically with attributes for - // the Component's kind and instance name pre-attached). - Logger *slog.Logger - - // MeterProvider is the OTel metric.MeterProvider the Component - // uses to acquire a Meter and register instruments. The runtime - // substitutes a noop provider when self-telemetry is disabled so - // receiver code never has to nil-check. 
- MeterProvider metric.MeterProvider - - // Resource describes the collector instance (host.name, service.name, - // etc.) and is attached to any data the Component emits about itself. - // Components should treat it as read-only. - Resource pcommon.Resource - - _ struct{} -} diff --git a/internal/pipeline/component_extension_test.go b/internal/pipeline/component_extension_test.go deleted file mode 100644 index 000d4f99..00000000 --- a/internal/pipeline/component_extension_test.go +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "context" - "io" - "log/slog" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/pcommon" - "go.opentelemetry.io/otel/metric" - "go.opentelemetry.io/otel/metric/noop" - - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// TestCreateSettings_HasBuildInfo pins M2: CreateSettings carries a -// BuildInfo struct so factories can stamp version metadata onto -// emitted data without reaching into a package-level global. The -// three fields mirror OTel component.BuildInfo at v1.55.0 (Command, -// Description, Version); tracecore's revision + build date stay in -// internal/version, not BuildInfo, to keep STRATEGY's M2 row of -// "Add BuildInfo + unkeyed-init guard" minimal. 
-func TestCreateSettings_HasBuildInfo(t *testing.T) { - t.Parallel() - - settings := pipeline.CreateSettings{ - ID: pipeline.MustNewID(pipeline.MustNewType("test"), "x"), - Telemetry: pipeline.TelemetrySettings{}, - BuildInfo: pipeline.BuildInfo{ - Command: "tracecore", - Description: "tracecore OTel-compatible collector", - Version: "v0.1.0", - }, - } - - require.Equal(t, "tracecore", settings.BuildInfo.Command) - require.Equal(t, "tracecore OTel-compatible collector", settings.BuildInfo.Description) - require.Equal(t, "v0.1.0", settings.BuildInfo.Version) -} - -// TestTelemetrySettings_HasMeterProvider pins M2: TelemetrySettings -// carries a metric.MeterProvider so receiver authors acquire a Meter -// in the same place they already get Logger + Resource. Mirrors OTel -// component.TelemetrySettings at v1.55.0. -func TestTelemetrySettings_HasMeterProvider(t *testing.T) { - t.Parallel() - - mp := noop.NewMeterProvider() - tel := pipeline.TelemetrySettings{ - Logger: slog.New(slog.NewTextHandler(io.Discard, nil)), - Resource: pcommon.NewResource(), - MeterProvider: mp, - } - - require.NotNil(t, tel.MeterProvider, "MeterProvider must round-trip") - // Receivers acquire a Meter from the provider; assert that path works. - meter := tel.MeterProvider.Meter("tracecore.test") - require.NotNil(t, meter) - // And that the noop meter doesn't panic on a real instrument call. - ctr, err := meter.Int64Counter("test.counter") - require.NoError(t, err) - ctr.Add(context.Background(), 1, metric.WithAttributes()) -} - -// TestTelemetrySettings_NilMeterProvider_NotPanicOnReceiverDefault -// pins the safety invariant for receivers built against the zero-value -// TelemetrySettings: M2's wire-up MUST default MeterProvider to a -// noop so receivers don't have to nil-check. The zero-value struct's -// MeterProvider IS nil; the runtime is responsible for substituting -// a noop. 
This test documents the contract via a comment that -// receivers must trust: "telSet.MeterProvider is never nil when the -// runtime constructs the settings." -func TestTelemetrySettings_ZeroValue_MeterProviderIsNil(t *testing.T) { - t.Parallel() - - var tel pipeline.TelemetrySettings - require.Nil(t, tel.MeterProvider, "zero-value documents runtime's responsibility to substitute noop") -} diff --git a/internal/pipeline/componentstate.go b/internal/pipeline/componentstate.go deleted file mode 100644 index 385cdcc9..00000000 --- a/internal/pipeline/componentstate.go +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline - -import ( - "context" - "sync" -) - -// ComponentState is a lifecycle-bookkeeping mixin that Components -// embed to satisfy the Start / Shutdown methods of Component without -// reimplementing the boilerplate. Receivers, processors, and exporters -// that need custom Start/Shutdown logic should override the embedded -// methods. -// -// Mirrors OTel Collector v0.152.0's -// service/internal/testcomponents/stateful_component.go — see -// docs/research/otel-graph-notes.md §8. Exported so M8+ receiver -// authors can embed it directly. -// -// Zero-value ComponentState is ready to use; no constructor needed. -type ComponentState struct { - mu sync.Mutex - started bool - stopped bool -} - -// Start records that the Component has started. The default -// implementation always returns nil; embedders override when they -// need to spawn goroutines, open connections, etc. Callers that -// override must still call s.SetStarted() (or chain to the embedded -// Start) so Started() reports correctly. -func (s *ComponentState) Start(context.Context, Host) error { - s.mu.Lock() - s.started = true - s.mu.Unlock() - return nil -} - -// Shutdown records that the Component has stopped. The default -// implementation always returns nil; embedders override when they -// need to cancel goroutines, close connections, etc. 
-func (s *ComponentState) Shutdown(context.Context) error { - s.mu.Lock() - s.stopped = true - s.mu.Unlock() - return nil -} - -// Started reports whether Start has been called at least once. -func (s *ComponentState) Started() bool { - s.mu.Lock() - defer s.mu.Unlock() - return s.started -} - -// Stopped reports whether Shutdown has been called at least once. -func (s *ComponentState) Stopped() bool { - s.mu.Lock() - defer s.mu.Unlock() - return s.stopped -} diff --git a/internal/pipeline/componentstate_test.go b/internal/pipeline/componentstate_test.go deleted file mode 100644 index c70c8cf1..00000000 --- a/internal/pipeline/componentstate_test.go +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "sync" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -func TestComponentState_ZeroValue_IsUsable(t *testing.T) { - t.Parallel() - - var s pipeline.ComponentState - require.False(t, s.Started(), "zero value reports not started") - require.False(t, s.Stopped(), "zero value reports not stopped") -} - -func TestComponentState_Start_FlipsStarted(t *testing.T) { - t.Parallel() - - var s pipeline.ComponentState - require.NoError(t, s.Start(t.Context(), nil)) - require.True(t, s.Started()) - require.False(t, s.Stopped(), "Start does not flip stopped") -} - -func TestComponentState_Shutdown_FlipsStopped(t *testing.T) { - t.Parallel() - - var s pipeline.ComponentState - require.NoError(t, s.Shutdown(t.Context())) - require.True(t, s.Stopped()) - require.False(t, s.Started(), "Shutdown does not flip started") -} - -func TestComponentState_StartThenShutdown_BothFlagsSet(t *testing.T) { - t.Parallel() - - var s pipeline.ComponentState - require.NoError(t, s.Start(t.Context(), nil)) - require.NoError(t, s.Shutdown(t.Context())) - require.True(t, s.Started()) - require.True(t, s.Stopped()) -} - -// TestComponentState_ConcurrentAccess pins the 
race-detector-clean -// contract: many goroutines calling Start/Shutdown/Started/Stopped -// in parallel must not race. M8+ Components may be polled by -// supervisors while Start is still finishing. -func TestComponentState_ConcurrentAccess(t *testing.T) { - t.Parallel() - - var s pipeline.ComponentState - - const n = 64 - var wg sync.WaitGroup - wg.Add(n * 4) - for range n { - go func() { defer wg.Done(); _ = s.Start(t.Context(), nil) }() - go func() { defer wg.Done(); _ = s.Shutdown(t.Context()) }() - go func() { defer wg.Done(); _ = s.Started() }() - go func() { defer wg.Done(); _ = s.Stopped() }() - } - wg.Wait() - - require.True(t, s.Started()) - require.True(t, s.Stopped()) -} - -// TestComponentState_AsComponent: embedding ComponentState in a struct -// makes that struct satisfy pipeline.Component without writing any -// Start/Shutdown code — the canonical use case. -func TestComponentState_AsComponent(t *testing.T) { - t.Parallel() - - var c pipeline.Component = &embedsState{} - require.NoError(t, c.Start(t.Context(), nil)) - require.NoError(t, c.Shutdown(t.Context())) -} - -type embedsState struct { - pipeline.ComponentState -} diff --git a/internal/pipeline/contract_test.go b/internal/pipeline/contract_test.go deleted file mode 100644 index 2cf9dc05..00000000 --- a/internal/pipeline/contract_test.go +++ /dev/null @@ -1,198 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Contract tests for the pipeline package — the surface that receiver -// authors and tracecore operators rely on. Unit tests live alongside -// their target code; this file is the cross-cutting acceptance check -// that proves the pieces work together the way a downstream author -// will use them. -// -// Contract claims: -// -// 1. A Component implementation slots into Pipeline.Receivers / -// Processors / Exporters and Runtime.Start/Shutdown drives its -// lifecycle. Pinned by: this file. -// 2. Push consumers (consumer.Metrics/Traces/Logs) carry data -// stage→stage. 
Pinned by: this file. -// 3. WrapFirstDataMetrics fires once when data first flows. Pinned -// by: this file plus firstdata_test.go (concurrency, error -// pass-through). -// 4. pipelinetest.New(t) is the test-time entry point for a single -// Component. Pinned by: pipelinetest/fixture_test.go. -// 5. safe.Call covers success / error / panic / deadline. Pinned by: -// internal/safe/safe_test.go. -// 6. tracecore collect --config= boots, idles, exits 0 -// on signal. Pinned by: cmd/tracecore/main_test.go. -// 7. Bad YAML produces path:line: errors. Pinned by: -// internal/config/load_test.go. -// 8. A non-empty config without factories is rejected clearly. -// Pinned by: cmd/tracecore/main_test.go. -// 9. components-gen is idempotent and validates types at gen-time. -// Pinned by: tools/components-gen/main_test.go + `make generate-check`. -// -// The test in this file exercises #1–#3 together because that's the -// receiver-author story no individual unit test covers. If a future -// change breaks the way the contract pieces compose, this test trips -// even when the individual unit tests still pass. - -package pipeline_test - -import ( - "bytes" - "context" - "log/slog" - "strings" - "sync/atomic" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/pmetric" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// fakeReceiver is the kind of type an M8 receiver author would write: -// a struct that satisfies Component and pushes data into a wired -// consumer.Metrics. PushOnce makes it deterministic for tests — no -// timers, no goroutines. 
-type fakeReceiver struct { - next consumer.Metrics - startCount atomic.Int32 - shutdownCount atomic.Int32 -} - -func (r *fakeReceiver) Start(_ context.Context, _ pipeline.Host) error { - r.startCount.Add(1) - return nil -} - -func (r *fakeReceiver) Shutdown(_ context.Context) error { - r.shutdownCount.Add(1) - return nil -} - -// PushOnce hands a single pmetric.Metrics to the next consumer. Used -// to drive the pipeline from the test goroutine. -func (r *fakeReceiver) PushOnce(ctx context.Context) error { - md := pmetric.NewMetrics() - md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() - return r.next.ConsumeMetrics(ctx, md) -} - -// fakeProcessor reads metrics, mutates nothing, forwards to next. The -// minimum that satisfies the Processor / consumer.Metrics contract. -type fakeProcessor struct { - next consumer.Metrics - consumeCount atomic.Int32 - startCount atomic.Int32 - shutdownCount atomic.Int32 -} - -func (p *fakeProcessor) Start(_ context.Context, _ pipeline.Host) error { - p.startCount.Add(1) - return nil -} - -func (p *fakeProcessor) Shutdown(_ context.Context) error { - p.shutdownCount.Add(1) - return nil -} - -func (p *fakeProcessor) ConsumeMetrics(ctx context.Context, md pmetric.Metrics) error { - p.consumeCount.Add(1) - return p.next.ConsumeMetrics(ctx, md) -} - -func (*fakeProcessor) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } - -// fakeExporter is the pipeline's leaf: it records what arrived. The -// kind of exporter an integration test of an M8 receiver would use. 
-type fakeExporter struct { - consumeCount atomic.Int32 - startCount atomic.Int32 - shutdownCount atomic.Int32 -} - -func (e *fakeExporter) Start(_ context.Context, _ pipeline.Host) error { - e.startCount.Add(1) - return nil -} - -func (e *fakeExporter) Shutdown(_ context.Context) error { - e.shutdownCount.Add(1) - return nil -} - -func (e *fakeExporter) ConsumeMetrics(_ context.Context, _ pmetric.Metrics) error { - e.consumeCount.Add(1) - return nil -} - -func (*fakeExporter) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } - -// TestContract_ReceiverAuthorEndToEnd is the acceptance test for -// claims #1–#3: a receiver author can wire components by hand, the -// Runtime drives their lifecycle, data pushes flow stage→stage, and -// the first-data log fires once. -// -// This test deliberately does NOT use factory-based assembly — that -// path (buildPipelines with non-empty Factories) is deferred until -// the first receiver lands. Receiver authors writing the first -// integration test will use exactly this hand-wired shape. -func TestContract_ReceiverAuthorEndToEnd(t *testing.T) { - t.Parallel() - - logBuf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(logBuf, &slog.HandlerOptions{Level: slog.LevelInfo})) - - // Wire bottom-up so each stage knows its `next` at construction. - // This is the shape an M8 receiver factory will produce inside - // CreateMetrics: receiver-with-wrapped-next. 
- exp := &fakeExporter{} - proc := &fakeProcessor{next: exp} - wrapped := pipeline.WrapFirstDataMetrics("metrics/primary", logger, proc) - rcv := &fakeReceiver{next: wrapped} - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), "primary"), - Receivers: []pipeline.Receiver{rcv}, - Processors: []pipeline.Processor{proc}, - Exporters: []pipeline.Exporter{exp}, - }}, pipeline.WithLogger(logger)) - - require.NoError(t, rt.Start(t.Context()), - "Claim #1: Runtime.Start drives the Component lifecycle") - - require.Equal(t, int32(1), rcv.startCount.Load(), - "Claim #1: receiver Start invoked exactly once") - require.Equal(t, int32(1), proc.startCount.Load(), - "Claim #1: processor Start invoked exactly once") - require.Equal(t, int32(1), exp.startCount.Load(), - "Claim #1: exporter Start invoked exactly once") - - // Push twice to exercise: data flow (Claim #2) and first-data - // being once-per-process-per-pipeline (Claim #3). - require.NoError(t, rcv.PushOnce(t.Context())) - require.NoError(t, rcv.PushOnce(t.Context())) - - require.Equal(t, int32(2), proc.consumeCount.Load(), - "Claim #2: every push reaches the processor") - require.Equal(t, int32(2), exp.consumeCount.Load(), - "Claim #2: every push reaches the exporter") - - logOutput := logBuf.String() - require.Equal(t, 1, strings.Count(logOutput, "pipeline first data"), - "Claim #3: WrapFirstDataMetrics fires exactly once across N pushes") - require.Contains(t, logOutput, `pipeline=metrics/primary`, - "Claim #3: first-data log carries the pipeline ID") - - require.NoError(t, rt.Shutdown(t.Context()), - "Claim #1: Runtime.Shutdown tears down cleanly") - - require.Equal(t, int32(1), rcv.shutdownCount.Load(), - "Claim #1: receiver Shutdown invoked exactly once") - require.Equal(t, int32(1), proc.shutdownCount.Load(), - "Claim #1: processor Shutdown invoked exactly once") - require.Equal(t, int32(1), exp.shutdownCount.Load(), - "Claim #1: exporter Shutdown invoked 
exactly once") -} diff --git a/internal/pipeline/doc.go b/internal/pipeline/doc.go deleted file mode 100644 index 5637b3dc..00000000 --- a/internal/pipeline/doc.go +++ /dev/null @@ -1,8 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package pipeline defines the runtime contracts that every receiver, -// processor, and exporter satisfies, plus the Start/Shutdown lifecycle -// the binary uses to wire them together. -// -// Design: see docs/rfcs/0003-pipeline-runtime-and-component-contract.md. -package pipeline diff --git a/internal/pipeline/example_receiver_recover_test.go b/internal/pipeline/example_receiver_recover_test.go deleted file mode 100644 index 8a8298e7..00000000 --- a/internal/pipeline/example_receiver_recover_test.go +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "context" - "sync" - "testing" - "time" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// TestReceiver_GoroutineDeferRecover_KeepsProcessAlive verifies the -// recipe documented in internal/pipeline/README.md's receiver-author -// Pitfalls list: a panic INSIDE a receiver's own goroutine (not -// reached via next.Consume*) crashes the process — saferun only -// wraps the consumer seam. Receivers must defer/recover in their -// own goroutines. -// -// This test exercises the correct recipe: the panicking-receiver -// stub uses `defer func() { recover() }()` inside its goroutine, -// and the test asserts (a) the goroutine exits cleanly and (b) the -// runtime's Shutdown returns without error. Without the recover, -// the process would terminate before the assertions run. 
-func TestReceiver_GoroutineDeferRecover_KeepsProcessAlive(t *testing.T) { - t.Parallel() - - r := &panickingReceiver{} - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("test"), ""), - Receivers: []pipeline.Receiver{r}, - }}, pipeline.WithLogger(discardLogger())) - - require.NoError(t, rt.Start(t.Context())) - - // Give the receiver's goroutine time to fire its panic. - r.panicked.Wait() - - require.NoError(t, rt.Shutdown(t.Context()), - "runtime must shut down cleanly after a receiver goroutine recovered its own panic") -} - -// panickingReceiver simulates the documented receiver-author recipe: -// the inner goroutine fires a panic, recovers it locally, and exits. -// In a real receiver the recover would log + best-effort report-status; -// here we just signal that the panic-and-recover happened. -type panickingReceiver struct { - pipeline.ComponentState - panicked sync.WaitGroup -} - -func (r *panickingReceiver) Start(ctx context.Context, host pipeline.Host) error { - r.panicked.Add(1) - go func() { - // The recipe: every receiver-owned goroutine must defer/recover. - // The runtime's saferun wraps Consume* calls, not goroutine bodies. - defer func() { - if rec := recover(); rec != nil { - _ = rec // real receivers log + report status here - } - r.panicked.Done() - }() - // Simulate work, then a bug. 
- time.Sleep(1 * time.Millisecond) - panic("simulated receiver bug") - }() - return r.ComponentState.Start(ctx, host) //nolint:wrapcheck // pass-through to mixin -} diff --git a/internal/pipeline/factory.go b/internal/pipeline/factory.go deleted file mode 100644 index 1beaa11f..00000000 --- a/internal/pipeline/factory.go +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline - -import ( - "context" - "errors" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -// ErrSignalNotSupported is the sentinel a factory returns from a -// per-signal create method when the Component does not implement that -// signal (e.g. a metrics-only receiver's CreateTraces). The runtime -// matches with errors.Is and surfaces a clear "receiver X does not -// support signal Y" message to the operator. -var ErrSignalNotSupported = errors.New("signal not supported by this component") - -// Config is the marker interface a Component's config struct satisfies. -// Components define their own concrete config types (see -// `STYLE.md` §"Component layout"); the runtime treats them opaquely. -// -// Validate is invoked by the loader after YAML unmarshalling and before -// the runtime hands the config to a factory. Implementations should -// return errors with operator-readable messages naming the offending -// field. -type Config interface { - Validate() error -} - -// CreateSettings is the bundle a factory's create-X method receives at -// construction time. -// -// The trailing unexported `_ struct{}` rejects positional struct -// literals at compile time, so adding fields stays non-breaking. -// Mirrors OTel component.Settings shape at v1.55.0 (see -// docs/STRATEGY.md M2 row for "CreateSettings shape"). -type CreateSettings struct { - ID ID - Telemetry TelemetrySettings - BuildInfo BuildInfo - - _ struct{} -} - -// BuildInfo carries the binary identity factories may stamp onto -// emitted data. 
Mirrors go.opentelemetry.io/collector/component.BuildInfo -// at v1.55.0: three fields, plus the unkeyed-init guard. The richer -// version metadata tracecore carries (revision, build date) lives in -// `internal/version` and is not duplicated here. -type BuildInfo struct { - // Command is the executable file name, e.g. "tracecore". - Command string - - // Description is a human-readable name, e.g. "tracecore - // telemetry collector". - Description string - - // Version is the binary version string, e.g. "v0.1.0". - Version string - - _ struct{} -} - -// ReceiverFactory creates Components that pull data into a pipeline. -// One factory per kind handles all three signals; the unsupported -// signals return ErrSignalNotSupported. -// -// next is the consumer the receiver pushes into. The factory wires it -// into the returned Receiver at construction time; the Receiver does -// not look it up dynamically. -type ReceiverFactory interface { - Type() Type - CreateDefaultConfig() Config - - CreateMetrics(ctx context.Context, set CreateSettings, cfg Config, next consumer.Metrics) (Receiver, error) - CreateTraces(ctx context.Context, set CreateSettings, cfg Config, next consumer.Traces) (Receiver, error) - CreateLogs(ctx context.Context, set CreateSettings, cfg Config, next consumer.Logs) (Receiver, error) -} - -// Receiver is a Component returned by ReceiverFactory.CreateX. The -// type is a named alias so Pipeline.Receivers reads naturally; it -// adds no methods beyond Component. -type Receiver interface { - Component -} - -// ProcessorFactory creates Components that read from one pipeline stage -// and write to the next. The processor implements the matching -// consumer.X interface for its input signal; the next consumer is -// passed at construction time. 
-type ProcessorFactory interface { - Type() Type - CreateDefaultConfig() Config - - CreateMetrics(ctx context.Context, set CreateSettings, cfg Config, next consumer.Metrics) (Processor, error) - CreateTraces(ctx context.Context, set CreateSettings, cfg Config, next consumer.Traces) (Processor, error) - CreateLogs(ctx context.Context, set CreateSettings, cfg Config, next consumer.Logs) (Processor, error) -} - -// Processor is a Component returned by ProcessorFactory.CreateX. The -// type is a named alias; it adds no methods beyond Component. -type Processor interface { - Component -} - -// ExporterFactory creates Components that ship data out of the -// collector. The exporter implements the matching consumer.X for its -// signal; there is no "next" — exporters are the pipeline's leaves. -type ExporterFactory interface { - Type() Type - CreateDefaultConfig() Config - - CreateMetrics(ctx context.Context, set CreateSettings, cfg Config) (Exporter, error) - CreateTraces(ctx context.Context, set CreateSettings, cfg Config) (Exporter, error) - CreateLogs(ctx context.Context, set CreateSettings, cfg Config) (Exporter, error) -} - -// Exporter is a Component returned by ExporterFactory.CreateX. The -// type is a named alias; it adds no methods beyond Component. -type Exporter interface { - Component -} - -// Factories is the runtime's view of every Component the binary knows -// about. Keyed by Type so the config loader can resolve a YAML key -// like `receivers.dcgm` to the right factory. -// -// The struct is populated by `cmd/tracecore/components.go`, which is -// generated from `components.yaml` by `tools/components-gen` — -// receivers can land in parallel without conflicting on this file. 
-type Factories struct { - Receivers map[Type]ReceiverFactory - Processors map[Type]ProcessorFactory - Exporters map[Type]ExporterFactory -} diff --git a/internal/pipeline/firstdata.go b/internal/pipeline/firstdata.go deleted file mode 100644 index 8deb87a1..00000000 --- a/internal/pipeline/firstdata.go +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline - -import ( - "context" - "log/slog" - "sync" - - "go.opentelemetry.io/collector/pdata/plog" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.opentelemetry.io/collector/pdata/ptrace" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -type firstDataBase[T any, C capable] struct { - pipeline string - logger *slog.Logger - once sync.Once - next C - invoke func(c C, ctx context.Context, payload T) error - itemCount func(T) int -} - -// Capabilities pass-through preserves the fan-out cloning decision -// that was made on the wrapped consumer. -func (f *firstDataBase[T, C]) Capabilities() consumer.Capabilities { return f.next.Capabilities() } - -func (f *firstDataBase[T, C]) fireAndForward(ctx context.Context, payload T) error { - f.once.Do(func() { - f.logger.Info("pipeline first data", "pipeline", f.pipeline, "item_count", f.itemCount(payload)) - }) - return f.invoke(f.next, ctx, payload) -} - -// WrapFirstDataMetrics logs "pipeline first data" exactly once per -// pipeline so operators can verify data flow without external tooling -// (operator UX criterion #4). -func WrapFirstDataMetrics(pipelineName string, logger *slog.Logger, next consumer.Metrics) consumer.Metrics { - f := &firstDataMetrics{} - f.pipeline, f.logger, f.next = pipelineName, logger, next - f.invoke = func(c consumer.Metrics, ctx context.Context, d pmetric.Metrics) error { - return c.ConsumeMetrics(ctx, d) - } - f.itemCount = func(d pmetric.Metrics) int { return d.DataPointCount() } - return f -} - -// WrapFirstDataTraces is the ptrace.Traces counterpart of WrapFirstDataMetrics. 
-func WrapFirstDataTraces(pipelineName string, logger *slog.Logger, next consumer.Traces) consumer.Traces { - f := &firstDataTraces{} - f.pipeline, f.logger, f.next = pipelineName, logger, next - f.invoke = func(c consumer.Traces, ctx context.Context, d ptrace.Traces) error { - return c.ConsumeTraces(ctx, d) - } - f.itemCount = func(d ptrace.Traces) int { return d.SpanCount() } - return f -} - -// WrapFirstDataLogs is the plog.Logs counterpart of WrapFirstDataMetrics. -func WrapFirstDataLogs(pipelineName string, logger *slog.Logger, next consumer.Logs) consumer.Logs { - f := &firstDataLogs{} - f.pipeline, f.logger, f.next = pipelineName, logger, next - f.invoke = func(c consumer.Logs, ctx context.Context, d plog.Logs) error { - return c.ConsumeLogs(ctx, d) - } - f.itemCount = func(d plog.Logs) int { return d.LogRecordCount() } - return f -} - -type firstDataMetrics struct { - firstDataBase[pmetric.Metrics, consumer.Metrics] -} - -func (f *firstDataMetrics) ConsumeMetrics(ctx context.Context, md pmetric.Metrics) error { - return f.fireAndForward(ctx, md) -} - -type firstDataTraces struct { - firstDataBase[ptrace.Traces, consumer.Traces] -} - -func (f *firstDataTraces) ConsumeTraces(ctx context.Context, td ptrace.Traces) error { - return f.fireAndForward(ctx, td) -} - -type firstDataLogs struct { - firstDataBase[plog.Logs, consumer.Logs] -} - -func (f *firstDataLogs) ConsumeLogs(ctx context.Context, ld plog.Logs) error { - return f.fireAndForward(ctx, ld) -} diff --git a/internal/pipeline/firstdata_test.go b/internal/pipeline/firstdata_test.go deleted file mode 100644 index cbfefc5d..00000000 --- a/internal/pipeline/firstdata_test.go +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "bytes" - "context" - "errors" - "log/slog" - "strings" - "sync" - "sync/atomic" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/plog" - "go.opentelemetry.io/collector/pdata/pmetric" 
- "go.opentelemetry.io/collector/pdata/ptrace" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -func TestWrapFirstDataMetrics_LogsOnlyOnce(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelInfo})) - - next := &metricsSink{} - wrapped := pipeline.WrapFirstDataMetrics("metrics/primary", logger, next) - - for range 5 { - require.NoError(t, wrapped.ConsumeMetrics(t.Context(), pmetric.NewMetrics())) - } - require.Equal(t, int32(5), next.calls.Load(), "every push must reach next") - - occurrences := strings.Count(buf.String(), "pipeline first data") - require.Equal(t, 1, occurrences, "first-data line must fire exactly once") - require.Contains(t, buf.String(), "metrics/primary") -} - -func TestWrapFirstDataTraces_LogsOnlyOnce(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelInfo})) - - next := &tracesSink{} - wrapped := pipeline.WrapFirstDataTraces("traces/primary", logger, next) - - require.NoError(t, wrapped.ConsumeTraces(t.Context(), ptrace.NewTraces())) - require.NoError(t, wrapped.ConsumeTraces(t.Context(), ptrace.NewTraces())) - - require.Equal(t, 1, strings.Count(buf.String(), "pipeline first data")) - require.Contains(t, buf.String(), "traces/primary") -} - -func TestWrapFirstDataLogs_LogsOnlyOnce(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelInfo})) - - next := &logsSink{} - wrapped := pipeline.WrapFirstDataLogs("logs/primary", logger, next) - - require.NoError(t, wrapped.ConsumeLogs(t.Context(), plog.NewLogs())) - require.NoError(t, wrapped.ConsumeLogs(t.Context(), plog.NewLogs())) - - require.Equal(t, 1, strings.Count(buf.String(), "pipeline first data")) - require.Contains(t, buf.String(), "logs/primary") -} - -func 
TestWrapFirstDataMetrics_ConcurrentFirstPushes_LogOnce(t *testing.T) { - t.Parallel() - - buf := &syncBuffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelInfo})) - - next := &metricsSink{} - wrapped := pipeline.WrapFirstDataMetrics("metrics/primary", logger, next) - - const goroutines = 32 - var wg sync.WaitGroup - wg.Add(goroutines) - for range goroutines { - go func() { - defer wg.Done() - _ = wrapped.ConsumeMetrics(t.Context(), pmetric.NewMetrics()) - }() - } - wg.Wait() - - require.Equal(t, int32(goroutines), next.calls.Load()) - require.Equal(t, 1, strings.Count(buf.String(), "pipeline first data")) -} - -// TestWrapFirstDataMetrics_EmptyPayload_StillLogs pins the current -// semantic: "first data" means first PUSH ATTEMPT, not first non-empty -// push. An empty pmetric.Metrics still triggers the log with -// item_count=0 — operationally meaningful because it tells operators -// the pipeline is alive even before real data flows. If a future -// design wants "first non-zero count," this test catches that -// regression. -func TestWrapFirstDataMetrics_EmptyPayload_StillLogs(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelInfo})) - - next := &metricsSink{} - wrapped := pipeline.WrapFirstDataMetrics("metrics/primary", logger, next) - - empty := pmetric.NewMetrics() // DataPointCount() == 0 - require.NoError(t, wrapped.ConsumeMetrics(t.Context(), empty)) - - require.Contains(t, buf.String(), "pipeline first data") - require.Contains(t, buf.String(), "item_count=0", - "empty payload still fires once with count=0") -} - -// TestRuntime_SharedComponentAcrossPipelines_StartedTwice pins the -// current behavior for the same Component instance appearing in two -// Pipelines: it is Started twice and Shutdown twice. Component contract -// says lifecycle methods are idempotent, but multiplicity is surprising. 
-// If we ever want dedup, this test catches the change. -func TestRuntime_SharedComponentAcrossPipelines_StartedTwice(t *testing.T) { - t.Parallel() - - shared := &countingComponent{} - - rt := pipeline.NewRuntime([]pipeline.Pipeline{ - { - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), "a"), - Exporters: []pipeline.Exporter{shared}, - }, - { - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), "b"), - Exporters: []pipeline.Exporter{shared}, - }, - }, pipeline.WithLogger(slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)))) - - require.NoError(t, rt.Start(t.Context())) - require.NoError(t, rt.Shutdown(t.Context())) - - require.Equal(t, int32(2), shared.starts.Load(), - "shared Component started once per pipeline reference (no dedup today)") - require.Equal(t, int32(2), shared.shutdowns.Load(), - "shared Component shutdown once per pipeline reference (no dedup today)") -} - -type countingComponent struct { - starts atomic.Int32 - shutdowns atomic.Int32 -} - -func (c *countingComponent) Start(context.Context, pipeline.Host) error { - c.starts.Add(1) - return nil -} - -func (c *countingComponent) Shutdown(context.Context) error { - c.shutdowns.Add(1) - return nil -} - -func TestWrapFirstDataMetrics_NextErrorPropagates(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, nil)) - - wantErr := errors.New("downstream wedged") - next := &metricsSink{returnErr: wantErr} - wrapped := pipeline.WrapFirstDataMetrics("metrics/primary", logger, next) - - err := wrapped.ConsumeMetrics(t.Context(), pmetric.NewMetrics()) - require.ErrorIs(t, err, wantErr) -} - -type metricsSink struct { - calls atomic.Int32 - returnErr error -} - -func (s *metricsSink) ConsumeMetrics(_ context.Context, _ pmetric.Metrics) error { - s.calls.Add(1) - return s.returnErr -} - -func (*metricsSink) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } - -type tracesSink struct { - calls atomic.Int32 -} - -func (s 
*tracesSink) ConsumeTraces(_ context.Context, _ ptrace.Traces) error { - s.calls.Add(1) - return nil -} - -func (*tracesSink) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } - -type logsSink struct { - calls atomic.Int32 -} - -func (s *logsSink) ConsumeLogs(_ context.Context, _ plog.Logs) error { - s.calls.Add(1) - return nil -} - -func (*logsSink) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } - -// syncBuffer is a bytes.Buffer with a mutex so multiple goroutines' -// slog writes don't tear under the race detector. -type syncBuffer struct { - mu sync.Mutex - buf bytes.Buffer -} - -func (b *syncBuffer) Write(p []byte) (int, error) { - b.mu.Lock() - defer b.mu.Unlock() - return b.buf.Write(p) -} - -func (b *syncBuffer) String() string { - b.mu.Lock() - defer b.mu.Unlock() - return b.buf.String() -} diff --git a/internal/pipeline/id.go b/internal/pipeline/id.go deleted file mode 100644 index d11fb1c6..00000000 --- a/internal/pipeline/id.go +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline - -import ( - "fmt" - "regexp" -) - -// typeRegex mirrors go.opentelemetry.io/collector/component (v0.152.0): -// a leading ASCII letter followed by up to 62 letters, digits, or -// underscores. Copied verbatim so receiver kind names port one-to-one -// between OTel and tracecore. -var typeRegex = regexp.MustCompile(`^[a-zA-Z][0-9a-zA-Z_]{0,62}$`) - -// Type is the kind of a Component (e.g. "dcgm", "otlp", "batch"). -// Construct with NewType so callers cannot bypass validation. -type Type struct { - name string -} - -// NewType validates s against typeRegex. -func NewType(s string) (Type, error) { - if !typeRegex.MatchString(s) { - return Type{}, fmt.Errorf("invalid component type %q: must match %s", s, typeRegex.String()) - } - return Type{name: s}, nil -} - -// MustNewType panics on invalid input. 
Use only for compile-time -// constants — generated factory wiring, test setup — never for -// operator-supplied data. -func MustNewType(s string) Type { - t, err := NewType(s) - if err != nil { - panic(err) - } - return t -} - -// String returns the validated type name. -func (t Type) String() string { return t.name } - -// IsZero reports whether t was never constructed via NewType. -func (t Type) IsZero() bool { return t.name == "" } - -// ID names a single Component instance inside a pipeline. The kind is -// the Component's Type; the optional name disambiguates two instances -// of the same kind (e.g. "otlp" vs "otlp/secondary"). The name uses -// the same character set as Type — letters, digits, underscores — so -// log lines and YAML keys parse unambiguously. -type ID struct { - kind Type - name string -} - -// NewID validates name against typeRegex (empty is allowed) and -// returns an ID. Restricting name to the same character set as Type -// prevents slashes or other special chars from appearing in ID.String, -// which would collide with the kind/name separator. -func NewID(kind Type, name string) (ID, error) { - if name != "" && !typeRegex.MatchString(name) { - return ID{}, fmt.Errorf("invalid component instance name %q: must match %s", name, typeRegex.String()) - } - return ID{kind: kind, name: name}, nil -} - -// MustNewID is NewID that panics on invalid name. Use for compile-time -// constants only. -func MustNewID(kind Type, name string) ID { - id, err := NewID(kind, name) - if err != nil { - panic(err) - } - return id -} - -// ValidateInstanceName reports whether name passes the same regex as -// component types. An empty name is allowed (it stringifies an ID to -// just the kind). Exported so the config loader can fail fast on -// bad pipeline-key instance names rather than letting them surface -// later at NewID construction. 
-func ValidateInstanceName(name string) error { - if name == "" { - return nil - } - if !typeRegex.MatchString(name) { - return fmt.Errorf("invalid instance name %q: must match %s", name, typeRegex.String()) - } - return nil -} - -// Kind returns the Component type. -func (id ID) Kind() Type { return id.kind } - -// Name returns the optional instance name. -func (id ID) Name() string { return id.name } - -// String returns "kind" when name is empty, "kind/name" otherwise. -func (id ID) String() string { - if id.name == "" { - return id.kind.name - } - return id.kind.name + "/" + id.name -} diff --git a/internal/pipeline/id_test.go b/internal/pipeline/id_test.go deleted file mode 100644 index 00ae2cce..00000000 --- a/internal/pipeline/id_test.go +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "strings" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -func TestNewType(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - input string - wantErr bool - }{ - {name: "simple lowercase", input: "dcgm"}, - {name: "mixed case", input: "OTLP"}, - {name: "with digits", input: "ncc1l2"}, - {name: "with underscore", input: "k8s_events"}, - {name: "single letter", input: "a"}, - {name: "max length 63", input: "a" + strings.Repeat("x", 62)}, - - {name: "empty", input: "", wantErr: true}, - {name: "leading digit", input: "9dcgm", wantErr: true}, - {name: "leading underscore", input: "_dcgm", wantErr: true}, - {name: "contains dash", input: "k8s-events", wantErr: true}, - {name: "contains dot", input: "otel.otlp", wantErr: true}, - {name: "contains slash", input: "otlp/secondary", wantErr: true}, - {name: "over length 64", input: "a" + strings.Repeat("x", 63), wantErr: true}, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - got, err := pipeline.NewType(tc.input) - if tc.wantErr { - 
require.Error(t, err) - require.True(t, got.IsZero(), "zero-valued Type on error") - return - } - require.NoError(t, err) - require.Equal(t, tc.input, got.String()) - require.False(t, got.IsZero()) - }) - } -} - -func TestMustNewType_PanicsOnInvalid(t *testing.T) { - t.Parallel() - - require.PanicsWithError(t, - `invalid component type "bad-name": must match ^[a-zA-Z][0-9a-zA-Z_]{0,62}$`, - func() { pipeline.MustNewType("bad-name") }, - ) -} - -func TestID_String(t *testing.T) { - t.Parallel() - - otlp := pipeline.MustNewType("otlp") - - tests := []struct { - label string - id pipeline.ID - want string - }{ - {label: "kind only", id: pipeline.MustNewID(otlp, ""), want: "otlp"}, - {label: "kind and name", id: pipeline.MustNewID(otlp, "secondary"), want: "otlp/secondary"}, - } - - for _, tc := range tests { - t.Run(tc.label, func(t *testing.T) { - t.Parallel() - require.Equal(t, tc.want, tc.id.String()) - require.Equal(t, otlp, tc.id.Kind()) - }) - } -} - -func TestNewID_RejectsInvalidName(t *testing.T) { - t.Parallel() - - otlp := pipeline.MustNewType("otlp") - - tests := []struct { - label string - name string - }{ - {label: "contains slash", name: "primary/secondary"}, - {label: "contains dash", name: "primary-instance"}, - {label: "contains dot", name: "primary.dev"}, - {label: "leading digit", name: "1primary"}, - } - - for _, tc := range tests { - t.Run(tc.label, func(t *testing.T) { - t.Parallel() - _, err := pipeline.NewID(otlp, tc.name) - require.Error(t, err) - require.Contains(t, err.Error(), "invalid component instance name") - }) - } -} diff --git a/internal/pipeline/pipelinetest/doc.go b/internal/pipeline/pipelinetest/doc.go deleted file mode 100644 index 5e6bb40e..00000000 --- a/internal/pipeline/pipelinetest/doc.go +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package pipelinetest provides stub Host, TelemetrySettings, and -// consumer implementations for use in Component unit tests. 
The goal -// is that a receiver test writes -// -// fx := pipelinetest.New(t) -// -// instead of thirty lines of stub-host wiring per test. M8-M16 receiver -// authors are the primary consumers. -// -// Stubs are hand-rolled — no testify mock, no gomock — and record calls -// atomically so race-detector runs stay clean. -package pipelinetest diff --git a/internal/pipeline/pipelinetest/fakes.go b/internal/pipeline/pipelinetest/fakes.go deleted file mode 100644 index 6c3d6866..00000000 --- a/internal/pipeline/pipelinetest/fakes.go +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinetest - -import ( - "bytes" - "context" - "sync" - "sync/atomic" - - "go.opentelemetry.io/collector/pdata/pmetric" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -// SyncBuffer is a concurrent-safe bytes.Buffer for integration tests -// where an exec.Cmd writer races a test goroutine reader. -type SyncBuffer struct { - mu sync.Mutex - buf bytes.Buffer -} - -// Write satisfies io.Writer. -func (b *SyncBuffer) Write(p []byte) (int, error) { - b.mu.Lock() - defer b.mu.Unlock() - return b.buf.Write(p) -} - -// String returns a snapshot of the buffer's current contents. -func (b *SyncBuffer) String() string { - b.mu.Lock() - defer b.mu.Unlock() - return b.buf.String() -} - -// RecordingMetricsSink captures pushed metrics for test assertion. -// Pushed buffers up to 128 payloads; further pushes drop silently -// so a slow test can't wedge the producer goroutine. -type RecordingMetricsSink struct { - Pushed chan pmetric.Metrics - count atomic.Int32 -} - -// NewRecordingMetricsSink returns a sink with a 128-entry buffer. -func NewRecordingMetricsSink() *RecordingMetricsSink { - return &RecordingMetricsSink{Pushed: make(chan pmetric.Metrics, 128)} -} - -// ConsumeMetrics records the payload and bumps the call counter. 
-func (s *RecordingMetricsSink) ConsumeMetrics(_ context.Context, md pmetric.Metrics) error { - s.count.Add(1) - select { - case s.Pushed <- md: - default: - } - return nil -} - -// Capabilities reports MutatesData=false. -func (*RecordingMetricsSink) Capabilities() consumer.Capabilities { - return consumer.Capabilities{} -} - -// Count returns the cumulative ConsumeMetrics call count. -func (s *RecordingMetricsSink) Count() int32 { return s.count.Load() } - -// FailingMetricsSink returns Err from every ConsumeMetrics call. -// Setting Err to nil reverts to silent-success — guard against that -// in tests that mean to exercise the failure path. -type FailingMetricsSink struct { - Err error - calls atomic.Int32 -} - -// ConsumeMetrics increments the call counter and returns s.Err. -func (s *FailingMetricsSink) ConsumeMetrics(_ context.Context, _ pmetric.Metrics) error { - s.calls.Add(1) - return s.Err -} - -// Capabilities reports MutatesData=false. -func (*FailingMetricsSink) Capabilities() consumer.Capabilities { - return consumer.Capabilities{MutatesData: false} -} - -// Calls returns the cumulative ConsumeMetrics call count. -func (s *FailingMetricsSink) Calls() int32 { return s.calls.Load() } diff --git a/internal/pipeline/pipelinetest/fakes_test.go b/internal/pipeline/pipelinetest/fakes_test.go deleted file mode 100644 index d4601e84..00000000 --- a/internal/pipeline/pipelinetest/fakes_test.go +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinetest_test - -import ( - "context" - "errors" - "sync" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/pmetric" - - "github.com/tracecoreai/tracecore/internal/pipeline/pipelinetest" -) - -// TestSyncBuffer_ConcurrentWritesAreSafe pins the concurrent-safety -// contract: parallel writers don't race, every byte lands. 
-func TestSyncBuffer_ConcurrentWritesAreSafe(t *testing.T) { - t.Parallel() - - var b pipelinetest.SyncBuffer - const writers = 16 - const writesPerWriter = 100 - - var wg sync.WaitGroup - wg.Add(writers) - for range writers { - go func() { - defer wg.Done() - for range writesPerWriter { - _, _ = b.Write([]byte("x")) - } - }() - } - wg.Wait() - - require.Len(t, b.String(), writers*writesPerWriter) -} - -// TestRecordingMetricsSink_CountsAndCaptures pins the recording -// contract. -func TestRecordingMetricsSink_CountsAndCaptures(t *testing.T) { - t.Parallel() - - sink := pipelinetest.NewRecordingMetricsSink() - md := pmetric.NewMetrics() - - require.NoError(t, sink.ConsumeMetrics(context.Background(), md)) - require.NoError(t, sink.ConsumeMetrics(context.Background(), md)) - require.EqualValues(t, 2, sink.Count()) - require.False(t, sink.Capabilities().MutatesData) - - select { - case <-sink.Pushed: - default: - t.Fatal("expected a payload on Pushed channel") - } -} - -// TestFailingMetricsSink_ReturnsErrAndCounts pins the failing -// contract: every call increments and returns the configured error. 
-func TestFailingMetricsSink_ReturnsErrAndCounts(t *testing.T) { - t.Parallel() - - want := errors.New("boom") - sink := &pipelinetest.FailingMetricsSink{Err: want} - md := pmetric.NewMetrics() - - require.ErrorIs(t, sink.ConsumeMetrics(context.Background(), md), want) - require.ErrorIs(t, sink.ConsumeMetrics(context.Background(), md), want) - require.EqualValues(t, 2, sink.Calls()) - require.False(t, sink.Capabilities().MutatesData) -} diff --git a/internal/pipeline/pipelinetest/fixture.go b/internal/pipeline/pipelinetest/fixture.go deleted file mode 100644 index 79cf68ac..00000000 --- a/internal/pipeline/pipelinetest/fixture.go +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinetest - -import ( - "io" - "log/slog" - "strings" - "testing" - - "go.opentelemetry.io/collector/pdata/pcommon" - - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// Fixture bundles the stubs a Component unit test typically needs: a -// Host and a CreateSettings ready to hand to a factory's CreateX -// method. Construct with New; mutate the exposed fields before use if -// a test needs a different ID or a logger that writes somewhere -// observable. -// -// Distinct name from pipeline.Runtime to avoid confusion in tests -// that use both. -type Fixture struct { - Host *Host - CreateSettings pipeline.CreateSettings -} - -// New returns a Fixture with sensible defaults: a fresh stub Host, a -// discarding logger, an empty pcommon.Resource, and an ID of -// `test/`. Subtest names that contain "/" (the -// standard table-test divider) are flattened to "_" so the result -// satisfies pipeline.NewID's instance-name validation. 
-func New(t *testing.T) *Fixture { - t.Helper() - - telemetry := pipeline.TelemetrySettings{ - Logger: slog.New(slog.NewTextHandler(io.Discard, nil)), - Resource: pcommon.NewResource(), - } - - return &Fixture{ - Host: NewHost(), - CreateSettings: pipeline.CreateSettings{ - ID: pipeline.MustNewID(pipeline.MustNewType("test"), strings.ReplaceAll(t.Name(), "/", "_")), - Telemetry: telemetry, - }, - } -} diff --git a/internal/pipeline/pipelinetest/fixture_test.go b/internal/pipeline/pipelinetest/fixture_test.go deleted file mode 100644 index bc1d504b..00000000 --- a/internal/pipeline/pipelinetest/fixture_test.go +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinetest_test - -import ( - "errors" - "strings" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/componentstatus" - "github.com/tracecoreai/tracecore/internal/pipeline" - "github.com/tracecoreai/tracecore/internal/pipeline/pipelinetest" -) - -func TestNew_DefaultsAreUsable(t *testing.T) { - t.Parallel() - - fx := pipelinetest.New(t) - - require.NotNil(t, fx.Host) - require.NotNil(t, fx.CreateSettings.Telemetry.Logger, "Logger must be non-nil so components can log without nil-checks") - require.Empty(t, fx.Host.StatusEvents()) - require.Empty(t, fx.Host.GetExtensions()) - - require.Equal(t, "test", fx.CreateSettings.ID.Kind().String()) - // Subtest "/" separators are flattened to "_" so the ID name - // satisfies pipeline.NewID validation. 
- require.Equal(t, strings.ReplaceAll(t.Name(), "/", "_"), fx.CreateSettings.ID.Name()) -} - -func TestHost_RecordsStatusEvents(t *testing.T) { - t.Parallel() - - host := pipelinetest.NewHost() - componentstatus.ReportStatus(host, pipeline.StatusEvent{Kind: "starting"}) - componentstatus.ReportStatus(host, pipeline.StatusEvent{Kind: "permanent-error", Err: errors.New("boom")}) - - events := host.StatusEvents() - require.Len(t, events, 2) - require.Equal(t, "starting", events[0].Kind) - require.Equal(t, "permanent-error", events[1].Kind) - require.ErrorContains(t, events[1].Err, "boom") -} - -func TestHost_ExtensionsAreIsolated(t *testing.T) { - t.Parallel() - - host := pipelinetest.NewHost() - require.Empty(t, host.GetExtensions(), "fresh host has no extensions") - - // Mutating the returned map must not leak back into the host's state. - host.GetExtensions()[pipeline.MustNewID(pipeline.MustNewType("evil"), "")] = nil - require.Empty(t, host.GetExtensions(), "GetExtensions returns a copy") -} diff --git a/internal/pipeline/pipelinetest/host.go b/internal/pipeline/pipelinetest/host.go deleted file mode 100644 index 629a4c0d..00000000 --- a/internal/pipeline/pipelinetest/host.go +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinetest - -import ( - "sync" - - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// Host is a pipeline.Host suitable for unit tests. It also implements -// the optional componentstatus.StatusReporter interface so tests can -// assert on the events a Component would have reported via the free -// function `componentstatus.ReportStatus(host, ev)`. -// -// Host is safe for concurrent use; tests that spawn goroutines through -// the Component can call StatusEvents from the main test goroutine -// without external synchronisation. 
-type Host struct { - mu sync.Mutex - extensions map[pipeline.ID]pipeline.Component - events []pipeline.StatusEvent -} - -// NewHost returns a Host with no extensions and no recorded events. -func NewHost() *Host { - return &Host{ - extensions: map[pipeline.ID]pipeline.Component{}, - } -} - -// GetExtensions returns the (initially empty) extension map. Tests can -// pre-populate it via WithExtension before passing the Host to a -// Component. -func (h *Host) GetExtensions() map[pipeline.ID]pipeline.Component { - h.mu.Lock() - defer h.mu.Unlock() - - out := make(map[pipeline.ID]pipeline.Component, len(h.extensions)) - for k, v := range h.extensions { - out[k] = v - } - return out -} - -// ReportComponentStatus implements componentstatus.StatusReporter -// so the free fn `componentstatus.ReportStatus(host, ev)` delegates -// here. Appends the event to the Host's record. -func (h *Host) ReportComponentStatus(event pipeline.StatusEvent) { - h.mu.Lock() - defer h.mu.Unlock() - h.events = append(h.events, event) -} - -// StatusEvents returns a snapshot of the events the Component has -// reported, in order. -func (h *Host) StatusEvents() []pipeline.StatusEvent { - h.mu.Lock() - defer h.mu.Unlock() - - out := make([]pipeline.StatusEvent, len(h.events)) - copy(out, h.events) - return out -} - -// WithExtension registers ext under id so the Component sees it in -// GetExtensions. Intended for the rare M1 test that wants to exercise -// extension lookup; receiver tests typically leave the map empty. -func (h *Host) WithExtension(id pipeline.ID, ext pipeline.Component) *Host { - h.mu.Lock() - defer h.mu.Unlock() - h.extensions[id] = ext - return h -} - -// Compile-time assertion: *Host satisfies pipeline.Host. 
-var _ pipeline.Host = (*Host)(nil) diff --git a/internal/pipeline/runtime.go b/internal/pipeline/runtime.go deleted file mode 100644 index 53cff18d..00000000 --- a/internal/pipeline/runtime.go +++ /dev/null @@ -1,391 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline - -import ( - "context" - "errors" - "fmt" - "log/slog" - "sync" - "time" -) - -// ErrAlreadyStarted is returned by Runtime.Start when called a -// second time without an intervening Shutdown. Use -// `errors.Is(err, pipeline.ErrAlreadyStarted)`. -// See [lifecycle.ErrAlreadyStarted] for the convention. -var ErrAlreadyStarted = errors.New("runtime: Start called twice") - -// Signal identifies the telemetry signal a Pipeline carries. -// Named Signal (not Kind) to disambiguate from id.Kind(), which -// returns the Component's Type ("dcgm", "otlp", ...). -type Signal int - -const ( - // SignalMetrics carries pmetric.Metrics. - SignalMetrics Signal = iota - // SignalTraces carries ptrace.Traces. - SignalTraces - // SignalLogs carries plog.Logs. - SignalLogs -) - -// String returns "metrics" / "traces" / "logs" — the YAML form -// operators write as the pipeline key (e.g. `metrics/primary`). -func (s Signal) String() string { - switch s { - case SignalMetrics: - return "metrics" - case SignalTraces: - return "traces" - case SignalLogs: - return "logs" - default: - return "unknown" - } -} - -// Pipeline groups the Components participating in a single signal -// flow. The caller is responsible for wiring the push consumers -// between stages BEFORE constructing the Pipeline — the Runtime -// treats the lists as already connected and uses them only for -// lifecycle ordering. -// -// Processors MUST be listed in data-flow order; the Runtime starts -// them last-stage first so a consumer is always ready before its -// producer. Receivers and Exporters are concurrent peers — slice -// order does not affect runtime behaviour. -// -// The pipeline's signal is encoded in ID (e.g. 
"metrics/primary"); -// no separate Signal field is needed. -type Pipeline struct { - ID ID - Receivers []Receiver - Processors []Processor - Exporters []Exporter -} - -// settings is unexported so M2+ can add fields (MeterProvider, -// BuildInfo) without breaking call sites — configuration flows -// through the With* Options below. -type settings struct { - // logger receives Runtime lifecycle messages only; Components - // log via their own scoped TelemetrySettings.Logger. - logger *slog.Logger - - // drainBudget clamps to HardDrainCeiling at NewRuntime — operators - // who want longer should fix the exporter, not extend the timeout. - drainBudget time.Duration - - host Host -} - -// Option applies in order — later overrides earlier — so a default -// layer can be composed with a caller-override layer. -type Option func(*settings) - -// WithLogger overrides the lifecycle logger (default: slog.Default()). -func WithLogger(l *slog.Logger) Option { - return func(s *settings) { s.logger = l } -} - -// WithDrainBudget overrides the Phase-2 shutdown deadline (default: -// DefaultDrainBudget, clamped at HardDrainCeiling). -func WithDrainBudget(d time.Duration) Option { - return func(s *settings) { s.drainBudget = d } -} - -// WithHost overrides the Host passed to Component.Start (default: no-op host). -func WithHost(h Host) Option { - return func(s *settings) { s.host = h } -} - -// Shutdown-phase timing constants. -const ( - // ReceiverShutdownTimeout bounds Phase 1 (receivers) per STYLE.md - // §Concurrency: training nodes can't wait — stop ingesting fast - // and let exporters drain in Phase 2. - ReceiverShutdownTimeout = 1 * time.Second - - // HardDrainCeiling is the upper bound on WithDrainBudget. - HardDrainCeiling = 30 * time.Second - - // DefaultDrainBudget is the Phase 2 budget when WithDrainBudget is - // unset. Matches OTel Collector v0.152.0 default exporter timeout. 
- DefaultDrainBudget = 10 * time.Second -) - -// Runtime owns a graph of Components and exposes Start/Shutdown. -// -// Lifecycle is strictly: -// -// NewRuntime(...) → Start(ctx) → ... → Shutdown(ctx) -// -// Start is callable exactly once. Shutdown may be called any number of -// times — only the first does work; subsequent calls return nil. -// Start and Shutdown serialize via lifecycleMu: a Shutdown invoked -// concurrently with an in-progress Start waits for Start to finish so -// it observes the full set of started Components. Component.Start is -// therefore required to respect its ctx — a hung Component will block -// Shutdown indefinitely. -type Runtime struct { - settings settings - pipelines []Pipeline - - // lifecycleMu serializes Start and Shutdown. All state below is - // guarded by it. - lifecycleMu sync.Mutex - started bool - // shutdown closes the Start-after-Shutdown race: if Shutdown wins - // the lock first, a subsequent Start refuses rather than launching - // Components that nobody will tear down. - shutdown bool - startedReceivers []Component - startedNonReceivers []Component -} - -// NewRuntime returns a Runtime ready to Start. NewRuntime does not -// invoke any factories; pipelines must arrive pre-wired (each -// Receiver / Processor / Exporter constructed with its `next` -// consumer already set). 
Configure with the With* Option constructors: -// -// rt := pipeline.NewRuntime(pipelines, -// pipeline.WithLogger(logger), -// pipeline.WithDrainBudget(5*time.Second), -// ) -func NewRuntime(pipelines []Pipeline, opts ...Option) *Runtime { - var s settings - for _, opt := range opts { - opt(&s) - } - if s.logger == nil { - s.logger = slog.Default() - } - if s.drainBudget <= 0 { - s.drainBudget = DefaultDrainBudget - } - if s.drainBudget > HardDrainCeiling { - s.logger.Warn("drain budget exceeds hard ceiling; clamping", - "requested", s.drainBudget, "ceiling", HardDrainCeiling) - s.drainBudget = HardDrainCeiling - } - if s.host == nil { - s.host = noopHost{} - } - return &Runtime{ - settings: s, - pipelines: pipelines, - } -} - -// Start brings up every Component in the graph in reverse data-flow -// order (exporters first so they accept data, then processors, then -// receivers). If any Component's Start fails, Start returns the error -// and the caller should call Shutdown to tear down the partially-built -// graph — Shutdown only touches Components that successfully Started. -// -// Empty-pipelines case: Start logs `"no pipelines configured"` once -// and returns nil. Validates the binary can boot before any receivers -// ship. -func (r *Runtime) Start(ctx context.Context) error { - r.lifecycleMu.Lock() - defer r.lifecycleMu.Unlock() - - if r.shutdown { - // Shutdown beat us to lifecycleMu (concurrent boot + signal). - // Returning nil makes the lifecycle a clean no-op: nothing - // Started, so Shutdown also did nothing, and the operator - // sees a clean exit instead of a race-dependent error. - return nil - } - if r.started { - return ErrAlreadyStarted - } - r.started = true - - if len(r.pipelines) == 0 { - // Logged at WARN so operators running with default log filters - // don't mistake a typo'd-config boot for a successful run. - r.settings.logger.Warn("no pipelines configured") - return nil - } - - for _, p := range r.pipelines { - // Exporters first. 
- for _, c := range p.Exporters { - if err := r.startOne(ctx, p.ID, "exporter", c, false); err != nil { - return err - } - } - // Processors in reverse data-flow order (last-stage first). - for i := len(p.Processors) - 1; i >= 0; i-- { - if err := r.startOne(ctx, p.ID, "processor", p.Processors[i], false); err != nil { - return err - } - } - // Receivers last so they only fire data into a ready graph. - for _, c := range p.Receivers { - if err := r.startOne(ctx, p.ID, "receiver", c, true); err != nil { - return err - } - } - } - return nil -} - -// startOne assumes the caller holds r.lifecycleMu. -func (r *Runtime) startOne(ctx context.Context, pid ID, kind string, c Component, isReceiver bool) error { - if c == nil { - return fmt.Errorf("start %s in pipeline %s: component is nil", kind, pid) - } - if err := c.Start(ctx, r.settings.host); err != nil { - return fmt.Errorf("start %s in pipeline %s: %w", kind, pid, err) - } - if isReceiver { - r.startedReceivers = append(r.startedReceivers, c) - } else { - r.startedNonReceivers = append(r.startedNonReceivers, c) - } - return nil -} - -// Shutdown performs two-phase teardown: -// -// - Phase 1 (≤ReceiverShutdownTimeout, default 1s): every Receiver -// Shutdown runs in parallel — receivers have no inter-receiver -// ordering dependency. Receivers that exceed the deadline are -// abandoned and logged (PRINCIPLES §1). -// - Phase 2 (≤Settings.DrainBudget): processors and exporters -// Shutdown **serially in LIFO of start order** so each stage -// drains into a still-running downstream stage. Mirrors OTel -// Collector v0.152.0 service.Shutdown behaviour. The phase budget -// bounds the total of all Shutdown calls; on deadline elapse, -// remaining Components are not invoked. -// -// Shutdown is safe to call on a Runtime whose Start failed partway — -// only the Components that successfully Started will be torn down. 
-func (r *Runtime) Shutdown(ctx context.Context) error { - // Wait for any in-progress Start to complete so we observe the - // full set of started Components. A misbehaving Component that - // never returns from Start will wedge Shutdown — Component.Start - // is required to respect ctx; this is the contract. - r.lifecycleMu.Lock() - r.shutdown = true - receivers := r.startedReceivers - nonReceivers := r.startedNonReceivers - r.startedReceivers = nil - r.startedNonReceivers = nil - r.lifecycleMu.Unlock() - - phase1Ctx, cancel1 := context.WithTimeout(ctx, ReceiverShutdownTimeout) - defer cancel1() - phase1Errs := shutdownGroup(phase1Ctx, receivers) - if errors.Is(phase1Ctx.Err(), context.DeadlineExceeded) { - r.settings.logger.Warn("shutdown phase 1 (receivers) deadline elapsed; abandoning in-flight calls", - "deadline", ReceiverShutdownTimeout) - } - - phase2Ctx, cancel2 := context.WithTimeout(ctx, r.settings.drainBudget) - defer cancel2() - phase2Errs := shutdownSerial(phase2Ctx, nonReceivers) - if errors.Is(phase2Ctx.Err(), context.DeadlineExceeded) { - r.settings.logger.Error("shutdown phase 2 (drain) deadline elapsed; abandoning queued data", - "deadline", r.settings.drainBudget) - } - - return errors.Join(append(phase1Errs, phase2Errs...)...) -} - -// shutdownGroup invokes Shutdown on each Component in parallel, -// honouring ctx.Done as a phase budget. If the budget elapses before -// every Shutdown returns, the in-flight calls keep running (Go cannot -// abort a goroutine) and the function returns with a budget-elapsed -// sentinel — unless work actually completed in the same instant the -// deadline fired, in which case the sentinel is suppressed. -// -// Panics raised by Component.Shutdown are recovered and returned as -// errors — shutdown should never crash the process. 
-func shutdownGroup(ctx context.Context, cs []Component) []error { - if len(cs) == 0 { - return nil - } - - var ( - mu sync.Mutex - errs []error - ) - - done := make(chan struct{}) - var wg sync.WaitGroup - for _, c := range cs { - wg.Add(1) - go func(c Component) { - defer wg.Done() - if err := safeShutdown(ctx, c); err != nil { - mu.Lock() - errs = append(errs, err) - mu.Unlock() - } - }(c) - } - go func() { wg.Wait(); close(done) }() - - select { - case <-done: - case <-ctx.Done(): - // Race: work may have finished the same instant the deadline - // fired. Treat the closed `done` as authoritative. - select { - case <-done: - default: - mu.Lock() - errs = append(errs, fmt.Errorf("shutdown budget elapsed: %w", ctx.Err())) - mu.Unlock() - } - } - - mu.Lock() - defer mu.Unlock() - return errs -} - -// shutdownSerial invokes Shutdown on each Component in LIFO order -// (reverse of startedOrder), honouring ctx as the phase budget. On -// budget elapse, the remaining Components are not invoked — only those -// that already started Shutdown keep running in the background. -// -// Panics raised by Component.Shutdown are recovered and accumulated. -func shutdownSerial(ctx context.Context, startedOrder []Component) []error { - var errs []error - for i := len(startedOrder) - 1; i >= 0; i-- { - if ctx.Err() != nil { - errs = append(errs, fmt.Errorf("shutdown budget elapsed: %w", ctx.Err())) - return errs - } - if err := safeShutdown(ctx, startedOrder[i]); err != nil { - errs = append(errs, err) - } - } - return errs -} - -// safeShutdown wraps Component.Shutdown so a panic inside teardown -// becomes an error rather than crashing the process. Shutdown is the -// last code to run before exit; the operator's worst-case is "exporter -// failed to flush," not "collector segfaulted on the way out." 
-func safeShutdown(ctx context.Context, c Component) (err error) { - defer func() { - if r := recover(); r != nil { - err = fmt.Errorf("shutdown panic: %v", r) - } - }() - return c.Shutdown(ctx) -} - -// noopHost is the default Host substituted when Settings.Host is nil. -// Component unit tests should use pipelinetest.NewHost() instead so -// they can assert on StatusEvent reporting. -type noopHost struct{} - -func (noopHost) GetExtensions() map[ID]Component { return map[ID]Component{} } diff --git a/internal/pipeline/runtime_test.go b/internal/pipeline/runtime_test.go deleted file mode 100644 index 3f98c63a..00000000 --- a/internal/pipeline/runtime_test.go +++ /dev/null @@ -1,469 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "bytes" - "context" - "errors" - "log/slog" - "sync" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/componentstatus" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// stubComponent records lifecycle calls atomically so race-detector -// runs stay clean. Injection knobs: startErr, shutdownErr, -// shutdownDelay (the goroutine inside Shutdown sleeps before -// returning). 
-type stubComponent struct { - label string - - startErr error - shutdownErr error - shutdownDelay time.Duration - - startCount atomic.Int32 - shutdownCount atomic.Int32 - - events *eventLog -} - -type eventLog struct { - mu sync.Mutex - items []event -} - -type event struct { - label string - kind string // "start" or "shutdown" -} - -func (l *eventLog) record(label, kind string) { - l.mu.Lock() - l.items = append(l.items, event{label: label, kind: kind}) - l.mu.Unlock() -} - -func (l *eventLog) snapshot() []event { - l.mu.Lock() - defer l.mu.Unlock() - out := make([]event, len(l.items)) - copy(out, l.items) - return out -} - -func (c *stubComponent) Start(_ context.Context, _ pipeline.Host) error { - c.startCount.Add(1) - c.events.record(c.label, "start") - return c.startErr -} - -func (c *stubComponent) Shutdown(ctx context.Context) error { - c.shutdownCount.Add(1) - c.events.record(c.label, "shutdown") - if c.shutdownDelay > 0 { - select { - case <-time.After(c.shutdownDelay): - case <-ctx.Done(): - return ctx.Err() - } - } - return c.shutdownErr -} - -func newComponent(label string, log *eventLog) *stubComponent { - return &stubComponent{label: label, events: log} -} - -func TestRuntime_EmptyPipelines_LogsAndReturnsNil(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelInfo})) - - rt := pipeline.NewRuntime(nil, pipeline.WithLogger(logger)) - - require.NoError(t, rt.Start(t.Context())) - require.Contains(t, buf.String(), "no pipelines configured") - require.NoError(t, rt.Shutdown(t.Context())) -} - -func TestRuntime_StartOrder_ExportersFirstReceiversLast(t *testing.T) { - t.Parallel() - - log := &eventLog{} - recv := newComponent("recv", log) - proc1 := newComponent("proc1", log) - proc2 := newComponent("proc2", log) - exp := newComponent("exp", log) - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), "primary"), - 
Receivers: []pipeline.Receiver{recv}, - Processors: []pipeline.Processor{proc1, proc2}, // data-flow order - Exporters: []pipeline.Exporter{exp}, - }}, pipeline.WithLogger(discardLogger())) - - require.NoError(t, rt.Start(t.Context())) - - events := log.snapshot() - starts := []string{} - for _, e := range events { - if e.kind == "start" { - starts = append(starts, e.label) - } - } - // Expect: exp, proc2 (last-stage processor first), proc1, recv. - require.Equal(t, []string{"exp", "proc2", "proc1", "recv"}, starts) - - require.NoError(t, rt.Shutdown(t.Context())) -} - -func TestRuntime_ShutdownOrder_ReceiversFirstThenLIFO(t *testing.T) { - t.Parallel() - - log := &eventLog{} - recv := newComponent("recv", log) - proc := newComponent("proc", log) - exp := newComponent("exp", log) - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Receivers: []pipeline.Receiver{recv}, - Processors: []pipeline.Processor{proc}, - Exporters: []pipeline.Exporter{exp}, - }}, pipeline.WithLogger(discardLogger())) - - require.NoError(t, rt.Start(t.Context())) - require.NoError(t, rt.Shutdown(t.Context())) - - shutdowns := []string{} - for _, e := range log.snapshot() { - if e.kind == "shutdown" { - shutdowns = append(shutdowns, e.label) - } - } - // Phase 1: receivers (recv). Phase 2: non-receivers in LIFO of - // start order. Start order was exp, proc, recv → non-receivers - // were exp then proc → LIFO is proc then exp. 
- require.Equal(t, []string{"recv", "proc", "exp"}, shutdowns) -} - -func TestRuntime_StartFailure_ShutdownUnwindsOnlyStarted(t *testing.T) { - t.Parallel() - - log := &eventLog{} - exp := newComponent("exp", log) - procFail := newComponent("proc-fail", log) - procFail.startErr = errors.New("processor refused to start") - recv := newComponent("recv", log) // should never start - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Receivers: []pipeline.Receiver{recv}, - Processors: []pipeline.Processor{procFail}, - Exporters: []pipeline.Exporter{exp}, - }}, pipeline.WithLogger(discardLogger())) - - err := rt.Start(t.Context()) - require.Error(t, err) - require.ErrorIs(t, err, procFail.startErr) - require.Contains(t, err.Error(), "start processor in pipeline metrics") - - require.Zero(t, recv.startCount.Load(), "receiver must not start after upstream failure") - require.NoError(t, rt.Shutdown(t.Context())) - require.Equal(t, int32(1), exp.shutdownCount.Load(), "exporter that started must be shut down") - require.Zero(t, recv.shutdownCount.Load(), "receiver never started; never shut down") - require.Zero(t, procFail.shutdownCount.Load(), "processor whose Start failed never shuts down") -} - -func TestRuntime_StartTwice_ReturnsError(t *testing.T) { - t.Parallel() - - rt := pipeline.NewRuntime(nil, pipeline.WithLogger(discardLogger())) - require.NoError(t, rt.Start(t.Context())) - err := rt.Start(t.Context()) - require.ErrorIs(t, err, pipeline.ErrAlreadyStarted) - require.Equal(t, "runtime: Start called twice", err.Error()) -} - -func TestRuntime_DrainBudgetClamp(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelWarn})) - - pipeline.NewRuntime(nil, pipeline.WithLogger(logger), pipeline.WithDrainBudget(5*time.Minute)) - require.Contains(t, buf.String(), "drain budget exceeds hard ceiling") -} - -func 
TestRuntime_PhaseTwoBudgetElapsed_ReturnsError(t *testing.T) { - t.Parallel() - - log := &eventLog{} - slowExp := newComponent("slow-exp", log) - slowExp.shutdownDelay = 500 * time.Millisecond - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Exporters: []pipeline.Exporter{slowExp}, - }}, pipeline.WithLogger(discardLogger()), pipeline.WithDrainBudget(20*time.Millisecond)) - - require.NoError(t, rt.Start(t.Context())) - err := rt.Shutdown(t.Context()) - require.Error(t, err) - require.ErrorIs(t, err, context.DeadlineExceeded) -} - -func TestRuntime_PhaseTwoSerialLIFO_SkipsRemainingOnBudgetElapse(t *testing.T) { - t.Parallel() - - log := &eventLog{} - // One pipeline with two exporters; the first to shutdown (LIFO of - // start order = exp2 first, then exp1) wedges and consumes the - // entire drain budget. exp1 must NOT have Shutdown invoked. - exp1 := newComponent("exp1", log) - exp2 := newComponent("exp2", log) - exp2.shutdownDelay = 500 * time.Millisecond - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Exporters: []pipeline.Exporter{exp1, exp2}, - }}, pipeline.WithLogger(discardLogger()), pipeline.WithDrainBudget(30*time.Millisecond)) - - require.NoError(t, rt.Start(t.Context())) - err := rt.Shutdown(t.Context()) - require.Error(t, err) - require.ErrorIs(t, err, context.DeadlineExceeded) - - require.Equal(t, int32(1), exp2.shutdownCount.Load(), "exp2 (LIFO first) is invoked and wedges") - require.Zero(t, exp1.shutdownCount.Load(), "exp1 is skipped because budget already elapsed") -} - -// TestRuntime_ConcurrentStartShutdown_NoLostComponents covers the -// race surfaced by the parallel audit: Start releasing the mutex -// between started=true and the per-startOne append would let a -// concurrent Shutdown clear the started* slices mid-Start, leaving -// late-appended Components running but untracked for shutdown. 
-// Serializing Start and Shutdown on lifecycleMu eliminates the -// race — Shutdown waits for Start to finish, then observes the -// complete set. -func TestRuntime_ConcurrentStartShutdown_NoLostComponents(t *testing.T) { - t.Parallel() - - log := &eventLog{} - const n = 50 - components := make([]*stubComponent, n) - exporters := make([]pipeline.Exporter, n) - for i := range components { - components[i] = newComponent("c", log) - exporters[i] = components[i] - } - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Exporters: exporters, - }}, pipeline.WithLogger(discardLogger())) - - startDone := make(chan error, 1) - shutdownDone := make(chan error, 1) - go func() { startDone <- rt.Start(t.Context()) }() - go func() { shutdownDone <- rt.Shutdown(t.Context()) }() - - require.NoError(t, <-startDone) - require.NoError(t, <-shutdownDone) - - // Every Component that Started must have Shutdown exactly once. - // (Some may not have started if Shutdown ran first — they're still - // allowed to be unshutdown since Start never happened for them.) - for _, c := range components { - if c.startCount.Load() == 0 { - require.Zero(t, c.shutdownCount.Load(), "never-started component should never be Shutdown") - continue - } - require.Equal(t, int32(1), c.shutdownCount.Load(), "every started component must be Shutdown once") - } -} - -// TestRuntime_PanickingShutdown_RecoveredAsError: if a Component's -// Shutdown panics, the runtime must not crash the process — it -// recovers and surfaces the panic as an error. Tests both shutdown -// paths (parallel for receivers, serial for non-receivers). 
-func TestRuntime_PanickingShutdown_RecoveredAsError(t *testing.T) { - t.Parallel() - - panicRcv := &panickingComponent{} - panicExp := &panickingComponent{} - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Receivers: []pipeline.Receiver{panicRcv}, - Exporters: []pipeline.Exporter{panicExp}, - }}, pipeline.WithLogger(discardLogger())) - - require.NoError(t, rt.Start(t.Context())) - err := rt.Shutdown(t.Context()) - require.Error(t, err) - require.Contains(t, err.Error(), "shutdown panic") - require.Contains(t, err.Error(), "boom") -} - -type panickingComponent struct{} - -func (panickingComponent) Start(context.Context, pipeline.Host) error { return nil } -func (panickingComponent) Shutdown(context.Context) error { panic("boom") } - -// TestRuntime_NilComponentInPipeline_ReturnsErrorNotPanic: nil entry -// in a Pipeline's slices is operator/builder error. Return an error -// at startOne rather than nil-derefing on c.Start. -func TestRuntime_NilComponentInPipeline_ReturnsErrorNotPanic(t *testing.T) { - t.Parallel() - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Exporters: []pipeline.Exporter{nil}, // nil entry - }}, pipeline.WithLogger(discardLogger())) - - err := rt.Start(t.Context()) - require.Error(t, err) - require.Contains(t, err.Error(), "component is nil") -} - -// TestRuntime_ShutdownTwice_IsIdempotent pins the documented contract: -// "Shutdown is safe to call on a Runtime [...]". A second Shutdown -// must be a no-op, not call Components' Shutdown a second time. 
-func TestRuntime_ShutdownTwice_IsIdempotent(t *testing.T) { - t.Parallel() - - log := &eventLog{} - exp := newComponent("exp", log) - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Exporters: []pipeline.Exporter{exp}, - }}, pipeline.WithLogger(discardLogger())) - - require.NoError(t, rt.Start(t.Context())) - require.NoError(t, rt.Shutdown(t.Context())) - require.NoError(t, rt.Shutdown(t.Context()), "second Shutdown must be a no-op") - - require.Equal(t, int32(1), exp.shutdownCount.Load(), - "Component Shutdown invoked exactly once across two Runtime.Shutdown calls") -} - -// TestRuntime_ShutdownWithoutStart_IsNoOp: Shutdown before Start is -// valid (e.g. main calls Shutdown after a Start failure that never -// actually started anything). No panics, no nil errors. -func TestRuntime_ShutdownWithoutStart_IsNoOp(t *testing.T) { - t.Parallel() - - rt := pipeline.NewRuntime(nil, pipeline.WithLogger(discardLogger())) - require.NoError(t, rt.Shutdown(t.Context())) -} - -// TestRuntime_TinyDrainBudget pins behavior when DrainBudget is so small -// (1ns) that the deadline fires before the first component's Shutdown -// is even attempted. shutdownSerial detects via ctx.Err() and returns -// the budget-elapsed sentinel — no Component.Shutdown call. 
-func TestRuntime_TinyDrainBudget_SkipsAllPhase2(t *testing.T) { - t.Parallel() - - log := &eventLog{} - exp := newComponent("exp", log) - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("metrics"), ""), - Exporters: []pipeline.Exporter{exp}, - }}, pipeline.WithLogger(discardLogger()), pipeline.WithDrainBudget(1*time.Nanosecond)) - - require.NoError(t, rt.Start(t.Context())) - err := rt.Shutdown(t.Context()) - require.Error(t, err) - require.ErrorIs(t, err, context.DeadlineExceeded) - require.Zero(t, exp.shutdownCount.Load(), - "phase-2 ctx already expired; serial loop bails before invoking exporter") -} - -// TestRuntime_VariadicOptions_ApplyInOrder pins the Option-list contract: -// each Option mutates a fresh settings struct in the order given, so a -// later Option overrides an earlier one. M2+ relies on this to layer -// build-default + caller-override patterns without proliferating -// constructors. -func TestRuntime_VariadicOptions_ApplyInOrder(t *testing.T) { - t.Parallel() - - loser := &bytes.Buffer{} - winner := &bytes.Buffer{} - - rt := pipeline.NewRuntime(nil, - pipeline.WithLogger(slog.New(slog.NewTextHandler(loser, nil))), - pipeline.WithLogger(slog.New(slog.NewTextHandler(winner, nil))), - ) - require.NoError(t, rt.Start(t.Context())) - require.NoError(t, rt.Shutdown(t.Context())) - - require.Empty(t, loser.String(), "earlier WithLogger must be overridden") - require.Contains(t, winner.String(), "no pipelines configured", - "last WithLogger wins") -} - -func TestRuntime_NilLogger_FallsBackToSlogDefault(t *testing.T) { - t.Parallel() - - // No With* options must not panic — NewRuntime substitutes - // slog.Default(). Empty-pipeline case exercises the substituted - // logger via the "no pipelines configured" Info call. 
- rt := pipeline.NewRuntime(nil) - require.NotPanics(t, func() { - _ = rt.Start(t.Context()) - _ = rt.Shutdown(t.Context()) - }) -} - -func TestRuntime_NoopHost_ReturnsEmptyExtensions(t *testing.T) { - t.Parallel() - - var observedHost pipeline.Host - probe := &hostProbe{onStart: func(h pipeline.Host) { observedHost = h }} - - rt := pipeline.NewRuntime([]pipeline.Pipeline{{ - ID: pipeline.MustNewID(pipeline.MustNewType("test"), ""), - Receivers: []pipeline.Receiver{probe}, - }}, pipeline.WithLogger(discardLogger())) - require.NoError(t, rt.Start(t.Context())) - require.NoError(t, rt.Shutdown(t.Context())) - - require.NotNil(t, observedHost) - require.Empty(t, observedHost.GetExtensions()) - // componentstatus.ReportStatus must not panic when the host - // does not implement StatusReporter (the runtime's noop host). - require.NotPanics(t, func() { - componentstatus.ReportStatus(observedHost, pipeline.StatusEvent{Kind: "anything"}) - }) -} - -// hostProbe is a stubComponent variant that captures the Host it sees -// at Start so tests can assert on it. 
-type hostProbe struct { - onStart func(pipeline.Host) -} - -func (p *hostProbe) Start(_ context.Context, host pipeline.Host) error { - if p.onStart != nil { - p.onStart(host) - } - return nil -} - -func (*hostProbe) Shutdown(_ context.Context) error { return nil } - -func discardLogger() *slog.Logger { - return slog.New(slog.NewTextHandler(&bytes.Buffer{}, &slog.HandlerOptions{Level: slog.LevelError})) -} diff --git a/internal/pipeline/saferun.go b/internal/pipeline/saferun.go deleted file mode 100644 index 5ad36da6..00000000 --- a/internal/pipeline/saferun.go +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline - -import ( - "context" - "fmt" - "log/slog" - - "go.opentelemetry.io/collector/pdata/plog" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.opentelemetry.io/collector/pdata/ptrace" - - "github.com/tracecoreai/tracecore/internal/consumer" -) - -// capable constrains C in safeBase/firstDataBase so Capabilities -// can be a transparent pass-through. -type capable interface { - Capabilities() consumer.Capabilities -} - -type safeBase[T any, C capable] struct { - id string - op string - logger *slog.Logger - next C - invoke func(c C, ctx context.Context, payload T) error -} - -// Capabilities pass-through preserves the fan-out cloning decision -// that was made on the wrapped consumer. -func (s *safeBase[T, C]) Capabilities() consumer.Capabilities { return s.next.Capabilities() } - -// safeCall stays usable after a panic — the wrapper is not poisoned -// by a single bad push. -func (s *safeBase[T, C]) safeCall(ctx context.Context, payload T) (err error) { - defer func() { - if r := recover(); r != nil { - s.logger.Error("component panic", "component", s.id, "op", s.op, "panic", r) - err = fmt.Errorf("component %s: panic in %s: %v", s.id, s.op, r) - } - }() - return s.invoke(s.next, ctx, payload) -} - -// WrapSafeMetrics wraps next so a panic inside ConsumeMetrics -// becomes a logged error instead of a process crash. 
The wrap is -// inserted at the pipeline-assembly seam so every processor and -// exporter inherits it without per-component code. -func WrapSafeMetrics(componentID string, logger *slog.Logger, next consumer.Metrics) consumer.Metrics { - s := &safeMetrics{} - s.id, s.op, s.logger, s.next = componentID, "ConsumeMetrics", logger, next - s.invoke = func(c consumer.Metrics, ctx context.Context, d pmetric.Metrics) error { - return c.ConsumeMetrics(ctx, d) - } - return s -} - -// WrapSafeTraces is the ptrace.Traces counterpart of WrapSafeMetrics. -func WrapSafeTraces(componentID string, logger *slog.Logger, next consumer.Traces) consumer.Traces { - s := &safeTraces{} - s.id, s.op, s.logger, s.next = componentID, "ConsumeTraces", logger, next - s.invoke = func(c consumer.Traces, ctx context.Context, d ptrace.Traces) error { - return c.ConsumeTraces(ctx, d) - } - return s -} - -// WrapSafeLogs is the plog.Logs counterpart of WrapSafeMetrics. -func WrapSafeLogs(componentID string, logger *slog.Logger, next consumer.Logs) consumer.Logs { - s := &safeLogs{} - s.id, s.op, s.logger, s.next = componentID, "ConsumeLogs", logger, next - s.invoke = func(c consumer.Logs, ctx context.Context, d plog.Logs) error { - return c.ConsumeLogs(ctx, d) - } - return s -} - -type safeMetrics struct { - safeBase[pmetric.Metrics, consumer.Metrics] -} - -func (s *safeMetrics) ConsumeMetrics(ctx context.Context, md pmetric.Metrics) error { - return s.safeCall(ctx, md) -} - -type safeTraces struct { - safeBase[ptrace.Traces, consumer.Traces] -} - -func (s *safeTraces) ConsumeTraces(ctx context.Context, td ptrace.Traces) error { - return s.safeCall(ctx, td) -} - -type safeLogs struct { - safeBase[plog.Logs, consumer.Logs] -} - -func (s *safeLogs) ConsumeLogs(ctx context.Context, ld plog.Logs) error { - return s.safeCall(ctx, ld) -} diff --git a/internal/pipeline/saferun_test.go b/internal/pipeline/saferun_test.go deleted file mode 100644 index 5021bca2..00000000 --- 
a/internal/pipeline/saferun_test.go +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "bytes" - "context" - "log/slog" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/plog" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.opentelemetry.io/collector/pdata/ptrace" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// TestWrapSafe_PanicsRecovered_PerSignal pins Phase 10's contract: a -// panicking downstream consumer must not crash the process; the wrapper -// recovers, logs with the component ID + "panic" string, surfaces an -// error to the caller, and remains usable for subsequent calls. -func TestWrapSafe_PanicsRecovered_PerSignal(t *testing.T) { - t.Parallel() - - t.Run("metrics", func(t *testing.T) { - t.Parallel() - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelError})) - safe := pipeline.WrapSafeMetrics("clockreceiver/primary", logger, - &panickingMetricsConsumer{}) - - md := pmetric.NewMetrics() - err := safe.ConsumeMetrics(t.Context(), md) - require.Error(t, err) - require.ErrorContains(t, err, "clockreceiver/primary") - require.ErrorContains(t, err, "panic") - require.Contains(t, buf.String(), "clockreceiver/primary") - require.Contains(t, buf.String(), "panic") - - // Pipeline continues — second call also recovers cleanly. 
- err = safe.ConsumeMetrics(t.Context(), md) - require.Error(t, err) - }) - - t.Run("traces", func(t *testing.T) { - t.Parallel() - logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) - safe := pipeline.WrapSafeTraces("otlp/primary", logger, &panickingTracesConsumer{}) - err := safe.ConsumeTraces(t.Context(), ptrace.NewTraces()) - require.Error(t, err) - require.ErrorContains(t, err, "otlp/primary") - require.ErrorContains(t, err, "panic") - }) - - t.Run("logs", func(t *testing.T) { - t.Parallel() - logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) - safe := pipeline.WrapSafeLogs("fluent/main", logger, &panickingLogsConsumer{}) - err := safe.ConsumeLogs(t.Context(), plog.NewLogs()) - require.Error(t, err) - require.ErrorContains(t, err, "fluent/main") - require.ErrorContains(t, err, "panic") - }) -} - -// TestWrapSafe_HappyPath_ForwardsToNext: when downstream doesn't panic, -// the wrapper is a transparent pass-through (no error, no log line). -func TestWrapSafe_HappyPath_ForwardsToNext(t *testing.T) { - t.Parallel() - - buf := &bytes.Buffer{} - logger := slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelDebug})) - called := false - stub := &fnMetricsConsumer{fn: func(context.Context, pmetric.Metrics) error { - called = true - return nil - }} - safe := pipeline.WrapSafeMetrics("test/x", logger, stub) - require.NoError(t, safe.ConsumeMetrics(t.Context(), pmetric.NewMetrics())) - require.True(t, called) - require.NotContains(t, buf.String(), "panic") -} - -// TestWrapSafe_CapabilitiesPassThrough: the wrapper must forward -// Capabilities() to the wrapped consumer so the fan-out's cloning -// decisions stay correct. 
-func TestWrapSafe_CapabilitiesPassThrough(t *testing.T) { - t.Parallel() - stub := &fnMetricsConsumer{caps: consumer.Capabilities{MutatesData: true}} - safe := pipeline.WrapSafeMetrics("x", slog.Default(), stub) - require.True(t, safe.Capabilities().MutatesData) -} - -// Panicking stubs used by the panic-recovery tests above. - -type panickingMetricsConsumer struct{} - -func (panickingMetricsConsumer) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } -func (panickingMetricsConsumer) ConsumeMetrics(context.Context, pmetric.Metrics) error { - panic("boom-metrics") -} - -type panickingTracesConsumer struct{} - -func (panickingTracesConsumer) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } -func (panickingTracesConsumer) ConsumeTraces(context.Context, ptrace.Traces) error { - panic("boom-traces") -} - -type panickingLogsConsumer struct{} - -func (panickingLogsConsumer) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } -func (panickingLogsConsumer) ConsumeLogs(context.Context, plog.Logs) error { - panic("boom-logs") -} - -type fnMetricsConsumer struct { - caps consumer.Capabilities - fn func(context.Context, pmetric.Metrics) error -} - -func (f *fnMetricsConsumer) Capabilities() consumer.Capabilities { return f.caps } -func (f *fnMetricsConsumer) ConsumeMetrics(ctx context.Context, md pmetric.Metrics) error { - if f.fn == nil { - return nil - } - return f.fn(ctx, md) -} diff --git a/internal/pipeline/testmain_test.go b/internal/pipeline/testmain_test.go deleted file mode 100644 index 4cfbf801..00000000 --- a/internal/pipeline/testmain_test.go +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipeline_test - -import ( - "testing" - - "go.uber.org/goleak" -) - -// TestMain runs every test in this package under goleak's -// VerifyTestMain so any goroutine still alive after a test returns -// fails the run. 
The pipeline runtime spawns receiver/processor -// goroutines on Start; a Component that ignores ctx and leaks its -// goroutine past Shutdown would otherwise go unnoticed in `go test -// -race` runs. -func TestMain(m *testing.M) { - goleak.VerifyTestMain(m) -} diff --git a/internal/pipelinebuilder/builder.go b/internal/pipelinebuilder/builder.go deleted file mode 100644 index b59c8c50..00000000 --- a/internal/pipelinebuilder/builder.go +++ /dev/null @@ -1,565 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package pipelinebuilder turns a loaded config.Config + a -// pipeline.Factories registry into the runtime's []pipeline.Pipeline. -// Lives separately from internal/pipeline to keep that package free -// of any config-shape dependency (config already imports pipeline, -// so the inverse would cycle). -package pipelinebuilder - -import ( - "bytes" - "context" - "errors" - "fmt" - "log/slog" - "os" - "sort" - "strings" - - "go.opentelemetry.io/collector/pdata/pcommon" - "go.opentelemetry.io/otel/metric" - "go.opentelemetry.io/otel/metric/noop" - "gopkg.in/yaml.v3" - - "github.com/tracecoreai/tracecore/internal/config" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// BuildOption configures BuildPipelines without breaking the -// signature. Useful when the caller has a MeterProvider (M2) or -// BuildInfo (M2) to inject; tests + tools that don't care omit -// the options and get the noop defaults. -type BuildOption func(*buildOptions) - -type buildOptions struct { - meterProvider metric.MeterProvider - buildInfo pipeline.BuildInfo - resource pcommon.Resource -} - -func newBuildOptions(opts []BuildOption) buildOptions { - o := buildOptions{meterProvider: noop.NewMeterProvider()} - for _, opt := range opts { - opt(&o) - } - if o.meterProvider == nil { - o.meterProvider = noop.NewMeterProvider() - } - // Build the default Resource lazily once per BuildPipelines call. 
- // Receivers reading set.Telemetry.Resource get OTel-semconv - // canonical attributes (host.name, service.name, service.version, - // service.instance.id) rather than a bare empty Resource. - o.resource = newDefaultResource(o.buildInfo) - return o -} - -// newDefaultResource constructs the pcommon.Resource the runtime -// stamps on every component's TelemetrySettings. Honest defaults: -// host.name from os.Hostname(); service.name/version from BuildInfo; -// service.instance.id falls back to host.name when nothing better -// exists. Missing values stay absent rather than getting placeholder -// strings that operators would mistake for real data. -func newDefaultResource(bi pipeline.BuildInfo) pcommon.Resource { - res := pcommon.NewResource() - attrs := res.Attributes() - if hn, err := os.Hostname(); err == nil && hn != "" { - attrs.PutStr("host.name", hn) - attrs.PutStr("service.instance.id", hn) - } - if bi.Command != "" { - attrs.PutStr("service.name", bi.Command) - } - if bi.Version != "" { - attrs.PutStr("service.version", bi.Version) - } - return res -} - -// WithMeterProvider sets the metric.MeterProvider every component's -// TelemetrySettings.MeterProvider points at. Defaults to a noop -// provider so receivers never have to nil-check. -func WithMeterProvider(mp metric.MeterProvider) BuildOption { - return func(o *buildOptions) { o.meterProvider = mp } -} - -// WithBuildInfo sets the BuildInfo every component's CreateSettings -// carries. Defaults to the zero-value (operators see empty strings), -// which is fine for unit tests; cmd/tracecore populates it from -// internal/version. -func WithBuildInfo(bi pipeline.BuildInfo) BuildOption { - return func(o *buildOptions) { o.buildInfo = bi } -} - -// BuildPipelines turns a loaded config into the runtime's Pipeline -// list using the supplied factories. 
Per RFC-0004, the assembly is -// bottom-up: exporters constructed first, wrapped in a fan-out -// consumer, then processors in reverse data-flow order, then the -// receivers' next is wrapped with WrapFirstDataMetrics, then receivers -// are constructed. -// -// When factories are empty (no in-tree components registered), -// returns nil pipelines for an empty config and an operator-actionable -// error for any non-empty config — useful so a hypothetical build -// stripped of all factories still produces a meaningful error. -func BuildPipelines( - ctx context.Context, - logger *slog.Logger, - cfg *config.Config, - factories pipeline.Factories, - opts ...BuildOption, -) ([]pipeline.Pipeline, error) { - bopts := newBuildOptions(opts) - - if !hasFactories(factories) { - if hasOperatorIntent(cfg) { - return nil, errors.New("no component factories registered; only an empty config is accepted until at least one factory is wired") - } - return nil, nil - } - - // Sort pipeline keys so iteration is deterministic (Go maps are - // random-order). Reproducible builds + reproducible error messages. 
- keys := make([]string, 0, len(cfg.Service.Pipelines)) - for k := range cfg.Service.Pipelines { - keys = append(keys, k) - } - sort.Strings(keys) - - pipelines := make([]pipeline.Pipeline, 0, len(keys)) - for _, pipelineKey := range keys { - pConfig := cfg.Service.Pipelines[pipelineKey] - - signal, instance, err := config.ParsePipelineID(pipelineKey) - if err != nil { - return nil, fmt.Errorf("pipeline %q: %w", pipelineKey, err) - } - pipelineKind, err := pipeline.NewType(signal.String()) - if err != nil { - return nil, fmt.Errorf("pipeline %q: %w", pipelineKey, err) - } - pID, err := pipeline.NewID(pipelineKind, instance) - if err != nil { - return nil, fmt.Errorf("pipeline %q: %w", pipelineKey, err) - } - - var p pipeline.Pipeline - switch signal { - case pipeline.SignalMetrics: - p, err = buildSignalPipeline(ctx, metricsOps, logger, pID, cfg, pConfig, factories, bopts) - case pipeline.SignalTraces: - p, err = buildSignalPipeline(ctx, tracesOps, logger, pID, cfg, pConfig, factories, bopts) - case pipeline.SignalLogs: - p, err = buildSignalPipeline(ctx, logsOps, logger, pID, cfg, pConfig, factories, bopts) - default: - // Unreachable: ParsePipelineID rejects non-metrics/traces/logs - // keys. Defense in depth so a fourth Signal value added to - // internal/pipeline doesn't silently produce zero pipelines. - return nil, fmt.Errorf("pipeline %q: no assembly path registered for signal %v", pipelineKey, signal) - } - if err != nil { - return nil, err - } - pipelines = append(pipelines, p) - } - return pipelines, nil -} - -func hasFactories(f pipeline.Factories) bool { - return len(f.Receivers)+len(f.Processors)+len(f.Exporters) > 0 -} - -// hasOperatorIntent reports whether cfg expresses any intent to run a -// pipeline: a component definition or a pipeline declaration. A bare -// `service: {}` or empty file returns false. 
-func hasOperatorIntent(cfg *config.Config) bool { - if cfg == nil { - return false - } - if len(cfg.Receivers)+len(cfg.Processors)+len(cfg.Exporters) > 0 { - return true - } - return len(cfg.Service.Pipelines) > 0 -} - -// splitName parses a component reference name from YAML into a -// (Type, instance) pair. "otlp" → (Type{otlp}, ""). "otlp/secondary" -// → (Type{otlp}, "secondary"). Multiple slashes or invalid type -// strings return an error. The error does NOT repeat the input — callers -// already include it via their wrapping prefix (STYLE-errors rule 2). -func splitName(s string) (pipeline.Type, string, error) { - typeStr, instance, _ := strings.Cut(s, "/") - if strings.Contains(instance, "/") { - return pipeline.Type{}, "", errors.New("has multiple slashes; expected `` or `/`") - } - typ, err := pipeline.NewType(typeStr) - if err != nil { - return pipeline.Type{}, "", err //nolint:wrapcheck // splitName's leaf errors are wrapped by callers. - } - return typ, instance, nil -} - -// componentSet bundles the per-component decode + ID + Settings work -// that every signal path repeats. -type componentSet struct { - ID pipeline.ID - Cfg pipeline.Config - Settings pipeline.CreateSettings -} - -// resolveComponent locates a factory, decodes its config, validates, -// and produces the (ID, Cfg, Settings) bundle for the create-call. -// factoryLookup returns nil when no factory matches the type — the -// caller produces the "unknown type" error. 
-func resolveComponent( - pID pipeline.ID, - role string, - name string, - section map[string]yaml.Node, - defaultConfig func() pipeline.Config, - logger *slog.Logger, - bopts buildOptions, -) (componentSet, error) { - typ, instance, err := splitName(name) - if err != nil { - return componentSet{}, fmt.Errorf("pipeline %s: %s %q: %w", pID, role, name, err) - } - node, ok := section[name] - if !ok { - return componentSet{}, fmt.Errorf("pipeline %s: %s %q not declared in top-level %ss", pID, role, name, role) - } - - cfgInst := defaultConfig() - if cfgInst == nil { - // STYLE.md requires factories return a non-nil default config; - // defending here surfaces a buggy factory at build time with a - // named error rather than a nil-deref inside node.Decode. - return componentSet{}, fmt.Errorf("pipeline %s: %s %q: factory returned nil default config", pID, role, name) - } - if err := strictDecodeNode(&node, cfgInst); err != nil { - return componentSet{}, fmt.Errorf("pipeline %s: %s %q decode: %w", pID, role, name, err) - } - if err := cfgInst.Validate(); err != nil { - return componentSet{}, fmt.Errorf("pipeline %s: %s %q: %w", pID, role, name, err) - } - - cID, err := pipeline.NewID(typ, instance) - if err != nil { - return componentSet{}, fmt.Errorf("pipeline %s: %s %q: %w", pID, role, name, err) - } - - return componentSet{ - ID: cID, - Cfg: cfgInst, - Settings: pipeline.CreateSettings{ - ID: cID, - Telemetry: pipeline.TelemetrySettings{ - Logger: logger.With("component", cID.String()), - MeterProvider: bopts.meterProvider, - Resource: bopts.resource, - }, - BuildInfo: bopts.buildInfo, - }, - }, nil -} - -// buildSignalPipeline assembles one Pipeline bottom-up: exporters → -// fanout → processors (reversed so the last processor is built first -// with the fanout as its `next`) → first-data instrumentation → -// receivers. The signal-specific glue is in ops; the algorithm is -// identical across metrics/traces/logs. 
-func buildSignalPipeline[C any]( - ctx context.Context, - ops signalOps[C], - logger *slog.Logger, - pID pipeline.ID, - cfg *config.Config, - pConfig config.Pipeline, - factories pipeline.Factories, - bopts buildOptions, -) (pipeline.Pipeline, error) { - exporters, expConsumers, err := buildExporters(ctx, ops, logger, pID, cfg, pConfig.Exporters, factories.Exporters, bopts) - if err != nil { - return pipeline.Pipeline{}, err - } - - next := ops.newFanout(expConsumers) - - processors, next, err := buildProcessors(ctx, ops, logger, pID, cfg, pConfig.Processors, factories.Processors, next, bopts) - if err != nil { - return pipeline.Pipeline{}, err - } - - // Wrap receivers' next with first-data instrumentation so the - // "pipeline first data" log fires once per pipeline — lets - // operators verify aliveness without external tooling. - next = ops.wrapFirstData(pID.String(), logger, next) - - receivers, err := buildReceivers(ctx, ops, logger, pID, cfg, pConfig.Receivers, factories.Receivers, next, bopts) - if err != nil { - return pipeline.Pipeline{}, err - } - - return pipeline.Pipeline{ - ID: pID, - Receivers: receivers, - Processors: processors, - Exporters: exporters, - }, nil -} - -// buildExporters constructs the leaf consumers for a pipeline. Returns -// the raw Exporter slice (for lifecycle ownership) and a parallel -// slice of consumer.C views (safe-wrapped) ready to be fed into a fanout. 
-func buildExporters[C any]( - ctx context.Context, - ops signalOps[C], - logger *slog.Logger, - pID pipeline.ID, - cfg *config.Config, - names []string, - factories map[pipeline.Type]pipeline.ExporterFactory, - bopts buildOptions, -) ([]pipeline.Exporter, []C, error) { - exporters := make([]pipeline.Exporter, 0, len(names)) - consumers := make([]C, 0, len(names)) - for _, name := range names { - typ, _, err := splitName(name) - if err != nil { - return nil, nil, fmt.Errorf("pipeline %s: exporter %q: %w", pID, name, err) - } - f, ok := factories[typ] - if !ok { - return nil, nil, fmt.Errorf("pipeline %s: unknown exporter type %q", pID, typ) - } - cs, err := resolveComponent(pID, "exporter", name, cfg.Exporters, f.CreateDefaultConfig, logger, bopts) - if err != nil { - return nil, nil, err - } - exp, cons, err := ops.createExporter(f, ctx, cs.Settings, cs.Cfg) - if err != nil { - return nil, nil, fmt.Errorf("pipeline %s: exporter %q: %w", pID, name, err) - } - // Wrap the consumer-view so a panic inside Consume* is - // recovered + logged instead of crashing the binary. The - // raw Exporter (stored for lifecycle) bypasses the wrapper. - cons = ops.wrapSafe(cs.ID.String(), logger, cons) - exporters = append(exporters, exp) - consumers = append(consumers, cons) - } - return exporters, consumers, nil -} - -// buildProcessors constructs processors in REVERSE data-flow order so -// the LAST processor is built first with `next` as its downstream. -// Returns the processor slice (data-flow order) and the new `next` -// — the upstream-most processor's consumer-view — for the receiver -// stage to push into. 
-func buildProcessors[C any]( - ctx context.Context, - ops signalOps[C], - logger *slog.Logger, - pID pipeline.ID, - cfg *config.Config, - names []string, - factories map[pipeline.Type]pipeline.ProcessorFactory, - next C, - bopts buildOptions, -) ([]pipeline.Processor, C, error) { - processors := make([]pipeline.Processor, len(names)) - for i := len(names) - 1; i >= 0; i-- { - name := names[i] - typ, _, err := splitName(name) - if err != nil { - return nil, next, fmt.Errorf("pipeline %s: processor %q: %w", pID, name, err) - } - f, ok := factories[typ] - if !ok { - return nil, next, fmt.Errorf("pipeline %s: unknown processor type %q", pID, typ) - } - cs, err := resolveComponent(pID, "processor", name, cfg.Processors, f.CreateDefaultConfig, logger, bopts) - if err != nil { - return nil, next, err - } - proc, cons, err := ops.createProcessor(f, ctx, cs.Settings, cs.Cfg, next) - if err != nil { - return nil, next, fmt.Errorf("pipeline %s: processor %q: %w", pID, name, err) - } - cons = ops.wrapSafe(cs.ID.String(), logger, cons) - processors[i] = proc - next = cons - } - return processors, next, nil -} - -// buildReceivers constructs the data-flow heads. Each receiver pushes -// into `next` (typically the first-data-wrapped chain leading to the -// fanout). Receivers don't have a downstream consumer-view themselves. 
-func buildReceivers[C any]( - ctx context.Context, - ops signalOps[C], - logger *slog.Logger, - pID pipeline.ID, - cfg *config.Config, - names []string, - factories map[pipeline.Type]pipeline.ReceiverFactory, - next C, - bopts buildOptions, -) ([]pipeline.Receiver, error) { - receivers := make([]pipeline.Receiver, 0, len(names)) - for _, name := range names { - typ, _, err := splitName(name) - if err != nil { - return nil, fmt.Errorf("pipeline %s: receiver %q: %w", pID, name, err) - } - f, ok := factories[typ] - if !ok { - return nil, fmt.Errorf("pipeline %s: unknown receiver type %q", pID, typ) - } - cs, err := resolveComponent(pID, "receiver", name, cfg.Receivers, f.CreateDefaultConfig, logger, bopts) - if err != nil { - return nil, err - } - rcv, err := ops.createReceiver(f, ctx, cs.Settings, cs.Cfg, next) - if err != nil { - return nil, fmt.Errorf("pipeline %s: receiver %q create: %w", pID, name, err) - } - receivers = append(receivers, rcv) - } - return receivers, nil -} - -// ResolvedConfigs holds each component's config after factory -// defaults have been applied and the operator's YAML overlay -// decoded on top. Keys are the operator-facing component names -// (e.g. "dcgm", "otlphttp/primary"). Values are the concrete -// pipeline.Config the factory would receive at Create time. -// -// Used by `tracecore validate --explain` so an operator who writes -// `dcgm: {}` sees the resolved configuration rather than reading -// the README to find the defaults. -type ResolvedConfigs struct { - Receivers map[string]pipeline.Config - Processors map[string]pipeline.Config - Exporters map[string]pipeline.Config -} - -// ResolveConfigs decodes each component referenced by the config's -// pipelines using the factory's CreateDefaultConfig + the operator's -// YAML overlay, the same way BuildPipelines would. 
Unlike -// BuildPipelines it does not call Create*, so a factory whose Create -// is expensive or has live-dependency requirements is safe to call -// at validate time. -// -// Returns an error if a component declared in a pipeline is missing -// from the top-level receivers/processors/exporters section, or if -// the strict-decode rejects an unknown field. -func ResolveConfigs(cfg *config.Config, factories pipeline.Factories) (ResolvedConfigs, error) { - out := ResolvedConfigs{ - Receivers: make(map[string]pipeline.Config), - Processors: make(map[string]pipeline.Config), - Exporters: make(map[string]pipeline.Config), - } - - for _, p := range cfg.Service.Pipelines { - for _, name := range p.Receivers { - if _, done := out.Receivers[name]; done { - continue - } - c, err := resolveOne("receiver", name, cfg.Receivers, func(t pipeline.Type) func() pipeline.Config { - f, ok := factories.Receivers[t] - if !ok { - return nil - } - return f.CreateDefaultConfig - }) - if err != nil { - return ResolvedConfigs{}, err - } - out.Receivers[name] = c - } - for _, name := range p.Processors { - if _, done := out.Processors[name]; done { - continue - } - c, err := resolveOne("processor", name, cfg.Processors, func(t pipeline.Type) func() pipeline.Config { - f, ok := factories.Processors[t] - if !ok { - return nil - } - return f.CreateDefaultConfig - }) - if err != nil { - return ResolvedConfigs{}, err - } - out.Processors[name] = c - } - for _, name := range p.Exporters { - if _, done := out.Exporters[name]; done { - continue - } - c, err := resolveOne("exporter", name, cfg.Exporters, func(t pipeline.Type) func() pipeline.Config { - f, ok := factories.Exporters[t] - if !ok { - return nil - } - return f.CreateDefaultConfig - }) - if err != nil { - return ResolvedConfigs{}, err - } - out.Exporters[name] = c - } - } - - return out, nil -} - -// resolveOne walks the role-specific path of resolveComponent -// without producing the create-side bundle. 
Shared across receiver -// /processor/exporter via the defaultLookup closure so the factory -// types do not need to be reconciled into one interface. -func resolveOne( - role string, - name string, - section map[string]yaml.Node, - defaultLookup func(pipeline.Type) func() pipeline.Config, -) (pipeline.Config, error) { - typ, _, err := splitName(name) - if err != nil { - return nil, fmt.Errorf("%s %q: %w", role, name, err) - } - node, ok := section[name] - if !ok { - return nil, fmt.Errorf("%s %q not declared in top-level %ss", role, name, role) - } - defFn := defaultLookup(typ) - if defFn == nil { - return nil, fmt.Errorf("%s %q: no factory registered for type %q", role, name, typ) - } - cfgInst := defFn() - if cfgInst == nil { - return nil, fmt.Errorf("%s %q: factory returned nil default config", role, name) - } - if err := strictDecodeNode(&node, cfgInst); err != nil { - return nil, fmt.Errorf("%s %q decode: %w", role, name, err) - } - return cfgInst, nil -} - -// strictDecodeNode decodes a yaml.Node into out with KnownFields(true) -// so a typo'd key (e.g. `collection_intrval`) fails at config-load -// time. yaml.Node.Decode bypasses the top-level decoder's strict-mode, -// so we re-marshal and feed it through a yaml.Decoder. 
-func strictDecodeNode(node *yaml.Node, out any) error { - raw, err := yaml.Marshal(node) - if err != nil { - return fmt.Errorf("re-marshal yaml node: %w", err) - } - dec := yaml.NewDecoder(bytes.NewReader(raw)) - dec.KnownFields(true) - if err := dec.Decode(out); err != nil { - return fmt.Errorf("strict decode: %w", err) - } - return nil -} diff --git a/internal/pipelinebuilder/builder_test.go b/internal/pipelinebuilder/builder_test.go deleted file mode 100644 index dadaf61d..00000000 --- a/internal/pipelinebuilder/builder_test.go +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinebuilder_test - -import ( - "context" - "io" - "log/slog" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/pdata/pmetric" - "gopkg.in/yaml.v3" - - "github.com/tracecoreai/tracecore/internal/config" - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/pipeline" - "github.com/tracecoreai/tracecore/internal/pipelinebuilder" -) - -// TestBuildPipelines_WithProcessor exercises the processor-stage code -// path. In-tree components are receivers + exporters only; without a -// fake ProcessorFactory the buildProcessors function would never be -// covered by tests in the cmd-package suite. 
-func TestBuildPipelines_WithProcessor(t *testing.T) { - t.Parallel() - - procType := pipeline.MustNewType("noop") - cfg := &config.Config{ - Receivers: map[string]yaml.Node{ - "echo": mustYAML(`{}`), - }, - Processors: map[string]yaml.Node{ - "noop": mustYAML(`{}`), - }, - Exporters: map[string]yaml.Node{ - "sink": mustYAML(`{}`), - }, - Service: config.Service{ - Pipelines: map[string]config.Pipeline{ - "metrics/primary": { - Receivers: []string{"echo"}, - Processors: []string{"noop"}, - Exporters: []string{"sink"}, - }, - }, - }, - } - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{ - pipeline.MustNewType("echo"): &echoReceiverFactory{}, - }, - Processors: map[pipeline.Type]pipeline.ProcessorFactory{ - procType: &noopProcessorFactory{}, - }, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - got, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), cfg, factories) - require.NoError(t, err) - require.Len(t, got, 1) - require.Len(t, got[0].Processors, 1, - "buildProcessors must construct the noop processor instance") -} - -// TestBuildPipelines_RejectsUnknownComponentField confirms a typo'd -// key in any component config (receiver / processor / exporter) is -// rejected at build time. Catches `collection_intrval: 15s` etc. -// without each receiver needing its own strict-decode wiring. 
-func TestBuildPipelines_RejectsUnknownComponentField(t *testing.T) { - t.Parallel() - - mkCfg := func(receiver, processor, exporter string) *config.Config { - return &config.Config{ - Receivers: map[string]yaml.Node{"typed": mustYAML(receiver)}, - Processors: map[string]yaml.Node{"noop": mustYAML(processor)}, - Exporters: map[string]yaml.Node{"sink": mustYAML(exporter)}, - Service: config.Service{ - Pipelines: map[string]config.Pipeline{ - "metrics/primary": { - Receivers: []string{"typed"}, - Processors: []string{"noop"}, - Exporters: []string{"sink"}, - }, - }, - }, - } - } - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{ - pipeline.MustNewType("typed"): &typedReceiverFactory{}, - }, - Processors: map[pipeline.Type]pipeline.ProcessorFactory{ - pipeline.MustNewType("noop"): &noopProcessorFactory{}, - }, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - t.Run("receiver typo", func(t *testing.T) { - t.Parallel() - cfg := mkCfg(`{collection_intrval: 15s}`, `{}`, `{}`) - _, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), cfg, factories) - require.Error(t, err) - require.Contains(t, err.Error(), `receiver "typed" decode`) - require.Contains(t, err.Error(), "collection_intrval") - }) - - t.Run("processor typo", func(t *testing.T) { - t.Parallel() - cfg := mkCfg(`{collection_interval: 15s}`, `{not_a_field: 1}`, `{}`) - _, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), cfg, factories) - require.Error(t, err) - require.Contains(t, err.Error(), `processor "noop" decode`) - }) - - t.Run("exporter typo", func(t *testing.T) { - t.Parallel() - cfg := mkCfg(`{collection_interval: 15s}`, `{}`, `{bogus: true}`) - _, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), cfg, factories) - require.Error(t, err) - require.Contains(t, err.Error(), `exporter "sink" decode`) - }) - - t.Run("valid config still 
builds", func(t *testing.T) { - t.Parallel() - cfg := mkCfg(`{collection_interval: 15s}`, `{}`, `{}`) - _, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), cfg, factories) - require.NoError(t, err) - }) -} - -// TestResolveConfigs returns the per-component resolved configs -// for every name referenced by a pipeline, applying factory -// defaults + the operator's YAML overlay. Used by `validate -// --explain`. -func TestResolveConfigs(t *testing.T) { - t.Parallel() - - cfg := &config.Config{ - Receivers: map[string]yaml.Node{"typed": mustYAML(`{collection_interval: 30s}`)}, - Processors: map[string]yaml.Node{"noop": mustYAML(`{}`)}, - Exporters: map[string]yaml.Node{"sink": mustYAML(`{}`)}, - Service: config.Service{ - Pipelines: map[string]config.Pipeline{ - "metrics/primary": { - Receivers: []string{"typed"}, - Processors: []string{"noop"}, - Exporters: []string{"sink"}, - }, - }, - }, - } - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{ - pipeline.MustNewType("typed"): &typedReceiverFactory{}, - }, - Processors: map[pipeline.Type]pipeline.ProcessorFactory{ - pipeline.MustNewType("noop"): &noopProcessorFactory{}, - }, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - resolved, err := pipelinebuilder.ResolveConfigs(cfg, factories) - require.NoError(t, err) - - require.Contains(t, resolved.Receivers, "typed") - tc, ok := resolved.Receivers["typed"].(*typedConfig) - require.True(t, ok, "resolved receiver config has the factory's struct type") - require.Equal(t, "30s", tc.CollectionInterval, - "operator overlay must win over the factory default") - - require.Contains(t, resolved.Processors, "noop") - require.Contains(t, resolved.Exporters, "sink") -} - -func TestResolveConfigs_RejectsUnknownField(t *testing.T) { - t.Parallel() - - cfg := &config.Config{ - Receivers: map[string]yaml.Node{"typed": mustYAML(`{collection_intrval: 
30s}`)}, - Processors: map[string]yaml.Node{"noop": mustYAML(`{}`)}, - Exporters: map[string]yaml.Node{"sink": mustYAML(`{}`)}, - Service: config.Service{ - Pipelines: map[string]config.Pipeline{ - "metrics/primary": { - Receivers: []string{"typed"}, - Processors: []string{"noop"}, - Exporters: []string{"sink"}, - }, - }, - }, - } - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{ - pipeline.MustNewType("typed"): &typedReceiverFactory{}, - }, - Processors: map[pipeline.Type]pipeline.ProcessorFactory{ - pipeline.MustNewType("noop"): &noopProcessorFactory{}, - }, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - _, err := pipelinebuilder.ResolveConfigs(cfg, factories) - require.Error(t, err, "strict-decode must trip on typo'd field") - require.Contains(t, err.Error(), "collection_intrval") -} - -func TestResolveConfigs_UnknownTypeReportsFactoryGap(t *testing.T) { - t.Parallel() - - cfg := &config.Config{ - Receivers: map[string]yaml.Node{"nonexistent": mustYAML(`{}`)}, - Exporters: map[string]yaml.Node{"sink": mustYAML(`{}`)}, - Service: config.Service{ - Pipelines: map[string]config.Pipeline{ - "metrics/primary": { - Receivers: []string{"nonexistent"}, - Exporters: []string{"sink"}, - }, - }, - }, - } - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{}, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - _, err := pipelinebuilder.ResolveConfigs(cfg, factories) - require.Error(t, err) - require.Contains(t, err.Error(), `no factory registered for type "nonexistent"`) -} - -func TestResolveConfigs_NameNotInSection(t *testing.T) { - t.Parallel() - - cfg := &config.Config{ - Receivers: map[string]yaml.Node{}, - Exporters: map[string]yaml.Node{"sink": mustYAML(`{}`)}, - Service: config.Service{ - Pipelines: map[string]config.Pipeline{ - 
"metrics/primary": { - Receivers: []string{"typed"}, - Exporters: []string{"sink"}, - }, - }, - }, - } - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{ - pipeline.MustNewType("typed"): &typedReceiverFactory{}, - }, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - _, err := pipelinebuilder.ResolveConfigs(cfg, factories) - require.Error(t, err) - require.Contains(t, err.Error(), `not declared in top-level`) -} - -func mustYAML(s string) yaml.Node { - if s == "" { - panic("mustYAML: empty input — pass at least an empty mapping {}") - } - var n yaml.Node - if err := yaml.Unmarshal([]byte(s), &n); err != nil { - panic(err) - } - if n.Kind == yaml.DocumentNode && len(n.Content) > 0 { - return *n.Content[0] - } - return n -} - -func discardLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } - -// echoReceiverFactory + noopProcessorFactory + sinkExporterFactory: -// minimal factory triple sufficient to exercise the processor path -// in buildPipelines. 
- -type echoReceiverFactory struct{} - -func (*echoReceiverFactory) Type() pipeline.Type { return pipeline.MustNewType("echo") } -func (*echoReceiverFactory) CreateDefaultConfig() pipeline.Config { - return &emptyConfig{} -} - -func (*echoReceiverFactory) CreateMetrics(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Metrics) (pipeline.Receiver, error) { - return noopComponent{}, nil -} - -func (*echoReceiverFactory) CreateTraces(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Traces) (pipeline.Receiver, error) { - return nil, pipeline.ErrSignalNotSupported -} - -func (*echoReceiverFactory) CreateLogs(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Logs) (pipeline.Receiver, error) { - return nil, pipeline.ErrSignalNotSupported -} - -type noopProcessorFactory struct{} - -func (*noopProcessorFactory) Type() pipeline.Type { return pipeline.MustNewType("noop") } -func (*noopProcessorFactory) CreateDefaultConfig() pipeline.Config { - return &emptyConfig{} -} - -func (*noopProcessorFactory) CreateMetrics(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, next consumer.Metrics) (pipeline.Processor, error) { - return &noopProcessor{next: next}, nil -} - -func (*noopProcessorFactory) CreateTraces(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Traces) (pipeline.Processor, error) { - return nil, pipeline.ErrSignalNotSupported -} - -func (*noopProcessorFactory) CreateLogs(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Logs) (pipeline.Processor, error) { - return nil, pipeline.ErrSignalNotSupported -} - -type sinkExporterFactory struct{} - -func (*sinkExporterFactory) Type() pipeline.Type { return pipeline.MustNewType("sink") } -func (*sinkExporterFactory) CreateDefaultConfig() pipeline.Config { - return &emptyConfig{} -} - -func (*sinkExporterFactory) CreateMetrics(_ context.Context, _ pipeline.CreateSettings, _ 
pipeline.Config) (pipeline.Exporter, error) { - return &sinkExporter{}, nil -} - -func (*sinkExporterFactory) CreateTraces(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config) (pipeline.Exporter, error) { - return nil, pipeline.ErrSignalNotSupported -} - -func (*sinkExporterFactory) CreateLogs(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config) (pipeline.Exporter, error) { - return nil, pipeline.ErrSignalNotSupported -} - -type emptyConfig struct{} - -func (*emptyConfig) Validate() error { return nil } - -// typedConfig has a real field so KnownFields(true) rejects typos -// like `collection_intrval` while accepting `collection_interval`. -type typedConfig struct { - CollectionInterval string `yaml:"collection_interval"` -} - -func (*typedConfig) Validate() error { return nil } - -type typedReceiverFactory struct{} - -func (*typedReceiverFactory) Type() pipeline.Type { return pipeline.MustNewType("typed") } -func (*typedReceiverFactory) CreateDefaultConfig() pipeline.Config { - return &typedConfig{} -} - -func (*typedReceiverFactory) CreateMetrics(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Metrics) (pipeline.Receiver, error) { - return noopComponent{}, nil -} - -func (*typedReceiverFactory) CreateTraces(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Traces) (pipeline.Receiver, error) { - return nil, pipeline.ErrSignalNotSupported -} - -func (*typedReceiverFactory) CreateLogs(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Logs) (pipeline.Receiver, error) { - return nil, pipeline.ErrSignalNotSupported -} - -type noopComponent struct{} - -func (noopComponent) Start(context.Context, pipeline.Host) error { return nil } -func (noopComponent) Shutdown(context.Context) error { return nil } - -type noopProcessor struct { - next consumer.Metrics -} - -func (noopProcessor) Start(context.Context, pipeline.Host) error { return nil } -func (noopProcessor) 
Shutdown(context.Context) error { return nil } -func (noopProcessor) Capabilities() consumer.Capabilities { - return consumer.Capabilities{} -} - -func (n *noopProcessor) ConsumeMetrics(ctx context.Context, md pmetric.Metrics) error { - return n.next.ConsumeMetrics(ctx, md) //nolint:wrapcheck // pass-through processor -} - -type sinkExporter struct{} - -func (sinkExporter) Start(context.Context, pipeline.Host) error { return nil } -func (sinkExporter) Shutdown(context.Context) error { return nil } -func (sinkExporter) Capabilities() consumer.Capabilities { return consumer.Capabilities{} } -func (sinkExporter) ConsumeMetrics(context.Context, pmetric.Metrics) error { return nil } diff --git a/internal/pipelinebuilder/doc.go b/internal/pipelinebuilder/doc.go deleted file mode 100644 index d2dcbd79..00000000 --- a/internal/pipelinebuilder/doc.go +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package pipelinebuilder turns a loaded config.Config plus a -// pipeline.Factories registry into the runtime's []pipeline.Pipeline. -// The single exported entry point is BuildPipelines. -// -// As of 2026-05. -// -// Sibling of internal/pipeline (not nested inside) because -// internal/config imports internal/pipeline; the inverse would -// cycle. -// -// Tests live in two places: -// - builder_test.go covers paths in-tree components don't -// exercise (the processor stage; tracecore has no processor -// components today). -// - cmd/tracecore/main_test.go covers BuildPipelines end-to-end -// with the real components() registry. `make coverage` uses -// -coverpkg to attribute that coverage back to this package. -// -// When adding a builder-side feature, test it here unless it -// needs the real components() registry — then test in cmd/tracecore. 
-package pipelinebuilder diff --git a/internal/pipelinebuilder/fuzz_test.go b/internal/pipelinebuilder/fuzz_test.go deleted file mode 100644 index 74ba10f7..00000000 --- a/internal/pipelinebuilder/fuzz_test.go +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinebuilder - -import ( - "testing" -) - -// FuzzSplitName pushes operator-supplied component reference names -// (e.g. `clockreceiver/primary`, `otlp`, malformed inputs) through -// splitName. The property: no panic, regardless of input. -func FuzzSplitName(f *testing.F) { - f.Add("clockreceiver") - f.Add("clockreceiver/primary") - f.Add("/") - f.Add("a/b/c") - f.Add("") - f.Add("\x00bad") - - f.Fuzz(func(t *testing.T, s string) { - _, _, _ = splitName(s) - }) -} diff --git a/internal/pipelinebuilder/signalops.go b/internal/pipelinebuilder/signalops.go deleted file mode 100644 index ba639cbb..00000000 --- a/internal/pipelinebuilder/signalops.go +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinebuilder - -import ( - "context" - "errors" - "log/slog" - - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/fanout" - "github.com/tracecoreai/tracecore/internal/pipeline" -) - -// Do not add a `signalConsumer` interface constraint to `[C any]` below. -// The closures already enforce the consumer interface at usage time, and -// a typed constraint silently breaks pipelinebuilder's `-coverpkg` -// coverage. See docs/FOLLOWUPS.md "Considered and explicitly skipped". - -// signalOps holds the per-signal callbacks (metrics/traces/logs) so the -// assembly algorithm in buildSignalPipeline stays signal-agnostic. -// -// createExporter / createProcessor return both the raw component and its -// consumer.C view; a factory that returns a Component not implementing -// consumer. is a builder bug, surfaced here rather than at use. 
-type signalOps[C any] struct { - createExporter func(f pipeline.ExporterFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config) (pipeline.Exporter, C, error) - createProcessor func(f pipeline.ProcessorFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next C) (pipeline.Processor, C, error) - createReceiver func(f pipeline.ReceiverFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next C) (pipeline.Receiver, error) - - newFanout func([]C) C - wrapFirstData func(pID string, logger *slog.Logger, next C) C - // wrapSafe wraps a downstream consumer so panics in its Consume* - // method are recovered, logged with the component ID, and surfaced - // as ordinary errors. Inserted between every consumer hop so a - // buggy processor or exporter can't crash the binary. - wrapSafe func(componentID string, logger *slog.Logger, next C) C -} - -var metricsOps = signalOps[consumer.Metrics]{ - createExporter: func(f pipeline.ExporterFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config) (pipeline.Exporter, consumer.Metrics, error) { - e, err := f.CreateMetrics(ctx, set, cfg) - if err != nil { - return nil, nil, err //nolint:wrapcheck // builder-side error; caller adds pipeline/component context. - } - c, ok := e.(consumer.Metrics) - if !ok { - return nil, nil, errors.New("does not implement consumer.Metrics") - } - return e, c, nil - }, - createProcessor: func(f pipeline.ProcessorFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next consumer.Metrics) (pipeline.Processor, consumer.Metrics, error) { - p, err := f.CreateMetrics(ctx, set, cfg, next) - if err != nil { - return nil, nil, err //nolint:wrapcheck // builder-side error; caller adds pipeline/component context. 
- } - c, ok := p.(consumer.Metrics) - if !ok { - return nil, nil, errors.New("does not implement consumer.Metrics") - } - return p, c, nil - }, - createReceiver: func(f pipeline.ReceiverFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next consumer.Metrics) (pipeline.Receiver, error) { - return f.CreateMetrics(ctx, set, cfg, next) //nolint:wrapcheck // caller wraps with pipeline/component context. - }, - newFanout: fanout.NewMetrics, - wrapFirstData: pipeline.WrapFirstDataMetrics, - wrapSafe: pipeline.WrapSafeMetrics, -} - -var tracesOps = signalOps[consumer.Traces]{ - createExporter: func(f pipeline.ExporterFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config) (pipeline.Exporter, consumer.Traces, error) { - e, err := f.CreateTraces(ctx, set, cfg) - if err != nil { - return nil, nil, err //nolint:wrapcheck // builder-side error; caller adds pipeline/component context. - } - c, ok := e.(consumer.Traces) - if !ok { - return nil, nil, errors.New("does not implement consumer.Traces") - } - return e, c, nil - }, - createProcessor: func(f pipeline.ProcessorFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next consumer.Traces) (pipeline.Processor, consumer.Traces, error) { - p, err := f.CreateTraces(ctx, set, cfg, next) - if err != nil { - return nil, nil, err //nolint:wrapcheck // builder-side error; caller adds pipeline/component context. - } - c, ok := p.(consumer.Traces) - if !ok { - return nil, nil, errors.New("does not implement consumer.Traces") - } - return p, c, nil - }, - createReceiver: func(f pipeline.ReceiverFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next consumer.Traces) (pipeline.Receiver, error) { - return f.CreateTraces(ctx, set, cfg, next) //nolint:wrapcheck // caller wraps with pipeline/component context. 
- }, - newFanout: fanout.NewTraces, - wrapFirstData: pipeline.WrapFirstDataTraces, - wrapSafe: pipeline.WrapSafeTraces, -} - -var logsOps = signalOps[consumer.Logs]{ - createExporter: func(f pipeline.ExporterFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config) (pipeline.Exporter, consumer.Logs, error) { - e, err := f.CreateLogs(ctx, set, cfg) - if err != nil { - return nil, nil, err //nolint:wrapcheck // builder-side error; caller adds pipeline/component context. - } - c, ok := e.(consumer.Logs) - if !ok { - return nil, nil, errors.New("does not implement consumer.Logs") - } - return e, c, nil - }, - createProcessor: func(f pipeline.ProcessorFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next consumer.Logs) (pipeline.Processor, consumer.Logs, error) { - p, err := f.CreateLogs(ctx, set, cfg, next) - if err != nil { - return nil, nil, err //nolint:wrapcheck // builder-side error; caller adds pipeline/component context. - } - c, ok := p.(consumer.Logs) - if !ok { - return nil, nil, errors.New("does not implement consumer.Logs") - } - return p, c, nil - }, - createReceiver: func(f pipeline.ReceiverFactory, ctx context.Context, set pipeline.CreateSettings, cfg pipeline.Config, next consumer.Logs) (pipeline.Receiver, error) { - return f.CreateLogs(ctx, set, cfg, next) //nolint:wrapcheck // caller wraps with pipeline/component context. 
- }, - newFanout: fanout.NewLogs, - wrapFirstData: pipeline.WrapFirstDataLogs, - wrapSafe: pipeline.WrapSafeLogs, -} diff --git a/internal/pipelinebuilder/telemetry_wiring_test.go b/internal/pipelinebuilder/telemetry_wiring_test.go deleted file mode 100644 index 47590432..00000000 --- a/internal/pipelinebuilder/telemetry_wiring_test.go +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package pipelinebuilder_test - -import ( - "context" - "sync" - "testing" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/otel/metric/noop" - "gopkg.in/yaml.v3" - - "github.com/tracecoreai/tracecore/internal/config" - "github.com/tracecoreai/tracecore/internal/consumer" - "github.com/tracecoreai/tracecore/internal/pipeline" - "github.com/tracecoreai/tracecore/internal/pipelinebuilder" -) - -// captureFactory is a stub receiver factory that records the -// CreateSettings it sees so a test can assert MeterProvider + -// BuildInfo flowed through. -type captureFactory struct { - mu sync.Mutex - set pipeline.CreateSettings -} - -func (*captureFactory) Type() pipeline.Type { return pipeline.MustNewType("capture") } -func (*captureFactory) CreateDefaultConfig() pipeline.Config { return &emptyConfig{} } - -func (f *captureFactory) CreateMetrics(_ context.Context, set pipeline.CreateSettings, _ pipeline.Config, _ consumer.Metrics) (pipeline.Receiver, error) { - f.mu.Lock() - f.set = set - f.mu.Unlock() - return noopComponent{}, nil -} - -func (*captureFactory) CreateTraces(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Traces) (pipeline.Receiver, error) { - return nil, pipeline.ErrSignalNotSupported -} - -func (*captureFactory) CreateLogs(_ context.Context, _ pipeline.CreateSettings, _ pipeline.Config, _ consumer.Logs) (pipeline.Receiver, error) { - return nil, pipeline.ErrSignalNotSupported -} - -func (f *captureFactory) seen() pipeline.CreateSettings { - f.mu.Lock() - defer f.mu.Unlock() - return f.set -} - -func 
minimalCfg(t *testing.T) *config.Config { - t.Helper() - var node yaml.Node - require.NoError(t, yaml.Unmarshal([]byte("{}"), &node)) - return &config.Config{ - Receivers: map[string]yaml.Node{"capture": node}, - Exporters: map[string]yaml.Node{"sink": node}, - Service: config.Service{ - Pipelines: map[string]config.Pipeline{ - "metrics/primary": { - Receivers: []string{"capture"}, - Exporters: []string{"sink"}, - }, - }, - }, - } -} - -// TestBuildPipelines_DefaultsMeterProviderToNoop pins the safety -// invariant: receivers MUST never see a nil MeterProvider. When the -// caller doesn't pass WithMeterProvider, BuildPipelines substitutes a -// noop, so receiver code can write `telSet.MeterProvider.Meter(...)` -// without a nil-check. -func TestBuildPipelines_DefaultsMeterProviderToNoop(t *testing.T) { - t.Parallel() - - rf := &captureFactory{} - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{rf.Type(): rf}, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - _, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), minimalCfg(t), factories) - require.NoError(t, err) - - set := rf.seen() - require.NotNil(t, set.Telemetry.MeterProvider, "MeterProvider must default to noop, not stay nil") -} - -// TestBuildPipelines_WithMeterProvider_FlowsThrough pins the wire-up: -// when the caller passes WithMeterProvider, the receiver sees that -// exact provider. 
-func TestBuildPipelines_WithMeterProvider_FlowsThrough(t *testing.T) { - t.Parallel() - - mp := noop.NewMeterProvider() - rf := &captureFactory{} - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{rf.Type(): rf}, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - _, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), minimalCfg(t), factories, - pipelinebuilder.WithMeterProvider(mp)) - require.NoError(t, err) - - set := rf.seen() - require.Equal(t, mp, set.Telemetry.MeterProvider, - "WithMeterProvider must be the exact provider the receiver sees") -} - -// TestBuildPipelines_WithBuildInfo_FlowsThrough pins that BuildInfo -// flows from BuildPipelines through to the receiver's CreateSettings. -func TestBuildPipelines_WithBuildInfo_FlowsThrough(t *testing.T) { - t.Parallel() - - rf := &captureFactory{} - factories := pipeline.Factories{ - Receivers: map[pipeline.Type]pipeline.ReceiverFactory{rf.Type(): rf}, - Exporters: map[pipeline.Type]pipeline.ExporterFactory{ - pipeline.MustNewType("sink"): &sinkExporterFactory{}, - }, - } - - bi := pipeline.BuildInfo{Command: "tracecore", Description: "test build", Version: "v0.2.0"} - _, err := pipelinebuilder.BuildPipelines(t.Context(), discardLogger(), minimalCfg(t), factories, - pipelinebuilder.WithBuildInfo(bi)) - require.NoError(t, err) - - set := rf.seen() - require.Equal(t, bi, set.BuildInfo) -} diff --git a/internal/runtime/lifecycle/example_test.go b/internal/runtime/lifecycle/example_test.go deleted file mode 100644 index f6f788c4..00000000 --- a/internal/runtime/lifecycle/example_test.go +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package lifecycle_test - -import ( - "context" - "fmt" - "io" - "log/slog" - "time" - - "github.com/tracecoreai/tracecore/internal/runtime/lifecycle" -) - -// ExampleLifecycle shows the canonical streaming-source lifecycle -// 
pattern: construct via New, Start with the parent ctx + a run -// function, Shutdown when done. Panic recovery and cancel-cascade -// are automatic. New receiver authors should copy this shape. -func ExampleLifecycle() { - // Discard logger keeps `go test -v` output focused on the - // Example's `// Output:` block; production callers pass - // slog.Default() (or a custom handler) instead. - logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - lc := lifecycle.New(logger, func(rec any) { - fmt.Println("panic recovered:", rec) - }) - - // Spawn a worker that exits cleanly when ctx fires. - done := make(chan struct{}) - if err := lc.Start(context.Background(), func(ctx context.Context) { - <-ctx.Done() - close(done) - }); err != nil { - fmt.Println("start error:", err) - return - } - - // Shut down with a 1-second budget. - shutdownCtx, cancel := context.WithTimeout(context.Background(), time.Second) - defer cancel() - if err := lc.Shutdown(shutdownCtx); err != nil { - fmt.Println("shutdown error:", err) - return - } - <-done - fmt.Println("ok") - // Output: ok -} diff --git a/internal/runtime/lifecycle/lifecycle.go b/internal/runtime/lifecycle/lifecycle.go deleted file mode 100644 index a8de1199..00000000 --- a/internal/runtime/lifecycle/lifecycle.go +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -// Package lifecycle is the canonical streaming-source lifecycle -// helper used by tracecore receivers whose hot path is a long-lived -// goroutine (vs the tick-based clockreceiver pattern). The helper -// owns cancel + WaitGroup + panic-recovery so each source author -// writes a body function, not bookkeeping. -// -// Used by: -// - components/receivers/containerstdout/receiver.go -// -// (Other receivers have migrated to receiver-scoped sibling -// `lifecycle.go` types — see components/receivers/{nccl_fr, -// clockreceiver,kernelevents,k8sevents,pyspy} for the pattern. 
This -// package is slated for deletion in RFC-0013 PR-F.2 once -// containerstdout either ports off the helper or is deleted by -// PR-K.2.) -// -// The cleanest extension test ("D1 falsifier" in M9's research log) -// is to write a Windows ETW source that consists only of a Run -// function plus Lifecycle-helper wiring; the source's Start/Shutdown -// methods are 3 lines each. -package lifecycle - -import ( - "context" - "errors" - "fmt" - "log/slog" - "runtime" - "sync" - "sync/atomic" -) - -// ErrAlreadyStarted is returned by Lifecycle.Start when called a -// second time without an intervening Shutdown. Callers that want to -// silently absorb the duplicate (rather than surface the contract -// violation) should do so via `errors.Is(err, ErrAlreadyStarted)` -// rather than by string-matching the error message. -// -// # Sentinel naming convention -// -// `ErrAlreadyStarted` is the canonical name for the -// "Start-called-twice-without-Shutdown" contract across tracecore. -// Mirroring symbols live in [pipeline.ErrAlreadyStarted] (runtime) -// and [telemetry.ErrAlreadyStarted] (HTTP server). The package path -// disambiguates which subject was started twice; using the same -// short name in each package means a contributor learning a new -// package doesn't have to learn a new error name for the same -// contract. New packages exporting this contract MUST use the same -// name. If a future contract is "Shutdown called twice," use -// `ErrAlreadyShutdown` — same pattern, no qualifying subject. -var ErrAlreadyStarted = errors.New("lifecycle: already started") - -// PanicCallback is invoked once if the Run function panics. The -// helper recovers the panic so the receiver never crashes the -// workload (PRINCIPLES.md §1). Receivers wire this to a structured -// log + selftelemetry IncError("panic"). -type PanicCallback func(panicValue any) - -// Lifecycle bundles the cancel + WaitGroup + started-flag a -// streaming source needs. Zero-value is NOT useful; use New. 
-// -// Shutdown semantics: Shutdown is idempotent. The FIRST Shutdown's -// error (if any — typically a caller-ctx deadline) is stashed and -// returned by every subsequent Shutdown so the failure is not -// silently swallowed. Goroutines that ignore ctx leak past -// Shutdown until they exit on their own; the helper logs at WARN -// and returns the caller-ctx error rather than blocking — STYLE.md -// 1-second shutdown deadline. -type Lifecycle struct { - logger *slog.Logger - onPanic PanicCallback - - mu sync.Mutex - cancel context.CancelFunc - internalCtx context.Context - closed bool // set by Shutdown so post-Shutdown Add silently no-ops - shutdownErr error - wg sync.WaitGroup - started atomic.Bool -} - -// New constructs a Lifecycle. The logger is required (slog.Default -// is acceptable for tests); onPanic may be nil, in which case panics -// are logged at ERROR but no counter fires. -func New(logger *slog.Logger, onPanic PanicCallback) *Lifecycle { - if logger == nil { - logger = slog.Default() - } - return &Lifecycle{logger: logger, onPanic: onPanic} -} - -// Start spawns `run` in a goroutine. The ctx passed to run is derived -// from `parent` via context.WithCancel — so a cancellation cascade -// from the receiver-level parent ctx reaches the goroutine without -// the caller needing to invoke Shutdown explicitly. Idempotent: a -// second Start without an intervening Shutdown returns an error. -func (l *Lifecycle) Start(parent context.Context, run func(context.Context)) error { - if !l.started.CompareAndSwap(false, true) { - return ErrAlreadyStarted - } - l.mu.Lock() - internalCtx, cancel := context.WithCancel(parent) - l.cancel = cancel - l.internalCtx = internalCtx - // wg.Add(1) MUST happen under the same mutex as cancel/internalCtx - // so a concurrent Shutdown sees the post-Add state. 
The previous - // shape (Add outside the lock) admitted a race where Shutdown - // could observe cancel != nil, run wg.Wait() at counter=0 - // (returning immediately), and either trigger the - // `sync: WaitGroup misuse` panic OR orphan the goroutine the - // helper still owed the caller. See TestLifecycle_StartShutdownConcurrent_NoPanic. - l.wg.Add(1) - l.mu.Unlock() - go l.safeRun(internalCtx, run) - return nil -} - -// safeRun wraps the caller's run function with panic recovery and -// wg.Done bookkeeping. -func (l *Lifecycle) safeRun(ctx context.Context, run func(context.Context)) { - defer l.wg.Done() - defer func() { - if rec := recover(); rec != nil { - l.logger.Error("lifecycle: run panic recovered", "panic", fmt.Sprintf("%v", rec)) - if l.onPanic != nil { - l.onPanic(rec) - } - } - }() - run(ctx) -} - -// Shutdown cancels the internal ctx and waits for the goroutine to -// exit, honoring the caller's ctx deadline. Idempotent: subsequent -// calls return the FIRST call's error (typically a deadline- -// exceeded) rather than silently swallowing it. -func (l *Lifecycle) Shutdown(ctx context.Context) error { - l.mu.Lock() - if l.closed { - err := l.shutdownErr - l.mu.Unlock() - return err - } - cancel := l.cancel - l.cancel = nil - l.internalCtx = nil - l.closed = true - l.mu.Unlock() - if cancel == nil { - return nil - } - cancel() - - done := make(chan struct{}) - go func() { - l.wg.Wait() - close(done) - }() - select { - case <-done: - return nil - case <-ctx.Done(): - // NumGoroutine is process-wide, not lifecycle-local; surfacing it - // here lets operators eyeball whether the leak is plausibly ours. 
- l.logger.Warn("lifecycle: shutdown deadline elapsed before goroutine exited", - "process_goroutines", runtime.NumGoroutine()) - err := fmt.Errorf("lifecycle shutdown: %w", ctx.Err()) - l.mu.Lock() - l.shutdownErr = err - l.mu.Unlock() - return err - } -} - -// Add registers an additional goroutine under the same WaitGroup, -// so auxiliary watchers (e.g., the kmsg ctx-cancel reader-close -// goroutine) participate in Shutdown waiting. The goroutine receives -// the lifecycle's internal ctx so Shutdown's cancel reaches it. A -// panic inside `run` is recovered (same contract as Start). -// -// Refusal modes (silent, but logged at WARN so callers don't lose -// the goroutine to invisibility): -// - lifecycle has not been Started: the callback would never see -// a cancel; spawning is a leak hazard. -// - lifecycle has already been Shutdown: wg.Wait may have -// returned; a fresh wg.Add(1) would panic. -// -// TOCTOU safety: wg.Add(1) happens under the same mutex as the -// post-Shutdown `closed` check, so Add never races a concurrent -// Shutdown into wg.Add-after-wg.Wait-returned panic territory. -func (l *Lifecycle) Add(run func(context.Context)) { - l.mu.Lock() - if l.closed || l.internalCtx == nil { - l.mu.Unlock() - // Log so future authors don't hit silent-refusal traps. 
- l.logger.Warn("lifecycle.Add called outside running window — ignored", - "closed", l.closed, "started", l.internalCtx != nil) - return - } - ctx := l.internalCtx - l.wg.Add(1) - l.mu.Unlock() - go l.safeRun(ctx, run) -} diff --git a/internal/runtime/lifecycle/lifecycle_test.go b/internal/runtime/lifecycle/lifecycle_test.go deleted file mode 100644 index cb80eaaa..00000000 --- a/internal/runtime/lifecycle/lifecycle_test.go +++ /dev/null @@ -1,251 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -package lifecycle_test - -import ( - "bytes" - "context" - "log/slog" - "sync" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" - - "github.com/tracecoreai/tracecore/internal/runtime/lifecycle" -) - -// TestLifecycle_StartShutdown_CleanExit pins the happy path: Start -// spawns a goroutine, Shutdown cancels its ctx, the goroutine exits, -// Shutdown returns nil within the deadline. -func TestLifecycle_StartShutdown_CleanExit(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - - exited := make(chan struct{}) - require.NoError(t, l.Start(t.Context(), func(ctx context.Context) { - <-ctx.Done() - close(exited) - })) - require.NoError(t, l.Shutdown(t.Context())) - select { - case <-exited: - case <-time.After(time.Second): - t.Fatal("goroutine did not observe ctx cancellation") - } -} - -// TestLifecycle_ParentCancelCascades pins the cancellation contract: -// cancelling the parent ctx (e.g., the receiver's internal ctx) -// reaches the goroutine WITHOUT an explicit Shutdown call. 
-func TestLifecycle_ParentCancelCascades(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - parent, cancel := context.WithCancel(t.Context()) - defer cancel() - - exited := make(chan struct{}) - require.NoError(t, l.Start(parent, func(ctx context.Context) { - <-ctx.Done() - close(exited) - })) - - cancel() // cancel the PARENT, not via Shutdown - select { - case <-exited: - case <-time.After(time.Second): - t.Fatal("parent-ctx cancel did not cascade") - } - // Shutdown should still complete cleanly. - require.NoError(t, l.Shutdown(t.Context())) -} - -func TestLifecycle_ShutdownIsIdempotent(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - require.NoError(t, l.Start(t.Context(), func(ctx context.Context) { <-ctx.Done() })) - require.NoError(t, l.Shutdown(t.Context())) - require.NoError(t, l.Shutdown(t.Context())) -} - -// TestLifecycle_ShutdownPreservesFirstError pins that the first -// Shutdown's deadline-exceeded error is returned by subsequent -// Shutdown calls (no swallowed errors). -func TestLifecycle_ShutdownPreservesFirstError(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - require.NoError(t, l.Start(t.Context(), func(_ context.Context) { - time.Sleep(500 * time.Millisecond) - })) - deadlineCtx, cancel := context.WithTimeout(t.Context(), 50*time.Millisecond) - defer cancel() - err1 := l.Shutdown(deadlineCtx) - require.Error(t, err1) - require.ErrorIs(t, err1, context.DeadlineExceeded) - err2 := l.Shutdown(t.Context()) - require.Equal(t, err1, err2, "second Shutdown returns the first's error") -} - -// TestLifecycle_AddAfterShutdown_SilentlyNoOps pins the TOCTOU -// safety: an Add racing a Shutdown does not panic on wg.Add after -// wg.Wait. 
-func TestLifecycle_AddAfterShutdown_SilentlyNoOps(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - require.NoError(t, l.Start(t.Context(), func(ctx context.Context) { <-ctx.Done() })) - require.NoError(t, l.Shutdown(t.Context())) - - require.NotPanics(t, func() { - l.Add(func(_ context.Context) { - t.Fatal("post-Shutdown Add should not have spawned the goroutine") - }) - }) -} - -func TestLifecycle_StartTwice_Rejects(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - require.NoError(t, l.Start(t.Context(), func(ctx context.Context) { <-ctx.Done() })) - err := l.Start(t.Context(), func(_ context.Context) {}) - require.ErrorIs(t, err, lifecycle.ErrAlreadyStarted) - // Message text is part of the contract for operators grepping - // logs. Changing it must update CHANGELOG; this assertion - // surfaces the change. - require.Equal(t, "lifecycle: already started", err.Error()) - require.NoError(t, l.Shutdown(t.Context())) -} - -// TestLifecycle_PanicRecovery pins the safety contract: a panicking -// run function does NOT crash the process; onPanic fires once. -func TestLifecycle_PanicRecovery(t *testing.T) { - t.Parallel() - var panicCount atomic.Int32 - l := lifecycle.New(slog.Default(), func(_ any) { panicCount.Add(1) }) - - require.NoError(t, l.Start(t.Context(), func(_ context.Context) { - panic("boom") - })) - require.Eventually(t, func() bool { return panicCount.Load() == 1 }, - time.Second, 10*time.Millisecond, "onPanic must fire once") - require.NoError(t, l.Shutdown(t.Context())) -} - -// TestLifecycle_Add_PanicRecovery pins that an Add'd goroutine that -// panics doesn't bring the lifecycle down — same contract as -// Start's safeRun. 
-func TestLifecycle_Add_PanicRecovery(t *testing.T) { - t.Parallel() - var panicCount atomic.Int32 - l := lifecycle.New(slog.Default(), func(_ any) { panicCount.Add(1) }) - require.NoError(t, l.Start(t.Context(), func(ctx context.Context) { <-ctx.Done() })) - l.Add(func(_ context.Context) { panic("aux boom") }) - require.Eventually(t, func() bool { return panicCount.Load() == 1 }, - time.Second, 10*time.Millisecond, "Add'd panic must fire onPanic") - require.NoError(t, l.Shutdown(t.Context())) -} - -// TestLifecycle_Add_AuxiliaryGoroutine pins that Add tracks an -// extra goroutine in the same WaitGroup so Shutdown waits for it. -func TestLifecycle_Add_AuxiliaryGoroutine(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - var auxExited atomic.Bool - - require.NoError(t, l.Start(t.Context(), func(ctx context.Context) { <-ctx.Done() })) - l.Add(func(ctx context.Context) { - <-ctx.Done() - auxExited.Store(true) - }) - require.NoError(t, l.Shutdown(t.Context())) - require.True(t, auxExited.Load(), "Shutdown must wait for aux goroutine") -} - -// TestLifecycle_ShutdownHonorsCallerCtx pins the 1s shutdown deadline -// contract — a stuck run function returns the caller's ctx error, -// not a hang. STYLE.md: training nodes can't wait > 1s; log and -// abandon the goroutine. -func TestLifecycle_ShutdownHonorsCallerCtx(t *testing.T) { - t.Parallel() - l := lifecycle.New(slog.Default(), nil) - // 500ms sleep is long enough to outlast the 100ms deadline below - // but short enough that the goroutine cleans up before the test - // completes — keeping the suite snappy. 
- require.NoError(t, l.Start(t.Context(), func(_ context.Context) { - time.Sleep(500 * time.Millisecond) - })) - - deadlineCtx, cancel := context.WithTimeout(t.Context(), 100*time.Millisecond) - defer cancel() - err := l.Shutdown(deadlineCtx) - require.Error(t, err, "stuck run must surface deadline-exceeded") - require.ErrorIs(t, err, context.DeadlineExceeded) -} - -// TestLifecycle_ShutdownDeadline_LogsWarning pins the operator- -// observable warning that fires when Shutdown's caller ctx elapses -// before the goroutine exits. The warning + the error-return are -// the two halves of the deadline contract — testing only the error -// would let the log message rot. -func TestLifecycle_ShutdownDeadline_LogsWarning(t *testing.T) { - t.Parallel() - var buf bytes.Buffer - logger := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelWarn})) - l := lifecycle.New(logger, nil) - - require.NoError(t, l.Start(t.Context(), func(_ context.Context) { - time.Sleep(500 * time.Millisecond) - })) - - deadlineCtx, cancel := context.WithTimeout(t.Context(), 50*time.Millisecond) - defer cancel() - err := l.Shutdown(deadlineCtx) - require.ErrorIs(t, err, context.DeadlineExceeded) - - require.Contains(t, buf.String(), "shutdown deadline elapsed", - "deadline path MUST emit the operator-visible warning line") -} - -// TestLifecycle_StartShutdownConcurrent_NoPanic pins the contract -// that a Start/Shutdown race on a freshly-constructed Lifecycle -// does NOT trigger a `sync: WaitGroup misuse: Add called -// concurrently with Wait` panic. The previous Start implementation -// did `wg.Add(1)` AFTER `mu.Unlock()`, so a Shutdown that observed -// cancel!=nil could call wg.Wait() (returning at counter=0) -// before the Add(1) committed — yielding either the WaitGroup- -// misuse panic OR a silently-orphaned goroutine that Shutdown -// nonetheless reported as cleanly shut down. 
-// -// The fix is to put wg.Add(1) under the same mutex as the cancel -// / internalCtx assignment, so Shutdown's mu.Lock acts as a happens- -// before barrier for both observations. -// -// Stress shape: N iterations × M parallel goroutines per iteration, -// each running on a brand-new Lifecycle. Under -race the test -// surfaces the bug deterministically once N×M is large enough; on -// the developer laptop 200×8 fires reliably with the old code. -func TestLifecycle_StartShutdownConcurrent_NoPanic(t *testing.T) { - t.Parallel() - const iterations = 200 - const parallel = 8 - - for range iterations { - var wg sync.WaitGroup - for range parallel { - wg.Add(1) - go func() { - defer wg.Done() - l := lifecycle.New(slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)), nil) - // Start spawns a goroutine that blocks on ctx. - _ = l.Start(t.Context(), func(ctx context.Context) { - <-ctx.Done() - }) - // Race Shutdown against the just-spawned Start. - ctx, cancel := context.WithTimeout(t.Context(), 100*time.Millisecond) - defer cancel() - _ = l.Shutdown(ctx) - }() - } - wg.Wait() - } -} diff --git a/tools/failure-inject/README.md b/tools/failure-inject/README.md index 0806d369..8b73b01d 100644 --- a/tools/failure-inject/README.md +++ b/tools/failure-inject/README.md @@ -128,8 +128,11 @@ builds fall back to plain `OpenFile`. ```sh go test ./tools/failure-inject/... -go test -tags=chaos -race -count=1 ./internal/pipeline/... ``` -The `Chaos` workflow runs the same suites nightly plus a Linux -`mpstat` verification for `cpu-steal`. +The `Chaos` workflow runs the same suite nightly plus a Linux +`mpstat` verification for `cpu-steal`. (The legacy `-tags=chaos` +sweep over `internal/pipeline/...` was deleted in RFC-0013 PR-F.2 +along with the in-tree pipeline runtime; the equivalent +panic-recovery contract now rides on upstream +`go.opentelemetry.io/collector/service`.)