diff --git a/.github/actions/kind-cluster-setup/action.yml b/.github/actions/kind-cluster-setup/action.yml new file mode 100644 index 00000000..aa256ef8 --- /dev/null +++ b/.github/actions/kind-cluster-setup/action.yml @@ -0,0 +1,148 @@ +name: Set up kind cluster (with optional CRD prereqs + tracecore image) +description: > + Unified kind-cluster bootstrap shared by chart.yml, policy-matrix.yml, + install-bench.yml, and (optionally) compat-matrix.yml. Replaces the + prior `kind-tracecore-up` action (deleted — all callsites migrated) + and centralises three sources of fragmentation that recurred on every + chart-touching PR: + + 1. helm + kind version pins drifted independently per workflow. + Pinned here once: helm v3.16.4, kind v0.25.0, node v1.32.0. + 2. Each workflow that talked to the chart's production preset hit + the same "no matches for kind ServiceMonitor in version + monitoring.coreos.com/v1" error on kind (regressed three PRs + before #494 closed it for policy-matrix only). Install the CRDs + in one place driven by inputs (`install-servicemonitor-crd`, + `install-gatekeeper-crds`, `install-cert-manager-crds`). + 3. `docker build` + `kind load` for the tracecore image was + duplicated across chart.yml and install-bench.yml with + diverging tag names (`tracecore:ci` vs `tracecore:bench`). + Driven by `build-image` + `image-tag` inputs. + + CRD refs are pinned to tagged releases per repo convention + (KYVERNO_POLICIES_REF / GATEKEEPER_VERSION in + scripts/policy-matrix-smoke.sh — never track `main`). + +inputs: + cluster-name: + description: kind cluster name (must be unique per job). + required: true + kind-config: + description: Optional path to a kind cluster config file (e.g. bench/install/kind-config.yaml). + required: false + default: '' + build-image: + description: When 'true', `docker build` the tracecore image and `kind load` it. When 'false', skip — kind cluster only. + required: false + default: 'true' + image-tag: + description: Image tag for the locally built tracecore image. Defaults to 'ci' (chart.yml's convention). + required: false + default: 'ci' + install-servicemonitor-crd: + description: | + When 'true', install the prometheus-operator ServiceMonitor CRD (pinned + v0.91.0). Required for `helm install --dry-run=server` against the + chart's production preset (serviceMonitor.enabled=true). + required: false + default: 'false' + install-gatekeeper-crds: + description: | + When 'true', install the Gatekeeper-library v3.18.x CRDs. Reserved + for future workflows that need to apply Gatekeeper Constraints + outside the policy-matrix flow. Today the policy-matrix-smoke.sh + script installs Gatekeeper itself via its own helm chart. + required: false + default: 'false' + install-cert-manager-crds: + description: | + When 'true', install cert-manager CRDs (pinned v1.16.1). Reserved + for future tls.enabled=true cert-manager-integrated install paths. + required: false + default: 'false' + +runs: + using: composite + steps: + - name: Install helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + with: + version: v3.16.4 + + - name: Build tracecore image + if: inputs.build-image == 'true' + shell: bash + env: + IMAGE_TAG: ${{ inputs.image-tag }} + run: | + docker build \ + -f install/kubernetes/tracecore/Dockerfile \ + -t "tracecore:${IMAGE_TAG}" \ + . + + - name: Create kind cluster + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + with: + version: v0.25.0 + node_image: kindest/node:v1.32.0 + cluster_name: ${{ inputs.cluster-name }} + config: ${{ inputs.kind-config }} + + - name: Load image into kind + if: inputs.build-image == 'true' + shell: bash + env: + CLUSTER_NAME: ${{ inputs.cluster-name }} + IMAGE_TAG: ${{ inputs.image-tag }} + run: kind load docker-image "tracecore:${IMAGE_TAG}" --name "${CLUSTER_NAME}" + + - name: Install prometheus-operator ServiceMonitor CRD (pinned v0.91.0) + # The chart's production preset flips `serviceMonitor.enabled=true`, + # which renders a `monitoring.coreos.com/v1 ServiceMonitor` resource. + # Kind does not ship that CRD, so `helm install --dry-run=server` + # exits 1 with "no matches for kind ServiceMonitor in version + # monitoring.coreos.com/v1". We install ONLY the ServiceMonitor CRD + # — the chart references no other monitoring.coreos.com kinds, and + # the slim install (~700 lines of YAML) is cheaper than the full + # prometheus-operator bundle (~3MB) which would also pull + # Prometheus, Alertmanager, ThanosRuler, PodMonitor, Probe, and + # PrometheusRule CRDs we do not exercise. + # + # CRD ref pinned to v0.91.0 (published 2026-05-05) per #494. + # Bumping this pin is a reviewed code change. + if: inputs.install-servicemonitor-crd == 'true' + shell: bash + run: | + kubectl apply -f \ + "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.91.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml" + kubectl wait --for=condition=established \ + crd/servicemonitors.monitoring.coreos.com --timeout=60s + + - name: Install Gatekeeper CRDs (pinned v3.18.2) + # Reserved for future workflows that need to apply + # ConstraintTemplate / Constraint resources outside the + # policy-matrix flow. Today the policy-matrix smoke script + # installs Gatekeeper itself via its own helm chart; this + # input is a forward-compat hook so we have ONE place that + # knows the Gatekeeper CRD pin. + if: inputs.install-gatekeeper-crds == 'true' + shell: bash + run: | + kubectl apply -f \ + "https://raw.githubusercontent.com/open-policy-agent/gatekeeper/v3.18.2/deploy/gatekeeper.yaml" + kubectl wait --for=condition=established \ + crd/constrainttemplates.templates.gatekeeper.sh --timeout=120s + + - name: Install cert-manager CRDs (pinned v1.16.1) + # Reserved for future tls.enabled=true install paths that + # integrate with cert-manager-issued Secrets. + if: inputs.install-cert-manager-crds == 'true' + shell: bash + run: | + kubectl apply -f \ + "https://github.com/cert-manager/cert-manager/releases/download/v1.16.1/cert-manager.crds.yaml" + for crd in certificates.cert-manager.io \ + issuers.cert-manager.io \ + clusterissuers.cert-manager.io; do + kubectl wait --for=condition=established "crd/${crd}" --timeout=60s + done diff --git a/.github/actions/kind-tracecore-up/action.yml b/.github/actions/kind-tracecore-up/action.yml deleted file mode 100644 index ae8853a5..00000000 --- a/.github/actions/kind-tracecore-up/action.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Set up kind cluster with tracecore image -description: > - Install helm, build the tracecore CI image, create a kind cluster, and - load the image into it. Shared by chart.yml's `install` and `upgrade` - jobs, both of which need an isolated kind cluster with the same locally - built `tracecore:ci` image preloaded so helm can install with - `image.pullPolicy=Never`. - -inputs: - cluster-name: - description: kind cluster name (must be unique per job; passed to both kind-action and `kind load`). - required: true - kind-config: - description: Optional path to a kind cluster config file (e.g. bench/install/kind-config.yaml for the multi-node upgrade fixture). - required: false - default: '' - -runs: - using: composite - steps: - - name: Install helm - uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 - with: - version: v3.16.4 - - name: Build tracecore image - shell: bash - run: | - docker build \ - -f install/kubernetes/tracecore/Dockerfile \ - -t tracecore:ci \ - . - - name: Create kind cluster - uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 - with: - version: v0.25.0 - node_image: kindest/node:v1.32.0 - cluster_name: ${{ inputs.cluster-name }} - config: ${{ inputs.kind-config }} - - name: Load image into kind - shell: bash - run: kind load docker-image tracecore:ci --name "${{ inputs.cluster-name }}" diff --git a/.github/workflows/chart.yml b/.github/workflows/chart.yml index 4dd4cd46..a7c26017 100644 --- a/.github/workflows/chart.yml +++ b/.github/workflows/chart.yml @@ -584,9 +584,10 @@ jobs: with: go-version-file: go.mod cache: true - - uses: ./.github/actions/kind-tracecore-up + - uses: ./.github/actions/kind-cluster-setup with: cluster-name: tracecore-m5b + install-servicemonitor-crd: 'true' - name: helm install + measure install-to-Ready id: install run: | @@ -682,10 +683,11 @@ jobs: with: go-version-file: go.mod cache: true - - uses: ./.github/actions/kind-tracecore-up + - uses: ./.github/actions/kind-cluster-setup with: cluster-name: tracecore-upgrade kind-config: bench/install/kind-config.yaml + install-servicemonitor-crd: 'true' - name: helm install (revision 1) — baseline values run: | set -eo pipefail diff --git a/.github/workflows/install-bench.yml b/.github/workflows/install-bench.yml index 16c6e8ee..2d4b7434 100644 --- a/.github/workflows/install-bench.yml +++ b/.github/workflows/install-bench.yml @@ -46,26 +46,16 @@ jobs: with: go-version-file: go.mod cache: true - - name: Install helm - uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + - uses: ./.github/actions/kind-cluster-setup with: - version: v3.16.4 - - name: Build tracecore image - run: | - docker build \ - -f install/kubernetes/tracecore/Dockerfile \ - -t tracecore:bench \ - . - - name: Create kind cluster - uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 - with: - version: v0.25.0 - node_image: kindest/node:v1.32.0 - cluster_name: tracecore-install-bench - config: bench/install/kind-config.yaml - - name: Load image into kind - run: | - kind load docker-image tracecore:bench --name tracecore-install-bench + cluster-name: tracecore-install-bench + kind-config: bench/install/kind-config.yaml + build-image: 'true' + image-tag: bench + # Production-preset render path needs the ServiceMonitor CRD + # available; install-bench renders the chart and exercises the + # full apply path. Same rationale as policy-matrix and chart. + install-servicemonitor-crd: 'true' - name: Run install bench env: GITHUB_RUNNER_LABEL: ubuntu-latest diff --git a/.github/workflows/policy-matrix.yml b/.github/workflows/policy-matrix.yml index 6e8d02e7..c1d8c926 100644 --- a/.github/workflows/policy-matrix.yml +++ b/.github/workflows/policy-matrix.yml @@ -82,43 +82,23 @@ jobs: values_file: install/kubernetes/tracecore/values-production.yaml steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Install helm - uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + - uses: ./.github/actions/kind-cluster-setup with: - version: v3.16.4 - - name: Create kind cluster - uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 - with: - version: v0.25.0 - node_image: kindest/node:v1.32.0 - cluster_name: tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }} + cluster-name: tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }} + # policy-matrix asserts admission via `helm install + # --dry-run=server`; the API server never pulls the image, + # so no local docker build is needed. + build-image: 'false' + # Chart's production preset flips serviceMonitor.enabled=true; + # without this CRD the dry-run exits 1 with "no matches for + # kind ServiceMonitor". Applied unconditionally across every + # matrix row (not just production) so a future default-values + # flip cannot silently re-break the gate (#494). + install-servicemonitor-crd: 'true' - name: Sanity — kubectl reaches the cluster run: | kubectl cluster-info kubectl version - - name: Install prometheus-operator ServiceMonitor CRD (issue #494) - # The production-preset values file flips `serviceMonitor.enabled=true`, - # which renders a `monitoring.coreos.com/v1 ServiceMonitor` resource. - # Kind does not ship that CRD, so `helm install --dry-run=server` - # exits 1 with "no matches for kind ServiceMonitor in version - # monitoring.coreos.com/v1" on every chart-touching PR (regression - # since #475). We install ONLY the ServiceMonitor CRD — the chart's - # production preset references no other monitoring.coreos.com kinds, - # and the slim CRD install (~700 lines of YAML) is cheaper than the - # full prometheus-operator bundle (~3MB) which would also pull - # Prometheus, Alertmanager, ThanosRuler, PodMonitor, Probe, and - # PrometheusRule kinds we do not exercise. Applied unconditionally - # across every matrix row (not just production) so a future - # default-values flip cannot silently re-break this gate. - # - # CRD ref pinned to a tagged release (v0.91.0, published - # 2026-05-05) per repo convention `KYVERNO_POLICIES_REF` / - # `GATEKEEPER_VERSION` in scripts/policy-matrix-smoke.sh — never - # track `main`. Bumping this pin is a reviewed code change. - run: | - kubectl apply -f \ - "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.91.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml" - kubectl wait --for=condition=established crd/servicemonitors.monitoring.coreos.com --timeout=60s - name: Smoke — install policy engine + helm dry-run tracecore chart env: POLICY_ENGINE: ${{ matrix.policy_engine }} @@ -177,16 +157,15 @@ jobs: - policy_engine: gatekeeper steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Install helm - uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 - with: - version: v3.16.4 - - name: Create kind cluster - uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + - uses: ./.github/actions/kind-cluster-setup with: - version: v0.25.0 - node_image: kindest/node:v1.32.0 - cluster_name: tracecore-policy-mutation-${{ matrix.policy_engine }} + cluster-name: tracecore-policy-mutation-${{ matrix.policy_engine }} + # Mutation job exercises `kubectl apply --dry-run=server` + # against a known-bad DaemonSet; the API server never pulls + # any image. No local docker build needed. + build-image: 'false' + # No ServiceMonitor CRD here: the mutation test applies a + # plain DaemonSet fixture, not the rendered chart. - name: Provision policy engine + restricted namespace env: POLICY_ENGINE: ${{ matrix.policy_engine }} diff --git a/Makefile b/Makefile index e32ba51e..d3359a8c 100644 --- a/Makefile +++ b/Makefile @@ -1,26 +1,12 @@ -# Build -.PHONY: help build clean hooks +# Sharded prerequisite lists — append-in-shard, not here. See make/README.md. +# `scripts/makefile-hotfile-check.sh` (make makefile-hotfile-check) enforces. +include make/phony.mk +include make/check.mk +include make/verify.mk +include make/ci-fast.mk +include make/ci-full.mk -# Test suites -.PHONY: test test-extras test-extras-sustained test-extras-fuzz test-extras-fuzz-kmsg test-extras-fuzz-journald test-extras-fuzz-nccl-fr test-extras-race bench bench-check bench-allocs-check bench-baseline bench-detectors bench-detectors-check bench-detectors-baseline bench-cv-report bench-e2e-steady-state helm-install-rolling-report - -# Format + tidy -.PHONY: fmt fmt-fix vet lint lint-fix lint-unused-module tidy tidy-check mod-verify bump-otel - -# Code generation -.PHONY: generate-fixtures generate-fixtures-check verdict-fixtures-check - -# Coverage -.PHONY: coverage coverage-check - -# Policy gates (each enforces a specific RFC-bound invariant) -.PHONY: license-check license-fix govulncheck dco-check ci-fuzz-nccl-fr nccl-fr-rce-gate register-lint actionlint zizmor doc-check doc-check-release no-autoupdate-check recipes-path-check anonymize-pod-evicted-fixture-check base-digest-check build-tags attribute-namespace-check deprecation-check rfc-status-check cut-criteria-status cut-criteria-status-all cut-criteria-render cut-criteria-check slo-rules-check test-flake-audit - -# Aggregate gates: pre-commit / pre-push / fast-CI / full-CI -.PHONY: check verify ci ci-fast ci-full - -# Release + integration -.PHONY: smoke smoke-quickstart validator-recipe +.PHONY: $(PHONY_TARGETS) BIN := ./_build/tracecore @@ -145,7 +131,18 @@ lint: ## Run golangci-lint. lint-fix: ## Run golangci-lint with --fix. go tool golangci-lint run --fix ./... -lint-unused-module: ## Run the `unused` lint check against module/ (the in-repo Go submodule). +lint-module-full: ## Run the full .golangci.yml linter set against module/ (the in-repo Go submodule). Wider than lint-unused-module — gates every linter declared in .golangci.yml. + @# Workspace mode resolves `./...` only inside the current module, so + @# `make lint` from root NEVER reaches module/. PR #486 enabled the + @# `unused` linter via lint-unused-module; this target promotes that + @# narrow gate to the full linter set declared in .golangci.yml, + @# eliminating the divergence between what `make lint` enforces at + @# the root and what module/ tolerates. The repo .golangci.yml is + @# applied directly (no `--no-config`), so adding/removing a linter + @# in .golangci.yml automatically extends/retracts this gate too. + cd module && go tool golangci-lint run ./... + +lint-unused-module: ## Run the `unused` lint check against module/ (the in-repo Go submodule). Subset of lint-module-full retained for fast-feedback iteration during dead-code sweeps. @# Workspace mode resolves `./...` only inside the current module, so @# `make lint` from root NEVER reaches module/ — the in-repo submodule @# is a separate Go module under go.work. The `unused` linter is @@ -286,7 +283,7 @@ coverage: ## Run all tests under the race detector with coverage profiling; emi coverage-check: coverage ## Fail if any internal/* package <70% or components/* package <60%. @go run ./tools/coverage-check -profile=coverage.out -check: fmt tidy-check lint lint-unused-module vet mod-verify attribute-namespace-check ## Pre-commit gate. Fast (<10s); no test, no build. Catches typos, format, deps drift. +check: $(CHECK_DEPS) ## Pre-commit gate. Fast (<10s); no test, no build. Catches typos, format, deps drift. Members declared in make/check.mk. attribute-namespace-check: ## Binding drift gate for the customer-stable attribute namespace declared in docs/ATTRIBUTES.md (v1.0-rc1 cut criterion 3). Fails on any attribute literal in code that is missing from ATTRIBUTES.md. @bash scripts/attribute-namespace-check.sh @@ -314,7 +311,7 @@ cut-criteria-check: ## Drift gate: every per-milestone rendered markdown must m @# this gate. Override with MILESTONE=v1.0-rc1 (etc.) to scope. @python3 scripts/cut_criteria.py check $(if $(MILESTONE),--milestone $(MILESTONE),) -verify: check license-check generate-fixtures-check verdict-fixtures-check build-tags nccl-fr-rce-gate register-lint actionlint zizmor doc-check deprecation-check no-autoupdate-check anonymize-pod-evicted-fixture-check test-flake-audit ## Pre-push gate. Medium (<30s); CI handles heavy gates (test, coverage, govulncheck, fuzz, build). +verify: $(VERIFY_DEPS) ## Pre-push gate. Medium (<30s); CI handles heavy gates (test, coverage, govulncheck, fuzz, build). Members declared in make/verify.mk. test-extras-sustained: ## (sub-target) sustained-load (5 min); see `make test-extras`. @# kernelevents was the sole sustained-load suite consumer; deleted @@ -396,6 +393,9 @@ base-digest-check: ## Compare the Dockerfile pin for gcr.io/distroless/static-d register-lint: ## Verify `func Register*` symbols live only under components/** (or an explicit allowlist). Enforces STRATEGY.md §"Each component owns its own Factory var". @scripts/register-lint.sh +makefile-hotfile-check: ## Enforce the make/ shard convention (see make/README.md). Fails if the root Makefile re-inlines aggregate prereq lists or .PHONY tokens that belong in make/*.mk shards. + @bash scripts/makefile-hotfile-check.sh --strict + actionlint: ## Lint GitHub Actions workflow files (syntax + embedded shellcheck for run: blocks). @if ! command -v shellcheck >/dev/null 2>&1; then \ echo "WARNING: shellcheck not on PATH; actionlint will skip run-block shellcheck."; \ @@ -408,6 +408,9 @@ no-autoupdate-check: ## Enforce RFC-0008: cmd/, components/, internal/, pkg/ co @scripts/no-autoupdate-check.sh @scripts/no-autoupdate-check_test.sh +zizmor: ## Security-lint GitHub Actions workflows (template injection, untrusted-input-in-script, over-broad permissions, cache poisoning). Gates at --min-severity=high. + @scripts/zizmor.sh + recipes-path-check: ## Issue #427 convention gate: assert PR titles / commit subjects don't reference an aspirational `recipes/pattern-N/` directory. Runs the gate's own regression test against a fixture suite of accept/reject subject shapes. @scripts/recipes-path-check_test.sh @@ -419,10 +422,7 @@ anonymize-pod-evicted-fixture-check: ## M19 carry-forward #1: verify every oper done @bash scripts/anonymize-pod-evicted-fixture_test.sh -zizmor: ## Security-lint GitHub Actions workflows (template injection, untrusted-input-in-script, over-broad permissions, cache poisoning). Gates at --min-severity=high. - @scripts/zizmor.sh - -ci-fast: lint vet mod-verify attribute-namespace-check doc-check recipes-path-check test-flake-audit ## Fast-feedback CI subset. <60s on a dev laptop; the gates that catch the most defects per second. Run on every save / pre-commit; not a substitute for `ci-full`. See PRINCIPLES.md §10. +ci-fast: $(CI_FAST_DEPS) ## Fast-feedback CI subset. <60s on a dev laptop; the gates that catch the most defects per second. Run on every save / pre-commit; not a substitute for `ci-full`. See PRINCIPLES.md §10. Members declared in make/ci-fast.mk. # `ci-full` is the full superset CI enforces on every PR. `ci` is kept as a # back-compat alias so existing scripts, docs, and hooks invoking `make ci` @@ -436,7 +436,7 @@ ci-fast: lint vet mod-verify attribute-namespace-check doc-check recipes-path-ch # the ratchet path. We accept the wall-time hit because local `ci-full` # divergence from CI surfaces ceiling breaches only post-PR-open, which # defeats the per-PR enforcement intent of #302. -ci-full: license-check generate-fixtures-check verdict-fixtures-check vet build-tags tidy-check mod-verify lint nccl-fr-rce-gate register-lint actionlint zizmor coverage-check ci-fuzz-nccl-fr govulncheck doc-check deprecation-check no-autoupdate-check recipes-path-check anonymize-pod-evicted-fixture-check test-flake-audit pre-push-test build smoke smoke-quickstart bench-allocs-check ## Everything CI runs. Run before opening a PR. +ci-full: $(CI_FULL_DEPS) ## Everything CI runs. Run before opening a PR. Members declared in make/ci-full.mk. pre-push-test: ## Regression harness for .githooks/pre-push path-filter routing. Cheap (<1s); runs the hook in dry-run mode against synthetic diff ranges and asserts each gate fires only when its source paths change. @bash scripts/pre-push-test.sh diff --git a/make/README.md b/make/README.md new file mode 100644 index 00000000..8f943a88 --- /dev/null +++ b/make/README.md @@ -0,0 +1,43 @@ +# `make/` — sharded Makefile feature lists + +This directory exists to kill **cascade-rebases on the root `Makefile`**. + +## Problem + +The root `Makefile` carries four monolithic lists: + +1. `.PHONY: ...` (per category) +2. `verify:` prerequisite list (pre-push gate) +3. `ci-fast:` prerequisite list (sub-60s feedback gate) +4. `ci-full:` prerequisite list (full CI superset) + +Every new gate adds one token to each list. Two open PRs touching the +same line produce a 3-way merge conflict that requires manual fix-up, +which is the dominant source of cascade-rebases on this repo. + +## Convention + +When adding a new gate / target: + +1. **DO NOT edit `Makefile` itself.** It is treated as a hot-file. + The pre-push hook will reject Makefile edits that mutate these + lists (`scripts/makefile-hotfile-check.sh`). +2. **Add to the feature's shard** under `make/`: + - `make/phony.mk` — append your target name to `PHONY_TARGETS`. + - `make/verify.mk` — append to `VERIFY_DEPS` if part of pre-push. + - `make/ci-fast.mk` — append to `CI_FAST_DEPS` if <2s and high-yield. + - `make/ci-full.mk` — append to `CI_FULL_DEPS` (most new gates land here). +3. Keep the recipe body in `Makefile` itself (one logical location). + Only the aggregate-list memberships move to shards. + +## Why `+=` not `:=`? + +`+=` is the appendable assignment. Every shard line stands on its own; +adding a new gate touches exactly one line in one shard. There is no +shared list for two PRs to fight over. + +## Why this isn't just `:= ... \` continuation? + +Backslash-continuation across many lines is the *same* hot-file shape +as a long single line — every edit touches the trailing-line position. +Append-from-many-shards eliminates the textual coupling entirely. diff --git a/make/check.mk b/make/check.mk new file mode 100644 index 00000000..a520acc8 --- /dev/null +++ b/make/check.mk @@ -0,0 +1,21 @@ +# make/check.mk — pre-commit gate (fast; ~10s). +# +# Append your target to CHECK_DEPS to gate it on every commit. +# Criterion for inclusion: <2s wall-time, deterministic, no network, +# catches a defect class that would otherwise reach CI. +# +# See make/README.md for the convention. + +# Format / lint / tidy +CHECK_DEPS += fmt +CHECK_DEPS += tidy-check +CHECK_DEPS += lint +# lint-module-full enforces the full .golangci.yml linter set against +# the in-repo submodule under module/ so the root + submodule lint +# surface match (supersedes the unused-only gate that landed earlier). +CHECK_DEPS += lint-module-full +CHECK_DEPS += vet +CHECK_DEPS += mod-verify + +# Per-feature attribute / spec drift gates +CHECK_DEPS += attribute-namespace-check diff --git a/make/ci-fast.mk b/make/ci-fast.mk new file mode 100644 index 00000000..6f8273eb --- /dev/null +++ b/make/ci-fast.mk @@ -0,0 +1,33 @@ +# make/ci-fast.mk — fast-feedback CI subset (<60s on a dev laptop). +# +# Append your target to CI_FAST_DEPS only if BOTH: +# 1. wall-time <2s on a dev laptop, AND +# 2. catches a high-yield defect class per second of cost. +# +# PRINCIPLES.md §10 — keep the inner loop honest. Most new gates belong +# in ci-full.mk, not here. +# +# See make/README.md for the convention. + +# Static analysis +CI_FAST_DEPS += lint +CI_FAST_DEPS += vet +CI_FAST_DEPS += mod-verify + +# Per-feature attribute / spec drift gates +CI_FAST_DEPS += attribute-namespace-check + +# Doc parity +CI_FAST_DEPS += doc-check + +# Recipe convention gate (issue #427) +CI_FAST_DEPS += recipes-path-check + +# CI flake hygiene +CI_FAST_DEPS += test-flake-audit + +# Repo hot-file convention enforcement +# Fails the gate if a PR re-inlines prereq tokens into the root Makefile +# instead of appending to the relevant make/*.mk shard (cascade-rebase +# tripwire; see make/README.md). +CI_FAST_DEPS += makefile-hotfile-check diff --git a/make/ci-full.mk b/make/ci-full.mk new file mode 100644 index 00000000..ec97c8dd --- /dev/null +++ b/make/ci-full.mk @@ -0,0 +1,60 @@ +# make/ci-full.mk — full-CI superset (everything CI runs on every PR). +# +# Append your target to CI_FULL_DEPS. Most new gates land here. +# +# See make/README.md for the convention. + +# License + fixture drift +CI_FULL_DEPS += license-check +CI_FULL_DEPS += generate-fixtures-check +CI_FULL_DEPS += verdict-fixtures-check + +# Static analysis +CI_FULL_DEPS += vet +CI_FULL_DEPS += build-tags +CI_FULL_DEPS += tidy-check +CI_FULL_DEPS += mod-verify +CI_FULL_DEPS += lint + +# Security / RFC-bound gates +CI_FULL_DEPS += nccl-fr-rce-gate +CI_FULL_DEPS += register-lint +CI_FULL_DEPS += no-autoupdate-check + +# Workflow + YAML hygiene +CI_FULL_DEPS += actionlint +CI_FULL_DEPS += zizmor + +# Test + coverage + fuzz +CI_FULL_DEPS += coverage-check +CI_FULL_DEPS += ci-fuzz-nccl-fr +CI_FULL_DEPS += govulncheck + +# Doc parity +CI_FULL_DEPS += doc-check +CI_FULL_DEPS += deprecation-check + +# Recipe convention gate (issue #427) +CI_FULL_DEPS += recipes-path-check + +# Replay fixture PII anonymizer (M19 #1) +CI_FULL_DEPS += anonymize-pod-evicted-fixture-check + +# CI flake hygiene +CI_FULL_DEPS += test-flake-audit +CI_FULL_DEPS += pre-push-test + +# Repo hot-file convention enforcement +CI_FULL_DEPS += makefile-hotfile-check + +# Release-shape: build the binary + smoke + Quickstart parity +CI_FULL_DEPS += build +CI_FULL_DEPS += smoke +CI_FULL_DEPS += smoke-quickstart + +# Bench ceiling +# `bench-allocs-check` is included so the local pre-PR gate matches what +# CI enforces in `verify-static`. Adds ~80s on an M1 Max so wall-time +# budgets ~5m rather than the historical ~2.5m; see PRINCIPLES.md §10 +# and follow-up issue #424 for the ratchet path. +CI_FULL_DEPS += bench-allocs-check diff --git a/make/phony.mk b/make/phony.mk new file mode 100644 index 00000000..add4206a --- /dev/null +++ b/make/phony.mk @@ -0,0 +1,46 @@ +# make/phony.mk — sharded .PHONY collector. +# +# Append your target names to PHONY_TARGETS in the relevant feature +# block. Do NOT edit the catch-all `.PHONY: $(PHONY_TARGETS)` line in +# Makefile — adding a target only requires one new `+=` line below. +# +# See make/README.md for the convention. + +# Build / lifecycle +PHONY_TARGETS += help build clean hooks + +# Test suites +PHONY_TARGETS += test test-extras test-extras-sustained +PHONY_TARGETS += test-extras-fuzz test-extras-fuzz-kmsg test-extras-fuzz-journald +PHONY_TARGETS += test-extras-fuzz-nccl-fr test-extras-race +PHONY_TARGETS += bench bench-check bench-allocs-check bench-baseline +PHONY_TARGETS += bench-detectors bench-detectors-check bench-detectors-baseline +PHONY_TARGETS += bench-cv-report bench-e2e-steady-state helm-install-rolling-report + +# Format / lint / tidy +PHONY_TARGETS += fmt fmt-fix vet lint lint-fix lint-unused-module lint-module-full +PHONY_TARGETS += tidy tidy-check mod-verify bump-otel + +# Code generation +PHONY_TARGETS += generate-fixtures generate-fixtures-check verdict-fixtures-check + +# Coverage +PHONY_TARGETS += coverage coverage-check + +# Policy gates (each enforces a specific RFC-bound invariant) +PHONY_TARGETS += license-check license-fix govulncheck dco-check +PHONY_TARGETS += ci-fuzz-nccl-fr nccl-fr-rce-gate +PHONY_TARGETS += register-lint actionlint zizmor +PHONY_TARGETS += doc-check doc-check-release no-autoupdate-check base-digest-check +PHONY_TARGETS += recipes-path-check +PHONY_TARGETS += anonymize-pod-evicted-fixture-check +PHONY_TARGETS += build-tags attribute-namespace-check deprecation-check +PHONY_TARGETS += rfc-status-check +PHONY_TARGETS += cut-criteria-status cut-criteria-status-all cut-criteria-render cut-criteria-check +PHONY_TARGETS += slo-rules-check test-flake-audit makefile-hotfile-check + +# Aggregate gates: pre-commit / pre-push / fast-CI / full-CI +PHONY_TARGETS += check verify ci ci-fast ci-full pre-push-test + +# Release + integration +PHONY_TARGETS += smoke smoke-quickstart validator-recipe diff --git a/make/verify.mk b/make/verify.mk new file mode 100644 index 00000000..e21ae375 --- /dev/null +++ b/make/verify.mk @@ -0,0 +1,38 @@ +# make/verify.mk — pre-push gate (medium; <30s). +# +# Append your target to VERIFY_DEPS to gate it on every push. Criterion +# for inclusion: catches a defect class CI would otherwise have to +# reject the PR for. Heavy gates (test, coverage, govulncheck, fuzz, +# build) live in ci-full.mk and run only in CI. +# +# See make/README.md for the convention. + +# Pre-commit superset +VERIFY_DEPS += check + +# License + fixture drift +VERIFY_DEPS += license-check +VERIFY_DEPS += generate-fixtures-check +VERIFY_DEPS += verdict-fixtures-check + +# Build-tag variants +VERIFY_DEPS += build-tags + +# Security / RFC-bound gates +VERIFY_DEPS += nccl-fr-rce-gate +VERIFY_DEPS += register-lint +VERIFY_DEPS += no-autoupdate-check + +# Workflow + YAML hygiene +VERIFY_DEPS += actionlint +VERIFY_DEPS += zizmor + +# Doc parity +VERIFY_DEPS += doc-check +VERIFY_DEPS += deprecation-check + +# Replay fixture PII anonymizer (M19 #1) +VERIFY_DEPS += anonymize-pod-evicted-fixture-check + +# CI flake hygiene +VERIFY_DEPS += test-flake-audit diff --git a/module/pkg/patterns/checkpointer_hang_test.go b/module/pkg/patterns/checkpointer_hang_test.go index 52eabeaa..d39e4268 100644 --- a/module/pkg/patterns/checkpointer_hang_test.go +++ b/module/pkg/patterns/checkpointer_hang_test.go @@ -6,7 +6,6 @@ import ( "encoding/json" "os" "path/filepath" - "regexp" "testing" "time" @@ -81,8 +80,8 @@ func TestCheckpointerHangDetector_PositiveStallDuringWriteWithPressure(t *testin require.Equal(t, int64(600), v.StallSeconds) require.Equal(t, "/lustre/checkpoints/run42/step1000", v.CheckpointPath) - require.Regexp(t, regexp.MustCompile(`(?i)checkpoint|stall`), v.Headline) - require.Regexp(t, regexp.MustCompile(`(?i)storage|backend|async|frequency`), v.Remediation) + require.Regexp(t, `(?i)checkpoint|stall`, v.Headline) + require.Regexp(t, `(?i)storage|backend|async|frequency`, v.Remediation) require.Len(t, v.EvidenceTrail, 3, "checkpoint + stall + pressure") require.Equal(t, patterns.EvidenceKindCheckpointPhase, v.EvidenceTrail[0].Kind) diff --git a/module/pkg/patterns/cuda_oom.go b/module/pkg/patterns/cuda_oom.go index 01b103c1..126994c5 100644 --- a/module/pkg/patterns/cuda_oom.go +++ b/module/pkg/patterns/cuda_oom.go @@ -383,7 +383,7 @@ func cudaOOMEvidenceDescription(oom CUDAOOMLogRecord) string { if oom.TriedAllocBytes > 0 { return fmt.Sprintf("CUDA out of memory on GPU %s (tried to allocate %d bytes)", oom.GPUID, oom.TriedAllocBytes) } - return fmt.Sprintf("CUDA out of memory on GPU %s", oom.GPUID) + return "CUDA out of memory on GPU " + oom.GPUID } // fbEvidenceUID synthesizes a stable identifier for the FB evidence diff --git a/module/pkg/patterns/cuda_oom_test.go b/module/pkg/patterns/cuda_oom_test.go index b2d79d68..3e9b56d4 100644 --- a/module/pkg/patterns/cuda_oom_test.go +++ b/module/pkg/patterns/cuda_oom_test.go @@ -6,7 +6,6 @@ import ( "encoding/json" "os" "path/filepath" - "regexp" "testing" "time" @@ -80,8 +79,8 @@ func TestCUDAOOMDetector_PositiveFragmentation(t *testing.T) { require.Equal(t, int64(16*1024*1024*1024), v.FBFreeBytes) require.Equal(t, int64(2*1024*1024*1024), v.TriedAllocBytes) - require.Regexp(t, regexp.MustCompile(`(?i)CUDA|OOM|fragment`), v.Headline) - require.Regexp(t, regexp.MustCompile(`(?i)max_split_size_mb|empty_cache`), v.Remediation, + require.Regexp(t, `(?i)CUDA|OOM|fragment`, v.Headline) + require.Regexp(t, `(?i)max_split_size_mb|empty_cache`, v.Remediation, "fragmentation remediation must point at allocator knobs") require.Len(t, v.EvidenceTrail, 2, "FB sample + OOM log") @@ -121,7 +120,7 @@ func TestCUDAOOMDetector_PositiveTrueOOM(t *testing.T) { v := verdicts[0] require.Equal(t, patterns.CUDAOOMKindTrueOOM, v.Kind) require.InDelta(t, 0.025, v.FBFreeRatio, 1e-9, "2/80 = 0.025, below default 0.05") - require.Regexp(t, regexp.MustCompile(`(?i)batch|shard|checkpoint`), v.Remediation, + require.Regexp(t, `(?i)batch|shard|checkpoint`, v.Remediation, "true-OOM remediation must point at workload-level fixes") } diff --git a/module/pkg/patterns/hbm_ecc_test.go b/module/pkg/patterns/hbm_ecc_test.go index 6b0a20ed..f36d3b4b 100644 --- a/module/pkg/patterns/hbm_ecc_test.go +++ b/module/pkg/patterns/hbm_ecc_test.go @@ -65,7 +65,7 @@ func TestHBMECCDetector_PositiveDeltaThenXid48(t *testing.T) { headlineRE := regexp.MustCompile(`HBM.*ECC.*PCI:0000:3b:00.*Xid 48`) require.Regexp(t, headlineRE, v.Headline, "headline must name HBM ECC, the GPU id, and the Xid code") - require.Regexp(t, regexp.MustCompile(`drain|RMA|replace`), v.Remediation) + require.Regexp(t, `drain|RMA|replace`, v.Remediation) require.Len(t, v.EvidenceTrail, 2, "two layers: hw_error (ECC) + kernel_event (Xid)") require.Equal(t, patterns.EvidenceKindHBM, v.EvidenceTrail[0].Kind, diff --git a/module/pkg/patterns/ib_link_flap.go b/module/pkg/patterns/ib_link_flap.go index 5e41dab8..1513a838 100644 --- a/module/pkg/patterns/ib_link_flap.go +++ b/module/pkg/patterns/ib_link_flap.go @@ -302,7 +302,7 @@ func ibPortStateEvidenceDescription(bucket []IBPortStateRecord) string { func ibLinkFlapNCCLEvidenceDescription(cohort NCCLFRRecord) string { op := cohort.ProfilingName if op == "" { - op = "nccl:collective" + op = fallbackCollectiveOp } return fmt.Sprintf( "NCCL collective %s stuck on node %s (pg=%d, collective_seq_id=%d, state=%s)", diff --git a/module/pkg/patterns/model.go b/module/pkg/patterns/model.go index 96638f3b..5249bd95 100644 --- a/module/pkg/patterns/model.go +++ b/module/pkg/patterns/model.go @@ -148,6 +148,10 @@ const ( PressureDisk NodePressureKind = "disk" PressurePID NodePressureKind = "pid" PressureNotReady NodePressureKind = "notready" + // PressureUnknown is the catch-all bucket for evictions whose + // upstream signal does not match any of the kubelet vocabulary + // anchors above (descheduler, custom controllers, partial notes). + PressureUnknown NodePressureKind = "unknown" ) // Event Type values mirror the upstream events.k8s.io/v1 Event.Type diff --git a/module/pkg/patterns/nccl_bootstrap.go b/module/pkg/patterns/nccl_bootstrap.go index a5b3e2ab..102434fa 100644 --- a/module/pkg/patterns/nccl_bootstrap.go +++ b/module/pkg/patterns/nccl_bootstrap.go @@ -257,6 +257,14 @@ type NCCLBootstrapDetector struct { // // Inputs are read-only snapshots; the detector does not mutate any // slice. +// +// Complexity rationale: a pattern-match dispatch on N record-shape +// branches; each branch is single-statement and the total complexity +// (21) reflects the number of fields tested, not nested logic. Splitting +// per-branch would scatter the join sequence across helper funcs and +// obscure the detection ordering documented in the godoc above. +// +//nolint:gocyclo // See "Complexity rationale" in the godoc above. func (d NCCLBootstrapDetector) Evaluate( pods []TrainingPodRecord, ncclRecs []NCCLFRRecord, diff --git a/module/pkg/patterns/nccl_bootstrap_test.go b/module/pkg/patterns/nccl_bootstrap_test.go index f8f5d3f2..e5258a50 100644 --- a/module/pkg/patterns/nccl_bootstrap_test.go +++ b/module/pkg/patterns/nccl_bootstrap_test.go @@ -6,7 +6,6 @@ import ( "encoding/json" "os" "path/filepath" - "regexp" "testing" "time" @@ -81,8 +80,8 @@ func TestNCCLBootstrapDetector_FullCorrelationFires(t *testing.T) { require.Equal(t, patterns.EvidenceKindTrainingPod, v.EvidenceTrail[0].Kind) require.Equal(t, patterns.EvidenceKindCNINetworkEvent, v.EvidenceTrail[1].Kind) - require.Regexp(t, regexp.MustCompile(`(?i)NCCL|bootstrap`), v.Headline) - require.Regexp(t, regexp.MustCompile(`(?i)NCCL_SOCKET_IFNAME|init-container|NetworkAttachmentDefinition`), v.Remediation, + require.Regexp(t, `(?i)NCCL|bootstrap`, v.Headline) + require.Regexp(t, `(?i)NCCL_SOCKET_IFNAME|init-container|NetworkAttachmentDefinition`, v.Remediation, "remediation must point at the spec's three remediation hooks") } diff --git a/module/pkg/patterns/nccl_hang.go b/module/pkg/patterns/nccl_hang.go index 2269bd77..77c9f0c6 100644 --- a/module/pkg/patterns/nccl_hang.go +++ b/module/pkg/patterns/nccl_hang.go @@ -136,6 +136,15 @@ type nccllatestKey struct { // uses strconv-based byte builders instead of fmt.Sprintf — Sprintf's // interface-boxing + format-scan overhead was ~70% of the per-event // alloc count under the bench/detectors fixture. +// +// Complexity rationale: a linear pass over records with per-rank +// state-machine branches; complexity (18) reflects the NCCL FR state +// vocabulary (collective_done / collective_started / collective_pending / +// rank_idle / rank_unscheduled / process_died) being explicitly +// enumerated. Refactoring into helpers would split the per-rank +// timeline reasoning across files. +// +//nolint:gocyclo // See "Complexity rationale" in the godoc above. func (d NCCLHangDetector) Evaluate(records []NCCLFRRecord) []NCCLHangVerdict { threshold := d.HangThreshold if threshold <= 0 { @@ -262,13 +271,19 @@ func ncclHangEvidenceUID(r NCCLFRRecord) string { return string(buf) } +// fallbackCollectiveOp is the operator-name placeholder used in the +// NCCL hang / IB link-flap one-liners when the upstream FR record +// carries no `profiling_name` (older NCCL builds; partial records). +// Hoisted from string literals to a constant per goconst lint. +const fallbackCollectiveOp = "nccl:collective" + // ncclHangHeadline renders the operator-facing one-liner. strconv-based // to avoid fmt.Sprintf's interface-boxing alloc per call (was ~25% of // the per-cohort alloc count). func ncclHangHeadline(r NCCLFRRecord, n int) string { op := r.ProfilingName if op == "" { - op = "nccl:collective" + op = fallbackCollectiveOp } // Format: "%d ranks hung on collective %d (%s) in pg %d" buf := make([]byte, 0, 64+len(op)) @@ -312,7 +327,7 @@ func ncclHangRemediation(r NCCLFRRecord, ranks []int64) string { func ncclHangEvidenceDescription(r NCCLFRRecord, startedAt time.Time) string { op := r.ProfilingName if op == "" { - op = "nccl:collective" + op = fallbackCollectiveOp } // Format: "Rank %d stuck on %s (collective_seq_id=%d, state=%s) since %s" // where %s on the timestamp is time.RFC3339 ("2006-01-02T15:04:05Z07:00"). diff --git a/module/pkg/patterns/nccl_hang_test.go b/module/pkg/patterns/nccl_hang_test.go index d8506fea..1ba6a58e 100644 --- a/module/pkg/patterns/nccl_hang_test.go +++ b/module/pkg/patterns/nccl_hang_test.go @@ -50,7 +50,7 @@ func TestNCCLHangDetector_PositiveThreeRanksStuck(t *testing.T) { headlineRE := regexp.MustCompile(`3 ranks hung on collective 42 \(nccl:all_reduce\)`) require.Regexp(t, headlineRE, v.Headline) - require.Regexp(t, regexp.MustCompile(`py-spy|inspect`), v.Remediation) + require.Regexp(t, `py-spy|inspect`, v.Remediation) require.Len(t, v.EvidenceTrail, 3, "one evidence ref per hanging rank") require.Equal(t, patterns.EvidenceKindNCCLFR, v.EvidenceTrail[0].Kind) } diff --git a/module/pkg/patterns/pcie_aer_test.go b/module/pkg/patterns/pcie_aer_test.go index f1c9143e..2a277e46 100644 --- a/module/pkg/patterns/pcie_aer_test.go +++ b/module/pkg/patterns/pcie_aer_test.go @@ -69,7 +69,7 @@ func TestPCIeAERDetector_PositiveAERThenRateCollapse(t *testing.T) { headlineRE := regexp.MustCompile(`PCIe|AER|degraded`) require.Regexp(t, headlineRE, v.Headline) - require.Regexp(t, regexp.MustCompile(`lspci|drain|RMA|maintenance`), v.Remediation) + require.Regexp(t, `lspci|drain|RMA|maintenance`, v.Remediation) require.Len(t, v.EvidenceTrail, 2, "two evidence layers: pcie_aer + hw_io_collapse") require.Equal(t, patterns.EvidenceKindAER, v.EvidenceTrail[0].Kind, "AER message first (causal order)") diff --git a/module/pkg/patterns/pod_evicted.go b/module/pkg/patterns/pod_evicted.go index 0fcade0d..3110d52c 100644 --- a/module/pkg/patterns/pod_evicted.go +++ b/module/pkg/patterns/pod_evicted.go @@ -408,7 +408,7 @@ func pressureFromNote(note string) NodePressureKind { } } } - return "unknown" + return PressureUnknown } // containsFold reports whether substr occurs in s under ASCII @@ -501,6 +501,9 @@ func remediationFor(p NodePressureKind) string { return "Cap fork rate (training-process spawn) or raise the node pid limit." case PressureNotReady: return "Inspect kubelet health, network plumbing, and CSI mounts on the node." + case PressureUnknown: + // Explicit case for exhaustiveness; same body as the catch-all. + return "Inspect kubelet eviction logs on the node; the pressure root was not captured in the join window." } return "Inspect kubelet eviction logs on the node; the pressure root was not captured in the join window." } diff --git a/module/pkg/patterns/silent_data_corruption.go b/module/pkg/patterns/silent_data_corruption.go index 4058a5d0..052653b2 100644 --- a/module/pkg/patterns/silent_data_corruption.go +++ b/module/pkg/patterns/silent_data_corruption.go @@ -249,6 +249,13 @@ type SilentDataCorruptionDetector struct { // // Inputs are read-only snapshots; the detector does not mutate either // slice. Order of inputs is not assumed. +// +// Complexity rationale: confidence dispatch enumerates the +// {accuracy_drop, sdc_counter_delta} cross-product per the godoc above; +// complexity (16) reflects that table. Refactoring would split the +// confidence rules out of band from this documentation. +// +//nolint:gocyclo // See "Complexity rationale" in the godoc above. func (d SilentDataCorruptionDetector) Evaluate(evals []EvalAccuracyRecord, sdcs []SDCCounterRecord) []SilentDataCorruptionVerdict { threshold := d.AccuracyDropThreshold if threshold <= 0 { diff --git a/module/pkg/patterns/silent_data_corruption_test.go b/module/pkg/patterns/silent_data_corruption_test.go index 30ccbe35..3f7e92cf 100644 --- a/module/pkg/patterns/silent_data_corruption_test.go +++ b/module/pkg/patterns/silent_data_corruption_test.go @@ -6,7 +6,6 @@ import ( "encoding/json" "os" "path/filepath" - "regexp" "testing" "time" @@ -82,8 +81,8 @@ func TestSDCDetector_PositiveVendorSignaled(t *testing.T) { require.Equal(t, int64(1), v.SDCCounterDelta) require.Empty(t, v.MissingLayers) - require.Regexp(t, regexp.MustCompile(`(?i)eval|accuracy|SDC`), v.Headline) - require.Regexp(t, regexp.MustCompile(`(?i)re-run|different hardware`), v.Remediation, + require.Regexp(t, `(?i)eval|accuracy|SDC`, v.Headline) + require.Regexp(t, `(?i)re-run|different hardware`, v.Remediation, "vendor_signaled remediation must route operator to a hardware re-run") require.Len(t, v.EvidenceTrail, 2, "SDC counter + eval accuracy") @@ -612,6 +611,12 @@ func TestSDCVerdict_SchemaConformance(t *testing.T) { // TestSDCVerdict_SchemaRejectsDrift is the drift-rejection battery. // Each row is a falsifier for one schema constraint; removing the // constraint flips the row to PASS. +// +// The repeated `rejected` / `maybe` / `full` literals are the test's +// input vocabulary — extracting them to package-level constants would +// obscure which row mutates which schema axis. +// +//nolint:goconst // See "input vocabulary" note above. func TestSDCVerdict_SchemaRejectsDrift(t *testing.T) { t.Parallel() diff --git a/module/pkg/patterns/thermal_throttle.go b/module/pkg/patterns/thermal_throttle.go index 3aff4748..f6674b1b 100644 --- a/module/pkg/patterns/thermal_throttle.go +++ b/module/pkg/patterns/thermal_throttle.go @@ -300,7 +300,7 @@ func buildThermalThrottleVerdict( func windowedSum(bucket []ThermalThrottleRecord, floor, anchor time.Time) (time.Duration, ThermalThrottleRecord, bool) { var sum time.Duration var latest ThermalThrottleRecord - any := false + found := false for _, r := range bucket { if r.Timestamp.Before(floor) { continue @@ -310,9 +310,9 @@ func windowedSum(bucket []ThermalThrottleRecord, floor, anchor time.Time) (time. } sum += r.ThrottleDelta latest = r - any = true + found = true } - return sum, latest, any + return sum, latest, found } // thermalThrottleEvidenceUID synthesizes a stable identifier for the diff --git a/module/pkg/patterns/thermal_throttle_test.go b/module/pkg/patterns/thermal_throttle_test.go index 3a3cf915..725a70cd 100644 --- a/module/pkg/patterns/thermal_throttle_test.go +++ b/module/pkg/patterns/thermal_throttle_test.go @@ -53,7 +53,7 @@ func TestThermalThrottleDetector_PositiveCascade(t *testing.T) { headlineRE := regexp.MustCompile(`(?i)thermal.*throttl.*gpu-node-0001`) require.Regexp(t, headlineRE, v.Headline, "headline must name thermal throttling and the node") - require.Regexp(t, regexp.MustCompile(`airflow|HVAC|cooling|facilities`), v.Remediation) + require.Regexp(t, `airflow|HVAC|cooling|facilities`, v.Remediation) require.GreaterOrEqual(t, len(v.EvidenceTrail), 4, "one evidence entry per throttling GPU in the cascade") @@ -387,6 +387,12 @@ func TestThermalThrottleVerdict_SchemaConformance(t *testing.T) { // the constraint flips the row to PASS. A schema with no drift tests // is a schema that documents nothing — every constraint in // thermal_throttle_verdict.schema.json has a falsifier here. +// +// The repeated `full` literals are the test's input vocabulary — +// extracting them to a package-level constant would obscure which row +// mutates which schema axis. +// +//nolint:goconst // See "input vocabulary" note above. func TestThermalThrottleVerdict_SchemaRejectsDrift(t *testing.T) { t.Parallel() diff --git a/module/pkg/patterns/xid_correlation.go b/module/pkg/patterns/xid_correlation.go index eabc6024..01f338f2 100644 --- a/module/pkg/patterns/xid_correlation.go +++ b/module/pkg/patterns/xid_correlation.go @@ -295,13 +295,14 @@ func buildXidCorrelationVerdict(xid XidRecord, ev Record) XidCorrelationVerdict // pod ("ns/name") podStart := len(composite) - if podHasSep { + switch { + case podHasSep: composite = append(composite, podNs...) composite = append(composite, '/') composite = append(composite, podName...) - } else if podFallbackUnknown { + case podFallbackUnknown: composite = append(composite, ""...) - } else { + default: composite = append(composite, podName...) } podEnd := len(composite) @@ -415,7 +416,11 @@ func stringSlice(composite []byte, lo, hi int) string { if lo == hi { return "" } - return unsafe.String(&composite[lo], hi-lo) + // SECURITY (G103): aliasing-string carve over a caller-owned []byte. + // The buffer (`composite`) is finalised before any carve in + // buildXidCorrelationVerdict — see the function-level invariant + // above. Audited as required by gosec G103. + return unsafe.String(&composite[lo], hi-lo) //nolint:gosec // G103: audited aliasing string carve; buffer is finalised before carve. } // xidEvidenceUID synthesizes a stable identifier for the Xid diff --git a/module/pkg/patterns/xid_correlation_test.go b/module/pkg/patterns/xid_correlation_test.go index d31bb320..7144a44d 100644 --- a/module/pkg/patterns/xid_correlation_test.go +++ b/module/pkg/patterns/xid_correlation_test.go @@ -64,7 +64,7 @@ func TestXidCorrelationDetector_PositiveXid79ThenEviction(t *testing.T) { headlineRE := regexp.MustCompile(`Xid 79.*gpu-node-0001.*training/job-rank-3`) require.Regexp(t, headlineRE, v.Headline, "headline must name the Xid code, node, and pod") - require.Regexp(t, regexp.MustCompile(`GPU|hardware|drain`), v.Remediation) + require.Regexp(t, `GPU|hardware|drain`, v.Remediation) require.Len(t, v.EvidenceTrail, 2, "two evidence layers: kernel_event + pod_event") require.Equal(t, patterns.EvidenceKindXid, v.EvidenceTrail[0].Kind, "Xid surfaces first (causal order)") diff --git a/module/pkg/replay/helpers_test.go b/module/pkg/replay/helpers_test.go index a5783c34..04e71517 100644 --- a/module/pkg/replay/helpers_test.go +++ b/module/pkg/replay/helpers_test.go @@ -4,6 +4,7 @@ package replay_test import ( "encoding/json" + "fmt" "os" "path/filepath" "testing" @@ -31,7 +32,7 @@ import ( func listFixtures(root string) ([]string, error) { entries, err := os.ReadDir(root) if err != nil { - return nil, err + return nil, fmt.Errorf("read %q: %w", root, err) } var out []string for _, e := range entries { @@ -45,7 +46,7 @@ func listFixtures(root string) ([]string, error) { } grouped, err := os.ReadDir(sub) if err != nil { - return nil, err + return nil, fmt.Errorf("read %q: %w", sub, err) } for _, g := range grouped { if !g.IsDir() { diff --git a/module/pkg/selftel/selftel_test.go b/module/pkg/selftel/selftel_test.go index 7a81164d..ccd131ce 100644 --- a/module/pkg/selftel/selftel_test.go +++ b/module/pkg/selftel/selftel_test.go @@ -126,14 +126,20 @@ func TestNewReceiver_EmitsAllFiveInstruments(t *testing.T) { // emissions_total: -1 must be discarded → total 5. em, _ := selftelutil.FindInstrument(rm, receiverEmissionsTotal) - sum := em.Data.(metricdata.Sum[int64]) + sum, ok := em.Data.(metricdata.Sum[int64]) + if !ok { + t.Fatalf("emissions: expected Sum[int64], got %T", em.Data) + } if len(sum.DataPoints) != 1 || sum.DataPoints[0].Value != 5 { t.Errorf("emissions: got %v, want 1 datapoint value=5", sum.DataPoints) } // errors_total: 2 enumerate + 1 parse. errs, _ := selftelutil.FindInstrument(rm, receiverErrorsTotal) - errSum := errs.Data.(metricdata.Sum[int64]) + errSum, ok := errs.Data.(metricdata.Sum[int64]) + if !ok { + t.Fatalf("errors_total: expected Sum[int64], got %T", errs.Data) + } byKind := map[string]int64{} for _, dp := range errSum.DataPoints { kind, _ := dp.Attributes.Value("kind") @@ -153,7 +159,10 @@ func TestRecordInitError_TicksCounter(t *testing.T) { if !ok { t.Fatalf("init_errors_total absent; have: %s", selftelutil.DumpNames(rm)) } - sum := m.Data.(metricdata.Sum[int64]) + sum, ok := m.Data.(metricdata.Sum[int64]) + if !ok { + t.Fatalf("init_errors_total: expected Sum[int64], got %T", m.Data) + } if len(sum.DataPoints) != 1 || sum.DataPoints[0].Value != 1 { t.Fatalf("init_errors: want 1 datapoint value=1, got %v", sum.DataPoints) } diff --git a/module/pkg/testutil/selftel/selftel.go b/module/pkg/testutil/selftel/selftel.go index f2571e57..5f95605a 100644 --- a/module/pkg/testutil/selftel/selftel.go +++ b/module/pkg/testutil/selftel/selftel.go @@ -123,8 +123,8 @@ type FailingMeterProvider struct { // Float64Histogram, Float64ObservableCounter, or Int64ObservableGauge // registration whose name starts with prefix returns a synthetic // error. -func NewFailingMeterProvider(real metric.MeterProvider, prefix string) *FailingMeterProvider { - return &FailingMeterProvider{real: real, prefix: prefix} +func NewFailingMeterProvider(realMP metric.MeterProvider, prefix string) *FailingMeterProvider { + return &FailingMeterProvider{real: realMP, prefix: prefix} } // Meter returns a wrapped meter whose instrument registrations honor diff --git a/module/processor/patterndetectorprocessor/checkpointer_hang_test.go b/module/processor/patterndetectorprocessor/checkpointer_hang_test.go index f70b4d82..4f46bc8e 100644 --- a/module/processor/patterndetectorprocessor/checkpointer_hang_test.go +++ b/module/processor/patterndetectorprocessor/checkpointer_hang_test.go @@ -286,6 +286,8 @@ func extractCheckpointerHangPromotedAttrs(t *testing.T, ld plog.Logs) map[string out[k] = v.Int() case pcommon.ValueTypeDouble: out[k] = v.Double() + default: + // Other pcommon.ValueType cases not exercised by this test fixture. } return true }) diff --git a/module/processor/patterndetectorprocessor/config.go b/module/processor/patterndetectorprocessor/config.go index 943fc989..b53df659 100644 --- a/module/processor/patterndetectorprocessor/config.go +++ b/module/processor/patterndetectorprocessor/config.go @@ -1,5 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 +// Package patterndetectorprocessor implements the in-collector pattern +// detector chain. Each detector consumes a per-pattern record stream +// (NCCL FR records, NCCL hang signals, XID-correlation events, +// thermal-throttle / HBM-ECC / PCIe-AER scrapes, IB link-flap events, +// CUDA-OOM crash markers) and emits a structured verdict on match. +// // Knob-naming convention (post-wave audit #379): // // New pattern knobs MUST use the `` Go identifier shape, @@ -359,6 +365,15 @@ type Config struct { } // Validate enforces operator-actionable rules. +// +// Complexity rationale: a linear per-knob +// `if c.X != 0 && c.X < bound { ... }` table; complexity (47) tracks +// the number of knobs declared above and has no nested branches. +// Splitting per-knob would scatter the validation contract across +// helper funcs and make adding a new knob a two-file edit +// (struct + helper) instead of a one-file edit. +// +//nolint:gocyclo // See "Complexity rationale" above. func (c *Config) Validate() error { if c.JoinWindow != 0 && c.JoinWindow < time.Second { return fmt.Errorf("join_window: must be >= 1s, got %s", c.JoinWindow) @@ -466,6 +481,15 @@ func defaultConfig() *Config { } } +// withDefaults returns a copy of c with zero-valued knobs filled in +// from their package-level Default* constants. +// +// Complexity rationale: a linear per-knob default-fill +// `if out.X == 0 { out.X = DefaultX }` table; complexity (26) tracks +// the number of knobs. Mirrors Validate above — same +// one-file-edit-to-add-a-knob property. +// +//nolint:gocyclo // See "Complexity rationale" above. func (c *Config) withDefaults() *Config { out := *c if out.JoinWindow == 0 { diff --git a/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go b/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go index c5b86a2a..551678f3 100644 --- a/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go +++ b/module/processor/patterndetectorprocessor/cuda_oom_recipe_test.go @@ -40,7 +40,7 @@ func TestRecipe_CUDAOOM_StanzaPinsWireContract(t *testing.T) { t.Parallel() recipePath := findRepoFile(t, "docs/integrations/examples/filelog-container.yaml") - raw, err := os.ReadFile(recipePath) + raw, err := os.ReadFile(recipePath) //nolint:gosec // G304: test-local repo path resolved via findRepoFile; not user-controlled. require.NoError(t, err, "reading recipe example yaml") recipe := string(raw) diff --git a/module/processor/patterndetectorprocessor/cuda_oom_test.go b/module/processor/patterndetectorprocessor/cuda_oom_test.go index fbfae92d..a1c6d316 100644 --- a/module/processor/patterndetectorprocessor/cuda_oom_test.go +++ b/module/processor/patterndetectorprocessor/cuda_oom_test.go @@ -330,6 +330,8 @@ func extractCUDAOOMPromotedAttrs(t *testing.T, ld plog.Logs) map[string]any { out[k] = v.Double() case pcommon.ValueTypeInt: out[k] = v.Int() + default: + // Other pcommon.ValueType cases not exercised by this test fixture. } return true }) diff --git a/module/processor/patterndetectorprocessor/dataloader_hang.go b/module/processor/patterndetectorprocessor/dataloader_hang.go index f47a5908..adf52c68 100644 --- a/module/processor/patterndetectorprocessor/dataloader_hang.go +++ b/module/processor/patterndetectorprocessor/dataloader_hang.go @@ -195,12 +195,13 @@ func appendDataLoaderHangVerdict(ld plog.Logs, v patterns.DataLoaderHangVerdict, putStrIfSet(attrs, verdictAttrK8sPodName, v.PodName) putStrIfSet(attrs, verdictAttrK8sPodNamespace, v.PodNamespace) putStrIfSet(attrs, verdictAttrK8sNodeName, v.NodeName) - if v.Discriminator == patterns.DataLoaderHangDiscriminatorWorkerKilled { + switch v.Discriminator { + case patterns.DataLoaderHangDiscriminatorWorkerKilled: if v.WorkerPID > 0 { attrs.PutInt(verdictAttrDataLoaderWorkerPID, v.WorkerPID) } putStrIfSet(attrs, verdictAttrDataLoaderErrorClass, v.ErrorClass) - } else if v.Discriminator == patterns.DataLoaderHangDiscriminatorStorageEvent { + case patterns.DataLoaderHangDiscriminatorStorageEvent: putStrIfSet(attrs, verdictAttrK8sEventReason, v.EventReason) } }) diff --git a/module/processor/patterndetectorprocessor/dataloader_hang_test.go b/module/processor/patterndetectorprocessor/dataloader_hang_test.go index 194e860e..5f6b53ee 100644 --- a/module/processor/patterndetectorprocessor/dataloader_hang_test.go +++ b/module/processor/patterndetectorprocessor/dataloader_hang_test.go @@ -557,6 +557,8 @@ func extractDataLoaderHangPromotedAttrs(t *testing.T, ld plog.Logs) map[string]a out[k] = v.Double() case pcommon.ValueTypeInt: out[k] = v.Int() + default: + // Other pcommon.ValueType cases not exercised by this test fixture. } return true }) diff --git a/module/processor/patterndetectorprocessor/ib_link_flap_recipe_test.go b/module/processor/patterndetectorprocessor/ib_link_flap_recipe_test.go index 733fa8ce..b790e9b8 100644 --- a/module/processor/patterndetectorprocessor/ib_link_flap_recipe_test.go +++ b/module/processor/patterndetectorprocessor/ib_link_flap_recipe_test.go @@ -38,7 +38,7 @@ func TestRecipe_IBLinkFlap_StanzaPinsWireContract(t *testing.T) { // Walk up to the repo root so the test is invariant under // `go test ./...` from anywhere. recipePath := findRepoFile(t, "docs/integrations/examples/prometheus-scrape.yaml") - raw, err := os.ReadFile(recipePath) + raw, err := os.ReadFile(recipePath) //nolint:gosec // G304: test-local repo path resolved via findRepoFile; not user-controlled. require.NoError(t, err, "reading recipe example yaml") recipe := string(raw) diff --git a/module/processor/patterndetectorprocessor/ib_link_flap_test.go b/module/processor/patterndetectorprocessor/ib_link_flap_test.go index 0486ccda..fb19936b 100644 --- a/module/processor/patterndetectorprocessor/ib_link_flap_test.go +++ b/module/processor/patterndetectorprocessor/ib_link_flap_test.go @@ -240,6 +240,8 @@ func extractIBLinkFlapPromotedAttrs(t *testing.T, ld plog.Logs) map[string]any { out[k] = v.Int() case pcommon.ValueTypeDouble: out[k] = v.Double() + default: + // Other pcommon.ValueType cases not exercised by this test fixture. } return true }) diff --git a/module/processor/patterndetectorprocessor/metrics.go b/module/processor/patterndetectorprocessor/metrics.go index 535b027e..cca928d6 100644 --- a/module/processor/patterndetectorprocessor/metrics.go +++ b/module/processor/patterndetectorprocessor/metrics.go @@ -80,12 +80,12 @@ type fbBuffer struct { } // newFBBuffer constructs an empty buffer with the given capacity. -// cap<=0 falls back to fbBufferDefaultMaxRecords. -func newFBBuffer(cap int) *fbBuffer { - if cap <= 0 { - cap = fbBufferDefaultMaxRecords +// capacity<=0 falls back to fbBufferDefaultMaxRecords. +func newFBBuffer(capacity int) *fbBuffer { + if capacity <= 0 { + capacity = fbBufferDefaultMaxRecords } - return &fbBuffer{cap: cap} + return &fbBuffer{cap: capacity} } // add appends a record to the buffer, evicting the oldest entry if at diff --git a/module/processor/patterndetectorprocessor/nccl_bootstrap_test.go b/module/processor/patterndetectorprocessor/nccl_bootstrap_test.go index 01fc0ee8..46110f65 100644 --- a/module/processor/patterndetectorprocessor/nccl_bootstrap_test.go +++ b/module/processor/patterndetectorprocessor/nccl_bootstrap_test.go @@ -287,6 +287,8 @@ func extractNCCLBootstrapPromotedAttrs(t *testing.T, ld plog.Logs) map[string]an out[k] = v.Int() case pcommon.ValueTypeDouble: out[k] = v.Double() + default: + // Other pcommon.ValueType cases not exercised by this test fixture. } return true }) diff --git a/module/processor/patterndetectorprocessor/noop_fallback_test.go b/module/processor/patterndetectorprocessor/noop_fallback_test.go index 57c7ae52..67ebb01a 100644 --- a/module/processor/patterndetectorprocessor/noop_fallback_test.go +++ b/module/processor/patterndetectorprocessor/noop_fallback_test.go @@ -4,6 +4,7 @@ package patterndetectorprocessor import ( "context" + "errors" "testing" "go.opentelemetry.io/otel/sdk/metric/metricdata" @@ -186,11 +187,8 @@ func TestNoopFallback_FactoryFailedMeter_AllHotPathsSafe(t *testing.T) { // FindInstrument MUST miss; the lookup-miss branch is the success // case. if m, ok := selftelutil.FindInstrument(rm, "otelcol.processor.patterndetector.verdicts_emitted_total"); ok { - switch d := m.Data.(type) { - case metricdata.Sum[int64]: - if len(d.DataPoints) > 0 { - t.Errorf("noop fallback leaked into verdicts_emitted_total: %d datapoints", len(d.DataPoints)) - } + if d, ok := m.Data.(metricdata.Sum[int64]); ok && len(d.DataPoints) > 0 { + t.Errorf("noop fallback leaked into verdicts_emitted_total: %d datapoints", len(d.DataPoints)) } } @@ -239,7 +237,7 @@ func TestNoopFallback_NewSelfTelemetry_NilProviderErrors(t *testing.T) { } // errors.Is-style check via direct sentinel compare since the // package returns the sentinel directly (no wrap). - if err != errNilMeterProvider { + if !errors.Is(err, errNilMeterProvider) { t.Errorf("newSelfTelemetry(nil) err: got %v, want %v", err, errNilMeterProvider) } } diff --git a/module/processor/patterndetectorprocessor/patterndetector_test.go b/module/processor/patterndetectorprocessor/patterndetector_test.go index 99065fe7..af63c7be 100644 --- a/module/processor/patterndetectorprocessor/patterndetector_test.go +++ b/module/processor/patterndetectorprocessor/patterndetector_test.go @@ -63,7 +63,7 @@ func TestPatternDetector_NegativeFixturesEmitNoVerdicts(t *testing.T) { fixtures, err := replay.LoadFixturesUnder(negativeRoot) require.NoError(t, err) - var negatives []replay.Fixture + negatives := make([]replay.Fixture, 0, len(fixtures)) for _, f := range fixtures { if f.Name == "canonical" { continue @@ -167,7 +167,8 @@ func TestConfig_Validate(t *testing.T) { func TestFactory_Surface(t *testing.T) { f := NewFactory() require.Equal(t, componentType(), f.Type()) - cfg := f.CreateDefaultConfig().(*Config) + cfg, ok := f.CreateDefaultConfig().(*Config) + require.True(t, ok, "CreateDefaultConfig must return *Config") require.Equal(t, DefaultJoinWindow, cfg.JoinWindow) require.Equal(t, DefaultNCCLHangThreshold, cfg.NCCLHangThreshold) require.Equal(t, DefaultXidCorrelationWindow, cfg.XidCorrelationWindow) diff --git a/module/processor/patterndetectorprocessor/pcie_aer_test.go b/module/processor/patterndetectorprocessor/pcie_aer_test.go index a6c6940b..cc9a8cbb 100644 --- a/module/processor/patterndetectorprocessor/pcie_aer_test.go +++ b/module/processor/patterndetectorprocessor/pcie_aer_test.go @@ -324,6 +324,8 @@ func extractPCIeAERPromotedAttrs(t *testing.T, ld plog.Logs) map[string]any { out[k] = v.Double() case pcommon.ValueTypeInt: out[k] = v.Int() + default: + // Other pcommon.ValueType cases not exercised by this test fixture. } return true }) diff --git a/module/processor/patterndetectorprocessor/silent_data_corruption_test.go b/module/processor/patterndetectorprocessor/silent_data_corruption_test.go index 1a9cd93f..866c6c54 100644 --- a/module/processor/patterndetectorprocessor/silent_data_corruption_test.go +++ b/module/processor/patterndetectorprocessor/silent_data_corruption_test.go @@ -347,6 +347,8 @@ func extractSDCPromotedAttrs(t *testing.T, ld plog.Logs) map[string]any { out[k] = v.Double() case pcommon.ValueTypeInt: out[k] = v.Int() + default: + // Other pcommon.ValueType cases not exercised by this test fixture. } return true }) diff --git a/module/processor/patterndetectorprocessor/xid_correlation_test.go b/module/processor/patterndetectorprocessor/xid_correlation_test.go index 4eb0e587..5c874b40 100644 --- a/module/processor/patterndetectorprocessor/xid_correlation_test.go +++ b/module/processor/patterndetectorprocessor/xid_correlation_test.go @@ -68,7 +68,12 @@ func TestPatternDetector_XidCorrelationWiringEmitsVerdict(t *testing.T) { require.Equal(t, patterns.PatternIDXidCorrelation, v.PatternID) require.Equal(t, 79, v.XidCode) require.Equal(t, "gpu-node-0001", v.Node) - require.Equal(t, "training/job-rank-3", v.EvictedPod) + // EvictedPod is intentionally asserted here: the field is deprecated + // (removed in v0.6 per docs/ATTRIBUTES.md) and this test pins the + // pre-deprecation shape end-to-end until the v0.5 warn-on-read window + // elapses. The split-out PodName / PodNamespace fields are asserted + // in sibling tests. + require.Equal(t, "training/job-rank-3", v.EvictedPod) //nolint:staticcheck // SA1019: explicit pre-deprecation parity assertion (issue #277). } // TestPatternDetector_XidCorrelationWiringHealthy pins the negative diff --git a/module/processor/rankjoinprocessor/rankjoin_test.go b/module/processor/rankjoinprocessor/rankjoin_test.go index 6468afa8..a0568cbf 100644 --- a/module/processor/rankjoinprocessor/rankjoin_test.go +++ b/module/processor/rankjoinprocessor/rankjoin_test.go @@ -207,7 +207,7 @@ func TestRankJoin_FixtureCorpusGoldenLoadsCleanly(t *testing.T) { // 2026-05-18T10:00:00Z. The fixture is the rubric L404 deliverable // for M19. fixtureDir := filepath.Join("..", "..", "pkg", "replay", "pod_evicted", "canonical") - manBytes, err := os.ReadFile(filepath.Join(fixtureDir, "manifest.json")) + manBytes, err := os.ReadFile(filepath.Join(fixtureDir, "manifest.json")) //nolint:gosec // G304: test-local fixture path. if err != nil { t.Fatalf("read manifest: %v", err) } @@ -221,7 +221,7 @@ func TestRankJoin_FixtureCorpusGoldenLoadsCleanly(t *testing.T) { t.Fatalf("manifest pattern_id: got %q, want %q (fixture drift)", manifest.PatternID, patterns.PatternIDPodEvicted) } - eventsBytes, err := os.ReadFile(filepath.Join(fixtureDir, "events.json")) + eventsBytes, err := os.ReadFile(filepath.Join(fixtureDir, "events.json")) //nolint:gosec // G304: test-local fixture path. if err != nil { t.Fatalf("read events: %v", err) } @@ -294,10 +294,14 @@ func TestFactory_Surface(t *testing.T) { if cfg == nil { t.Fatal("CreateDefaultConfig returned nil") } - if got := cfg.(*Config).EvictionMatchWindow; got != DefaultEvictionMatchWindow { + cfgT, ok := cfg.(*Config) + if !ok { + t.Fatalf("CreateDefaultConfig: expected *Config, got %T", cfg) + } + if got := cfgT.EvictionMatchWindow; got != DefaultEvictionMatchWindow { t.Errorf("default EvictionMatchWindow: got %v, want %v", got, DefaultEvictionMatchWindow) } - if got := cfg.(*Config).MaxBufferedRecords; got != DefaultMaxBufferedRecords { + if got := cfgT.MaxBufferedRecords; got != DefaultMaxBufferedRecords { t.Errorf("default MaxBufferedRecords: got %d, want %d", got, DefaultMaxBufferedRecords) } } diff --git a/module/receiver/ncclfrreceiver/nccl_fr_integration_test.go b/module/receiver/ncclfrreceiver/nccl_fr_integration_test.go index 3641e607..b5bb2471 100644 --- a/module/receiver/ncclfrreceiver/nccl_fr_integration_test.go +++ b/module/receiver/ncclfrreceiver/nccl_fr_integration_test.go @@ -87,7 +87,7 @@ func TestIntegration_E2E_FactoryToConsumer(t *testing.T) { }() // Drop the committed .pkl fixture into the watched directory. - pklBytes, err := os.ReadFile(filepath.Join(integrationFixtureDir, slug+".pkl")) + pklBytes, err := os.ReadFile(filepath.Join(integrationFixtureDir, slug+".pkl")) //nolint:gosec // G304: test-local fixture path. if err != nil { t.Fatalf("read committed fixture: %v (run `make generate-fixtures`)", err) } @@ -238,7 +238,7 @@ func runFixtureRoundtrip(t *testing.T, slug string) { _ = r.Shutdown(stopCtx) }() - pklBytes, err := os.ReadFile(filepath.Join(integrationFixtureDir, slug+".pkl")) + pklBytes, err := os.ReadFile(filepath.Join(integrationFixtureDir, slug+".pkl")) //nolint:gosec // G304: test-local fixture path. if err != nil { t.Fatalf("read committed fixture: %v", err) } @@ -443,7 +443,7 @@ func assertGolden(t *testing.T, path string, got []normalizedLog) { want = append(want, '\n') if os.Getenv("UPDATE_GOLDEN") == "1" { - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil { t.Fatalf("mkdir golden dir: %v", err) } if err := os.WriteFile(path, want, 0o600); err != nil { diff --git a/scripts/makefile-hotfile-check.sh b/scripts/makefile-hotfile-check.sh new file mode 100755 index 00000000..0e57c034 --- /dev/null +++ b/scripts/makefile-hotfile-check.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# scripts/makefile-hotfile-check.sh +# +# Enforce the make/ shard convention (see make/README.md). +# +# The root `Makefile` carries four monolithic prerequisite lists that +# were historically the dominant source of cascade-rebases on this repo: +# `.PHONY: $(PHONY_TARGETS)` plus the `check:`, `verify:`, `ci-fast:`, +# `ci-full:` aggregate targets. After the sharding pass in +# #infra/makefile-shard-and-kind-bootstrap-and-module-lint these lists +# now expand from `make/*.mk` shards via `+=` — each shard line stands +# on its own, eliminating the textual coupling. +# +# This gate ensures future PRs do not re-inline prerequisites into the +# root Makefile (which would silently reintroduce the hot-file). It is +# advisory (warning-only) by default; pass --strict to fail. +# +# Tripwires: +# 1. Direct `.PHONY:` lines in `Makefile` (other than the canonical +# `.PHONY: $(PHONY_TARGETS)` collector). +# 2. Hand-rolled prereq tokens on the `check:`/`verify:`/`ci-fast:`/ +# `ci-full:` lines instead of the `$(*_DEPS)` variable expansion. +# +# Exit: +# 0 — clean +# 1 — violation (only when --strict) + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +MAKEFILE="$ROOT/Makefile" + +strict=0 +[ "${1:-}" = "--strict" ] && strict=1 + +fail=0 +note() { echo "makefile-hotfile-check: $*"; } + +# 1. Only the canonical .PHONY collector is allowed in Makefile. +extra_phony=$(grep -nE '^\.PHONY:' "$MAKEFILE" | grep -v '\$(PHONY_TARGETS)' || true) +if [ -n "$extra_phony" ]; then + note "found inline .PHONY: line(s) in Makefile (move target names to make/phony.mk):" + printf ' %s\n' "$extra_phony" + fail=1 +fi + +# 2. The aggregate target lines must consume the *_DEPS variable, not +# inline tokens. Match `^TARGET:` followed by prereqs. +for target in check verify ci-fast ci-full; do + var="$(echo "$target" | tr 'a-z-' 'A-Z_')_DEPS" + line=$(grep -nE "^${target}:" "$MAKEFILE" | head -1 || true) + [ -z "$line" ] && continue + # Strip "lineno:" prefix, then "target:" prefix, then optional `## doc`. + prereqs=$(echo "$line" \ + | sed -E "s|^[0-9]+:${target}:[[:space:]]*||" \ + | sed -E 's|##.*$||' \ + | xargs) + case "$prereqs" in + "\$($var)") ;; # canonical + *) + note "Makefile target '$target' has hand-rolled prereqs; should be '\$($var)' (declare members in make/$target.mk):" + printf ' %s\n' "$line" + fail=1 + ;; + esac +done + +if [ "$fail" -eq 1 ]; then + if [ "$strict" -eq 1 ]; then + note "FAIL (strict)" + exit 1 + else + note "warnings only (pass --strict to fail)" + exit 0 + fi +fi + +note "ok — make/ shard convention preserved"