diff --git a/.githooks/pre-push b/.githooks/pre-push index 14c9bfee..5f4ae0db 100755 --- a/.githooks/pre-push +++ b/.githooks/pre-push @@ -124,13 +124,6 @@ if changed 'docs/schemas/fixtures/**' '*verdict*' '*shipped-pattern*'; then run_gate verdict-fixtures-check fi -# build-tags runs `go vet` against every build-tag variant. Skip when -# no Go (or Makefile, which owns the build-tag list) changed. -if changed '*.go' 'Makefile'; then - echo "[pre-push] Go (or Makefile) changed → make build-tags" - run_gate build-tags -fi - # nccl-fr-rce-gate: stdlib-only dependency gate for the safe-pickle parser. if changed 'module/pkg/nccl/fr_parser/**'; then echo "[pre-push] nccl-fr parser changed → make nccl-fr-rce-gate" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd820404..486d08ce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -129,8 +129,6 @@ jobs: - uses: ./.github/actions/go-setup - name: license-check run: make license-check - - name: build-tags - run: make build-tags - name: tidy-check run: make tidy-check - name: nccl_fr RCE gate diff --git a/.golangci.yml b/.golangci.yml index f7671d8c..153d1d90 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -174,9 +174,12 @@ linters: # Integration tests legitimately invoke subprocesses (G204) and # hit localhost endpoints (G107) — they orchestrate built # binaries against ephemeral servers under t.TempDir control. + # bench/e2e/ocb_scrape_test.go spawns ./_build/tracecore and + # scrapes its self-telemetry endpoint; rest of bench/e2e/ is + # in-process and unaffected. - linters: - gosec - path: internal/integration/.*_test\.go + path: bench/e2e/ocb_scrape_test\.go text: "G204|G107" # Component integration helpers spawn child processes # (e.g. py-spy) under test-controlled args. diff --git a/Makefile b/Makefile index d3359a8c..9d4d9b00 100644 --- a/Makefile +++ b/Makefile @@ -454,11 +454,6 @@ validator-recipe: build ## M6: validate each docs/integrations/examples/*.yaml @scripts/validator-recipe.sh @bash scripts/validator-recipe_test.sh -build-tags: ## go vet against each build-tag variant we ship. Cheaper than CI matrix; kept as a hook for future build-tag-gated paths (the `dcgm` tag was retired in PR-F.1). - @echo "vet (default)..." - @go vet ./... - @echo "All build-tag variants vet-clean." - dco-check: ## Verify DCO sign-off on every commit since origin/main. @if ! git rev-parse --verify --quiet origin/main >/dev/null; then \ echo "origin/main is not available; fetch it first."; exit 1; \ diff --git a/bench/canary/kueue-metric-surface.sh b/bench/canary/kueue-metric-surface.sh deleted file mode 100755 index 3b382871..00000000 --- a/bench/canary/kueue-metric-surface.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env bash -# DRAFT: Nightly CI canary detecting Kueue metric-surface drift. -# Lands at bench/canary/kueue-metric-surface.sh with M16 alpha (P8). -# -# The canary spins kind + the pinned Kueue version, scrapes the -# /metrics endpoint, and compares the kueue-prefixed metric-family -# set against a committed baseline. Fails on drift (added or removed -# families) so we catch upstream renames before they silently -# disappear from operator dashboards. -# -# Run nightly via .github/workflows/canary.yml (also to-be-drafted). -# Failures page tracecore-on-call; the response is to update the -# baseline + RUNBOOK + receiver test assertions, then re-merge. - -set -euo pipefail - -KUEUE_VERSION="${KUEUE_VERSION:-v0.17.3}" -CLUSTER_NAME="${CLUSTER_NAME:-kueue-canary}" -BASELINE_PATH="${BASELINE_PATH:-bench/canary/kueue-metric-surface.${KUEUE_VERSION}.baseline}" -LOG_DIR="${LOG_DIR:-$(mktemp -d)}" - -log() { - printf '[canary] %s\n' "$*" >&2 -} - -cleanup() { - log "tearing down kind cluster ${CLUSTER_NAME}" - kind delete cluster --name "${CLUSTER_NAME}" >/dev/null 2>&1 || true -} -trap cleanup EXIT - -log "creating kind cluster ${CLUSTER_NAME}" -kind create cluster --name "${CLUSTER_NAME}" --wait 90s >"${LOG_DIR}/kind.log" 2>&1 - -log "installing Kueue ${KUEUE_VERSION}" -kubectl apply --server-side \ - -f "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml" \ - >"${LOG_DIR}/kueue-install.log" 2>&1 - -kubectl -n kueue-system wait --for=condition=available --timeout=120s \ - deployment/kueue-controller-manager >>"${LOG_DIR}/kueue-install.log" 2>&1 - -log "creating minimal Kueue resources to exercise metric emission" -cat <<'EOF' | kubectl apply -f - >>"${LOG_DIR}/kueue-install.log" 2>&1 -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: default-flavor ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: cpu-batch -spec: - namespaceSelector: {} - resourceGroups: - - coveredResources: ["cpu", "memory"] - flavors: - - name: default-flavor - resources: - - name: cpu - nominalQuota: 8 - - name: memory - nominalQuota: 16Gi ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: lq-cpu - namespace: default -spec: - clusterQueue: cpu-batch -EOF - -log "submitting a probe job" -cat <<'EOF' | kubectl create -f - >>"${LOG_DIR}/kueue-install.log" 2>&1 -apiVersion: batch/v1 -kind: Job -metadata: - generateName: canary-probe- - namespace: default - labels: - kueue.x-k8s.io/queue-name: lq-cpu -spec: - suspend: true - parallelism: 1 - completions: 1 - template: - spec: - restartPolicy: Never - containers: - - name: probe - image: busybox:1.36 - command: ["sh", "-c", "sleep 30"] - resources: - requests: - cpu: "100m" - memory: "32Mi" -EOF - -sleep 10 - -log "provisioning metrics-scraper SA + bindings" -kubectl create sa metrics-scraper -n default >/dev/null 2>&1 || true -kubectl create clusterrolebinding canary-auth-delegator \ - --clusterrole=system:auth-delegator \ - --serviceaccount=default:metrics-scraper >/dev/null 2>&1 || true -kubectl create clusterrolebinding canary-kueue-reader \ - --clusterrole=kueue-metrics-reader \ - --serviceaccount=default:metrics-scraper >/dev/null 2>&1 || true - -TOKEN=$(kubectl create token metrics-scraper -n default --duration=1h) - -log "port-forwarding metrics service" -kubectl -n kueue-system port-forward \ - service/kueue-controller-manager-metrics-service 8443:8443 \ - >"${LOG_DIR}/pf.log" 2>&1 & -PF_PID=$! -sleep 3 - -log "scraping /metrics" -curl -sk -H "Authorization: Bearer ${TOKEN}" \ - -H "Accept: text/plain; version=0.0.4" \ - https://127.0.0.1:8443/metrics > "${LOG_DIR}/live-metrics.prom" - -kill "${PF_PID}" 2>/dev/null || true -wait "${PF_PID}" 2>/dev/null || true - -log "extracting kueue-prefixed metric family names" -CURRENT_SURFACE=$(grep -E '^# HELP kueue_' "${LOG_DIR}/live-metrics.prom" | - awk '{print $3}' | sort -u) - -if [[ ! -f "${BASELINE_PATH}" ]]; then - log "ERROR: baseline file ${BASELINE_PATH} does not exist." - log " Either run with BASELINE_PATH pointing at a valid baseline," - log " or seed an initial baseline by copying current surface to" - log " the expected path and reviewing it." - log "" - log " Current surface:" - printf '%s\n' "${CURRENT_SURFACE}" - exit 2 -fi - -BASELINE_SURFACE=$(sort -u < "${BASELINE_PATH}") - -if [[ "${CURRENT_SURFACE}" == "${BASELINE_SURFACE}" ]]; then - log "PASS: Kueue ${KUEUE_VERSION} metric surface matches baseline" - printf '%s\n' "${CURRENT_SURFACE}" | wc -l | xargs -I{} log " {} kueue-prefixed families observed" - exit 0 -fi - -log "FAIL: metric-surface drift detected" -log "" -log "Added (in current, not in baseline):" -comm -23 <(printf '%s\n' "${CURRENT_SURFACE}") <(printf '%s\n' "${BASELINE_SURFACE}") | - sed 's/^/ + /' >&2 -log "" -log "Removed (in baseline, not in current):" -comm -13 <(printf '%s\n' "${CURRENT_SURFACE}") <(printf '%s\n' "${BASELINE_SURFACE}") | - sed 's/^/ - /' >&2 -log "" -log "Remediation:" -log " 1. Verify upstream Kueue release notes for rename/add/remove." -log " 2. Update receiver tests + RUNBOOK with the new surface." -log " 3. Update baseline file ${BASELINE_PATH}." -log " 4. Re-run this canary; it should pass." -exit 1 diff --git a/bench/canary/kueue-metric-surface.v0.17.3.baseline b/bench/canary/kueue-metric-surface.v0.17.3.baseline deleted file mode 100644 index 366c579f..00000000 --- a/bench/canary/kueue-metric-surface.v0.17.3.baseline +++ /dev/null @@ -1,22 +0,0 @@ -kueue_admission_attempt_duration_seconds -kueue_admission_attempts_total -kueue_admission_wait_time_seconds -kueue_admitted_active_workloads -kueue_admitted_workloads_total -kueue_build_info -kueue_cluster_queue_status -kueue_finished_workloads -kueue_local_queue_admission_wait_time_seconds -kueue_local_queue_admitted_active_workloads -kueue_local_queue_admitted_workloads_total -kueue_local_queue_pending_workloads -kueue_local_queue_quota_reserved_wait_time_seconds -kueue_local_queue_quota_reserved_workloads_total -kueue_local_queue_reserving_active_workloads -kueue_local_queue_resource_reservation -kueue_local_queue_resource_usage -kueue_local_queue_status -kueue_pending_workloads -kueue_quota_reserved_wait_time_seconds -kueue_quota_reserved_workloads_total -kueue_reserving_active_workloads diff --git a/internal/integration/ocb_scrape_test.go b/bench/e2e/ocb_scrape_test.go similarity index 88% rename from internal/integration/ocb_scrape_test.go rename to bench/e2e/ocb_scrape_test.go index 7f315373..183ed41f 100644 --- a/internal/integration/ocb_scrape_test.go +++ b/bench/e2e/ocb_scrape_test.go @@ -1,20 +1,21 @@ // SPDX-License-Identifier: Apache-2.0 -// Package integration holds end-to-end checks that span the OCB-built -// tracecore binary + a live process. These tests are skipped when -// ./_build/tracecore is missing so `go test ./...` on a fresh checkout -// stays green; CI runs `make build` before invoking the integration -// package so the binary is always present in pipeline runs. +// OCB-scrape end-to-end check: spins up the OCB-built tracecore +// binary and asserts the chart's *operator-facing self-telemetry +// contract* — namely that the binary keeps serving the upstream +// `otelcol_*` metric vocabulary on the same Prometheus endpoint the +// chart's service.telemetry.metrics.address points at. Dashboards +// on those names must keep working across upstream OCB version +// bumps; this test catches a silent metric-name rename before it +// ships. // -// RFC-0013 PR-A2: this package was created when the legacy -// cmd/tracecore tree was deleted. Its single purpose is to falsify -// regressions in the chart's *operator-facing self-telemetry contract* -// — namely that the OCB binary keeps serving the upstream `otelcol_*` -// metric vocabulary on the same Prometheus endpoint the chart's -// service.telemetry.metrics.address points at. Dashboards on those -// names must keep working across upstream OCB version bumps; this -// test catches a silent metric-name rename before it ships. -package integration +// Skipped when ./_build/tracecore is missing so `go test ./...` on +// a fresh checkout stays green; CI runs `make build` before invoking +// this package so the binary is always present in pipeline runs. +// Co-located with the rest of the bench/e2e suite (steady-state + +// budget assertions) since all three exercise the same OCB-built +// binary; previously lived at internal/integration/ (RFC-0013 PR-A2). +package e2e import ( "errors" @@ -37,8 +38,8 @@ import ( // `make build` does not fail the suite. func findBuiltBinary(t *testing.T) string { t.Helper() - // Test file lives at internal/integration/ocb_scrape_test.go. - // Walk up two levels to land at the repo root. + // Test file lives at bench/e2e/ocb_scrape_test.go. + // Walk up three levels (file -> e2e -> bench -> repo root). _, thisFile, _, ok := runtime.Caller(0) if !ok { t.Fatalf("runtime.Caller(0) failed") diff --git a/docs/MILESTONES.md b/docs/MILESTONES.md index 680356f7..79a163ed 100644 --- a/docs/MILESTONES.md +++ b/docs/MILESTONES.md @@ -371,5 +371,5 @@ Receiver alpha ──▶ Lane 1 **Serialization points** (where lanes coordinate at merge): - `cmd/tracecore/components.go` - one-line registration edits per receiver -- `Makefile` - section comments per target (`make ci`, `make doc-check`, `make alert-check`, `make build-tags`, `make bench`, `make generate-fixtures`, `make release`) +- `Makefile` - section comments per target (`make ci`, `make doc-check`, `make alert-check`, `make bench`, `make generate-fixtures`, `make release`) - `go.mod` - dependency additions reviewed at merge diff --git a/docs/migration/v0.1-to-v0.2.md b/docs/migration/v0.1-to-v0.2.md index 451d7bff..d41f758b 100644 --- a/docs/migration/v0.1-to-v0.2.md +++ b/docs/migration/v0.1-to-v0.2.md @@ -102,7 +102,7 @@ The two metrics the chart commits to keeping available across OCB version bumps - `otelcol_process_uptime` — emitted by `service.telemetry` at startup; presence proves the binary is OCB-assembled and the self-tel server is wired through `service.telemetry.metrics.address`. - `otelcol_receiver_accepted_metric_points` — emitted by the receiver helper once the first scrape lands; presence proves end-to-end pipeline liveness. -[`internal/integration/ocb_scrape_test.go`](../../internal/integration/ocb_scrape_test.go) (`TestOCBScrape_UpstreamMetricVocabulary`) is the regression gate: an upstream rename of either metric fails this test before it can ship. +[`bench/e2e/ocb_scrape_test.go`](../../bench/e2e/ocb_scrape_test.go) (`TestOCBScrape_UpstreamMetricVocabulary`) is the regression gate: an upstream rename of either metric fails this test before it can ship. ### In-tree receiver / exporter namespace alignment (RFC-0013 v0.1.0) diff --git a/docs/migration/v0.x-to-v1.0.md b/docs/migration/v0.x-to-v1.0.md index a8a49d08..904c3901 100644 --- a/docs/migration/v0.x-to-v1.0.md +++ b/docs/migration/v0.x-to-v1.0.md @@ -201,7 +201,7 @@ The metric series `otelcol_process_uptime` and `otelcol_receiver_accepted_metric_points` continue to be the two liveness signals the chart commits to keeping available across OCB version bumps; the regression gate is -[`internal/integration/ocb_scrape_test.go`](../../internal/integration/ocb_scrape_test.go) +[`bench/e2e/ocb_scrape_test.go`](../../bench/e2e/ocb_scrape_test.go) (`TestOCBScrape_UpstreamMetricVocabulary`). ## 3. Verdict schema v1.0-rc1 published diff --git a/docs/notes/ci.md b/docs/notes/ci.md index 2115309e..14b56ea2 100644 --- a/docs/notes/ci.md +++ b/docs/notes/ci.md @@ -13,7 +13,7 @@ PRINCIPLES §10 splits the inner loop into two budgets: The "gates that catch the most defects per second" subset; runnable on every save. - `make ci-full` — strict superset. Adds test+coverage under -race, - generate-fixtures-check, build-tags, tidy-check, license-check, + generate-fixtures-check, tidy-check, license-check, nccl-fr-rce-gate, register-lint, actionlint, zizmor, ci-fuzz-nccl-fr, govulncheck, deprecation-check, no-autoupdate-check, build, smoke-quickstart. ~2.5m on a dev laptop. diff --git a/docs/research/m16-kueue-production-followups.md b/docs/research/m16-kueue-production-followups.md index d62e96ee..85b2b574 100644 --- a/docs/research/m16-kueue-production-followups.md +++ b/docs/research/m16-kueue-production-followups.md @@ -177,15 +177,16 @@ disappear from operator dashboards. **Blocker class:** doable now (post-alpha) — CI canary against a `kind` cluster; no production data required. -**Status: DRAFT READY.** Canary script at -[`m16-kueue-spike/draft-receiver/canary/kueue-metric-surface.sh`](../../bench/canary/kueue-metric-surface.sh) -+ baseline at -[`kueue-metric-surface.v0.17.3.baseline`](../../bench/canary/kueue-metric-surface.v0.17.3.baseline) -(22 families pinned). Lands at `bench/canary/` with the M16 -implementation PR; nightly workflow follows. - -**Trigger:** post-alpha. **Artifact:** -`bench/canary/kueue-metric-surface.sh` running against a daily +**Status: WITHDRAWN.** Draft canary script + 22-family baseline +(previously at `bench/canary/kueue-metric-surface.sh` + +`kueue-metric-surface.v0.17.3.baseline`) deleted alongside the +in-tree Kueue receiver per RFC-0013 §7. If the metric-surface +canary becomes useful again for the `prometheusreceiver` recipe, +re-introduce as a recipe-side smoke (not a receiver gate) and +restore from git history. + +**Trigger:** Kueue recipe adoption surfaces an operator-dashboard +drift incident. **Artifact:** recipe-side nightly smoke against a `kind`-pinned Kueue install. ### P9. Failure-mode taxonomy completeness @@ -473,5 +474,5 @@ M17/M18/M20 or AMD hardware). - C8 — R1+R4 tests — **DONE in spike**; R2+R3 specified in RFC for the implementation PR. - C9 — cap-semantics doc — **DONE** (RFC-0011 § "Cardinality cap semantics"). - C10 — values.yaml expansion — **DRAFT READY** (same artifact as C4). -- P8 — CI surface-drift canary — **DRAFT READY** (`bench/canary/`). +- P8 — CI surface-drift canary — **WITHDRAWN** (Kueue receiver deleted at v0.1.0 per RFC-0013 §7; restore from git history if revived as a recipe-side smoke). - C2 — adversarial review — **PENDING** (triggered at implementation PR draft). diff --git a/make/ci-full.mk b/make/ci-full.mk index ec97c8dd..487c7e7f 100644 --- a/make/ci-full.mk +++ b/make/ci-full.mk @@ -11,7 +11,6 @@ CI_FULL_DEPS += verdict-fixtures-check # Static analysis CI_FULL_DEPS += vet -CI_FULL_DEPS += build-tags CI_FULL_DEPS += tidy-check CI_FULL_DEPS += mod-verify CI_FULL_DEPS += lint diff --git a/make/phony.mk b/make/phony.mk index add4206a..46043043 100644 --- a/make/phony.mk +++ b/make/phony.mk @@ -34,7 +34,7 @@ PHONY_TARGETS += register-lint actionlint zizmor PHONY_TARGETS += doc-check doc-check-release no-autoupdate-check base-digest-check PHONY_TARGETS += recipes-path-check PHONY_TARGETS += anonymize-pod-evicted-fixture-check -PHONY_TARGETS += build-tags attribute-namespace-check deprecation-check +PHONY_TARGETS += attribute-namespace-check deprecation-check PHONY_TARGETS += rfc-status-check PHONY_TARGETS += cut-criteria-status cut-criteria-status-all cut-criteria-render cut-criteria-check PHONY_TARGETS += slo-rules-check test-flake-audit makefile-hotfile-check diff --git a/make/verify.mk b/make/verify.mk index e21ae375..83cdf2eb 100644 --- a/make/verify.mk +++ b/make/verify.mk @@ -15,9 +15,6 @@ VERIFY_DEPS += license-check VERIFY_DEPS += generate-fixtures-check VERIFY_DEPS += verdict-fixtures-check -# Build-tag variants -VERIFY_DEPS += build-tags - # Security / RFC-bound gates VERIFY_DEPS += nccl-fr-rce-gate VERIFY_DEPS += register-lint diff --git a/scripts/pre-push-test.sh b/scripts/pre-push-test.sh index 70ed84bd..7c435724 100755 --- a/scripts/pre-push-test.sh +++ b/scripts/pre-push-test.sh @@ -104,7 +104,6 @@ assert_ran deprecation-check "$out" "docs-only (*.md)" assert_skipped license-check "$out" "docs-only" assert_skipped generate-fixtures-check "$out" "docs-only" assert_skipped verdict-fixtures-check "$out" "docs-only" -assert_skipped build-tags "$out" "docs-only" assert_skipped nccl-fr-rce-gate "$out" "docs-only" assert_skipped register-lint "$out" "docs-only" assert_skipped actionlint "$out" "docs-only" @@ -116,7 +115,6 @@ set_changed_file "module/internal/example.go" out="$(run_hook)" assert_ran check "$out" "go-only" assert_ran license-check "$out" "go-only" -assert_ran build-tags "$out" "go-only" assert_ran register-lint "$out" "go-only" assert_ran deprecation-check "$out" "go-only (*.go)" assert_ran no-autoupdate-check "$out" "go-only (module/**)" @@ -136,7 +134,6 @@ assert_ran zizmor "$out" "workflow" assert_ran deprecation-check "$out" "workflow (*.yml)" assert_skipped license-check "$out" "workflow (no go)" assert_skipped doc-check "$out" "workflow" -assert_skipped build-tags "$out" "workflow (no go)" assert_skipped no-autoupdate-check "$out" "workflow" echo "==> case: nccl-fr parser change" @@ -146,7 +143,6 @@ assert_ran check "$out" "nccl-fr" assert_ran generate-fixtures-check "$out" "nccl-fr" assert_ran nccl-fr-rce-gate "$out" "nccl-fr" assert_ran license-check "$out" "nccl-fr (*.go)" -assert_ran build-tags "$out" "nccl-fr (*.go)" echo "==> case: chart-appVersion source (install/kubernetes/Chart.yaml)" set_changed_file "install/kubernetes/tracecore/Chart.yaml" @@ -197,7 +193,6 @@ assert_ran actionlint "$out" "force-all" assert_ran zizmor "$out" "force-all" assert_ran deprecation-check "$out" "force-all" assert_ran no-autoupdate-check "$out" "force-all" -assert_ran build-tags "$out" "force-all" assert_ran nccl-fr-rce-gate "$out" "force-all" assert_ran register-lint "$out" "force-all" assert_ran verdict-fixtures-check "$out" "force-all"