diff --git a/.github/workflows/chart.yml b/.github/workflows/chart.yml
new file mode 100644
index 00000000..733af194
--- /dev/null
+++ b/.github/workflows/chart.yml
@@ -0,0 +1,285 @@
+name: chart
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'install/kubernetes/tracecore/**'
+      - '.github/workflows/chart.yml'
+      - 'cmd/tracecore/**'
+      - 'internal/**'
+      - 'components/**'
+      - 'go.mod'
+      - 'go.sum'
+  pull_request:
+    paths:
+      - 'install/kubernetes/tracecore/**'
+      - '.github/workflows/chart.yml'
+      - 'cmd/tracecore/**'
+      - 'internal/**'
+      - 'components/**'
+      - 'go.mod'
+      - 'go.sum'
+
+permissions:
+  contents: read
+
+jobs:
+  render:
+    # helm lint + helm template + tracecore validate + conftest +
+    # yq field assertions. No cluster required; ~30s wall-clock.
+    # Sits OUTSIDE the `make ci` 60-second budget by living in this
+    # workflow rather than under `make ci`.
+    name: render
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-go@v6
+        with:
+          go-version-file: go.mod
+          cache: true
+      - name: Install helm
+        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
+        with:
+          version: v3.16.4
+      - name: Install yq + conftest
+        # `go install` resolves both tools through the Go module proxy +
+        # checksum database; that's an integrity-checked path (versus a
+        # bare curl of the GitHub release binary, which has no signature
+        # verification today).
+        run: |
+          go install github.com/mikefarah/yq/v4@v4.44.5
+          go install github.com/open-policy-agent/conftest@v0.62.0
+          yq --version
+          conftest --version
+      - name: Build tracecore (for validate gate)
+        run: go build -o tracecore ./cmd/tracecore
+      - name: helm lint (must exit 0 with zero WARNINGs)
+        # `|| rc=$?` stops `set -e` from aborting on a failed lint before
+        # its output is echoed, so the diagnostics always reach the log.
+        # The here-string replaces `echo | grep -q`, which can trip
+        # pipefail via SIGPIPE when grep exits on the first match.
+        run: |
+          set -eo pipefail
+          rc=0
+          out=$(helm lint install/kubernetes/tracecore 2>&1) || rc=$?
+          echo "$out"
+          if [ "$rc" -ne 0 ]; then
+            echo "::error::helm lint exited with status $rc"
+            exit "$rc"
+          fi
+          if grep -q '^\[WARNING\]' <<<"$out"; then
+            echo "::error::helm lint emitted WARNING; failing per M5b rubric"
+            exit 1
+          fi
+      - name: Render config — all-receivers-off
+        run: |
+          helm template demo install/kubernetes/tracecore \
+            -f install/kubernetes/tracecore/ci/all-receivers-off-values.yaml \
+            --show-only templates/configmap.yaml \
+            | yq '.data["config.yaml"]' > rendered-all-off.yaml
+          cat rendered-all-off.yaml
+      - name: tracecore validate — all-off
+        run: ./tracecore validate --config=rendered-all-off.yaml
+      - name: Render config — one-receiver-on
+        run: |
+          helm template demo install/kubernetes/tracecore \
+            -f install/kubernetes/tracecore/ci/one-receiver-on-values.yaml \
+            --show-only templates/configmap.yaml \
+            | yq '.data["config.yaml"]' > rendered-one-on.yaml
+          cat rendered-one-on.yaml
+      - name: tracecore validate — one-on
+        run: ./tracecore validate --config=rendered-one-on.yaml
+      - name: yq — DaemonSet kind assertion
+        run: |
+          kind=$(helm template demo install/kubernetes/tracecore \
+            --show-only templates/daemonset.yaml | yq '.kind')
+          test "$kind" = "DaemonSet" || { echo "expected DaemonSet, got $kind"; exit 1; }
+      - name: yq — securityContext field assertions
+        run: |
+          render=$(helm template demo install/kubernetes/tracecore --show-only templates/daemonset.yaml)
+          assert() {
+            local path="$1" expected="$2"
+            local got
+            got=$(echo "$render" | yq "$path")
+            if [ "$got" != "$expected" ]; then
+              echo "::error::$path = $got (expected $expected)"
+              exit 1
+            fi
+            echo "ok: $path = $got"
+          }
+          assert '.spec.template.spec.securityContext.runAsNonRoot' 'true'
+          assert '.spec.template.spec.securityContext.runAsUser' '65532'
+          assert '.spec.template.spec.securityContext.seccompProfile.type' 'RuntimeDefault'
+          assert '.spec.template.spec.hostPID' 'false'
+          assert '.spec.template.spec.hostIPC' 'false'
+          assert '.spec.template.spec.hostNetwork' 'false'
+          assert '.spec.template.spec.containers[0].securityContext.allowPrivilegeEscalation' 'false'
+          assert '.spec.template.spec.containers[0].securityContext.readOnlyRootFilesystem' 'true'
+          assert '.spec.template.spec.containers[0].securityContext.capabilities.drop[0]' 'ALL'
+      - name: priorityClassName + telemetry-off render correctness
+        # Two value-conditional template paths that yq cannot infer from
+        # default-render output alone: priorityClassName must appear when
+        # set, and probes must disappear when telemetry is off.
+        run: |
+          pc=$(helm template demo install/kubernetes/tracecore \
+            --set priorityClassName=tracecore-high \
+            --show-only templates/daemonset.yaml \
+            | yq '.spec.template.spec.priorityClassName')
+          test "$pc" = "tracecore-high" \
+            || { echo "::error::priorityClassName rendered '$pc' (expected tracecore-high)"; exit 1; }
+          probes=$(helm template demo install/kubernetes/tracecore \
+            --set telemetry.enabled=false \
+            --show-only templates/daemonset.yaml \
+            | yq '.spec.template.spec.containers[0] | has("livenessProbe")')
+          test "$probes" = "false" \
+            || { echo "::error::livenessProbe rendered when telemetry.enabled=false"; exit 1; }
+          echo "ok: priorityClassName injection + telemetry-off probe omission both verified"
+      - name: Probe paths match rendered telemetry.paths
+        # Without this gate, a future template edit could wire a probe
+        # to /healthz while the config exposes /health — the pod would
+        # CrashLoopBackOff for a kubelet 404 the chart authors couldn't
+        # see in `helm template` alone.
+        run: |
+          ds_render=$(helm template demo install/kubernetes/tracecore --show-only templates/daemonset.yaml)
+          cm_render=$(helm template demo install/kubernetes/tracecore --show-only templates/configmap.yaml)
+          probe_healthz=$(echo "$ds_render" | yq '.spec.template.spec.containers[0].livenessProbe.httpGet.path')
+          probe_readyz=$(echo "$ds_render" | yq '.spec.template.spec.containers[0].readinessProbe.httpGet.path')
+          cfg_healthz=$(echo "$cm_render" | yq '.data["config.yaml"]' | yq '.telemetry.paths.healthz')
+          cfg_readyz=$(echo "$cm_render" | yq '.data["config.yaml"]' | yq '.telemetry.paths.readyz')
+          test "$probe_healthz" = "$cfg_healthz" \
+            || { echo "::error::probe healthz=$probe_healthz != config healthz=$cfg_healthz"; exit 1; }
+          test "$probe_readyz" = "$cfg_readyz" \
+            || { echo "::error::probe readyz=$probe_readyz != config readyz=$cfg_readyz"; exit 1; }
+          echo "ok: probe paths match config — healthz=$probe_healthz readyz=$probe_readyz"
+      - name: Chart.yaml — apiVersion v2 + SemVer + non-empty appVersion
+        run: |
+          api=$(yq '.apiVersion' install/kubernetes/tracecore/Chart.yaml)
+          ver=$(yq '.version' install/kubernetes/tracecore/Chart.yaml)
+          app=$(yq '.appVersion' install/kubernetes/tracecore/Chart.yaml)
+          [ "$api" = "v2" ] || { echo "apiVersion must be v2 (got $api)"; exit 1; }
+          echo "$ver" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+(-[0-9A-Za-z.\-]+)?(\+[0-9A-Za-z.\-]+)?$' \
+            || { echo "version is not SemVer: $ver"; exit 1; }
+          [ -n "$app" ] && [ "$app" != "null" ] || { echo "appVersion empty"; exit 1; }
+      - name: README — required H2 sections
+        run: |
+          required=("## Install" "## Upgrade" "## Uninstall" "## Values reference" "## Troubleshooting")
+          for h2 in "${required[@]}"; do
+            grep -Fq "$h2" install/kubernetes/tracecore/README.md \
+              || { echo "missing H2 in chart README: $h2"; exit 1; }
+          done
+      - name: conftest — deny fixtures (each must fail)
+        # Any non-zero conftest exit counts as "denied", so an unmatched
+        # glob (passed through as a literal path) would pass the gate
+        # vacuously. nullglob + the empty-array guard close that hole.
+        run: |
+          set +e
+          shopt -s nullglob
+          fixtures=(install/kubernetes/tracecore/policies/conftest/testdata/bad-*.yaml)
+          if [ "${#fixtures[@]}" -eq 0 ]; then
+            echo "::error::no bad-*.yaml fixtures found; deny gate would pass vacuously"
+            exit 1
+          fi
+          fail=0
+          for f in "${fixtures[@]}"; do
+            conftest test \
+              --policy install/kubernetes/tracecore/policies/conftest/tracecore.rego \
+              "$f" > /tmp/conftest.out 2>&1
+            rc=$?
+            if [ $rc -eq 0 ]; then
+              echo "::error::policy did NOT deny $f (expected failure)"
+              cat /tmp/conftest.out
+              fail=1
+            else
+              echo "ok: $f denied"
+            fi
+          done
+          exit $fail
+      - name: conftest — good fixtures + chart render (must pass)
+        run: |
+          conftest test \
+            --policy install/kubernetes/tracecore/policies/conftest/tracecore.rego \
+            install/kubernetes/tracecore/policies/conftest/testdata/good-baseline.yaml \
+            install/kubernetes/tracecore/policies/conftest/testdata/good-sys-ptrace.yaml
+          helm template demo install/kubernetes/tracecore > /tmp/chart-render.yaml
+          conftest test \
+            --policy install/kubernetes/tracecore/policies/conftest/tracecore.rego \
+            /tmp/chart-render.yaml
+
+  install:
+    # End-to-end kind-cluster install. Builds a local Docker image, loads
+    # it into kind, runs `helm install`, waits for the DaemonSet to
+    # reach Ready, asserts the rollout status, then `helm uninstall`s.
+    # Records the wall-clock install-to-Ready duration so the M5b ≤5 min
+    # rubric is falsifiable across runs.
+    name: install (kind)
+    runs-on: ubuntu-latest
+    needs: render
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-go@v6
+        with:
+          go-version-file: go.mod
+          cache: true
+      - name: Install helm
+        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
+        with:
+          version: v3.16.4
+      - name: Build tracecore image
+        run: |
+          docker build \
+            -f install/kubernetes/tracecore/Dockerfile \
+            -t tracecore:ci \
+            .
+      - name: Create kind cluster
+        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
+        with:
+          version: v0.25.0
+          node_image: kindest/node:v1.32.0
+          cluster_name: tracecore-m5b
+      - name: Load image into kind
+        run: kind load docker-image tracecore:ci --name tracecore-m5b
+      - name: helm install + measure install-to-Ready
+        # `id` makes the recorded duration addressable as
+        # steps.install.outputs.install_to_ready_seconds.
+        id: install
+        run: |
+          set -eo pipefail
+          start=$(date +%s)
+          helm install tracecore install/kubernetes/tracecore \
+            --namespace tracecore-system --create-namespace \
+            --set image.repository=tracecore \
+            --set image.tag=ci \
+            --set image.pullPolicy=Never \
+            --wait --timeout 5m
+          ds=$(kubectl -n tracecore-system get daemonset \
+            -l app.kubernetes.io/instance=tracecore \
+            -o jsonpath='{.items[0].metadata.name}')
+          kubectl -n tracecore-system rollout status "daemonset/$ds" --timeout=2m
+          end=$(date +%s)
+          dur=$((end - start))
+          echo "install_to_ready_seconds=$dur" >> "$GITHUB_OUTPUT"
+          echo "::notice::install-to-Ready: ${dur}s (rubric: ≤300s)"
+          test "$dur" -le 300 \
+            || { echo "::error::install-to-Ready ${dur}s exceeds 300s rubric"; exit 1; }
+      - name: "helm status — STATUS: deployed"
+        run: |
+          status=$(helm status tracecore --namespace tracecore-system | grep '^STATUS:' | awk '{print $2}')
+          test "$status" = "deployed" \
+            || { echo "expected STATUS: deployed, got $status"; exit 1; }
+      - name: Probe — collector readiness via port-forward
+        run: |
+          ds=$(kubectl -n tracecore-system get daemonset \
+            -l app.kubernetes.io/instance=tracecore \
+            -o jsonpath='{.items[0].metadata.name}')
+          kubectl -n tracecore-system port-forward "daemonset/$ds" 8888:8888 &
+          pf=$!
+          trap "kill $pf 2>/dev/null || true" EXIT
+          # Retry until the port-forward listener is up rather than
+          # racing it with a fixed sleep.
+          curl -fsS --retry 10 --retry-delay 1 --retry-connrefused http://localhost:8888/readyz
+          curl -fsS http://localhost:8888/healthz
+      - name: helm uninstall
+        if: always()
+        run: |
+          helm uninstall tracecore --namespace tracecore-system || true
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad7c34af..f85b56ff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Pre-alpha. The CLI runs the M1 pipeline runtime end-to-end via factory-based ass
 
 ### Added
 
+- **M5b — Helm chart + minimal-privilege pod spec** — `install/kubernetes/tracecore/` ships a `restricted` Pod Security Standard DaemonSet (`runAsNonRoot`, `runAsUser: 65532`, `seccompProfile: RuntimeDefault`, `allowPrivilegeEscalation: false`, `readOnlyRootFilesystem: true`, `capabilities.drop: [ALL]`), per-receiver `receivers.<name>.enabled` toggles, free-form `config:` override, and a bundled conftest policy that rejects `privileged`, `hostPID`, `hostIPC`, `hostNetwork`, `hostUsers`, `runAsUser: 0`, `runAsGroup: 0`, `procMount: Unmasked`, missing pod `runAsNonRoot`, missing `seccompProfile`, missing `readOnlyRootFilesystem`, `allowPrivilegeEscalation: true`, and capability additions other than `SYS_PTRACE`. Values exposes `priorityClassName`, `tolerations`, `nodeSelector`, `affinity`, `updateStrategy`, and `probes.{liveness,readiness}` so operators tune rollout cadence and scheduling without forking the chart. New `chart` workflow renders the chart, validates rendered configs (`all-receivers-off` + `one-on`) via `tracecore validate`, exercises the conftest policy, asserts probe paths match rendered telemetry paths, and installs the chart end-to-end on a single-node kind cluster.
 - **M1 keystone** — `internal/pipeline` package with `Component`, `Host`, per-signal `Factory` interfaces; `Runtime` with two-phase shutdown (1s receivers + operator-configurable drain budget); `pipelinetest.New(t)` test fixture; first-data instrumentation wrappers.
See [RFC-0003](docs/rfcs/0003-pipeline-runtime-and-component-contract.md). - **`internal/consumer` push interfaces** — per-signal `Metrics`/`Traces`/`Logs` interfaces with `Capabilities() Capabilities` (MutatesData flag), mirroring OTel Collector v0.152.0. - **`internal/safe.Call`** — cgo / vendor-SDK panic-wrapper with named-op error tagging and ctx-respect. diff --git a/docs/FOLLOWUPS.md b/docs/FOLLOWUPS.md index e3898a22..b7b83804 100644 --- a/docs/FOLLOWUPS.md +++ b/docs/FOLLOWUPS.md @@ -732,3 +732,50 @@ deferred are phased here. the second use. Rare edge case in operator-owned YAML; the regex-based redaction catches the common case. Revisit if an operator reports it. + +## M5b chart — opportunistic deferrals + +Items surfaced during PR #29 review that were explicitly held out of +M5b scope. Order is roughly highest-leverage first. + +- [ ] **NetworkPolicy template.** Ship a `templates/networkpolicy.yaml` + gated by `networkPolicy.enabled` (default `false`) that allows + egress to operator-configured exporter endpoints + kubelet probe + traffic. Zero-trust adopters add their own today; the chart + should ship a baseline they can opt into. *Trigger:* the first + adopter request, or when an OTLP exporter receiver lands (M10+) + and the egress shape stops being guess-work. +- [ ] **Image scanning + SBOM gate on the chart container image.** The + `install/kubernetes/tracecore/Dockerfile` is reference-only for + the kind-install CI workflow; M3 owns the canonical release + image and is the right place to wire `trivy` / `syft` / + `cosign`. When M3 lands, replumb `chart.yml` to reuse M3's + scanned image instead of building locally. *Trigger:* M3 + reproducible-build CI lands and exposes a reusable image + reference. +- [ ] **Per-receiver resource guidance docs.** README's "Troubleshooting" + currently has one OOMKilled entry for kernelevents. 
Replace with + a measured table (kernelevents under journald load, dcgm scrape + cardinality) once a real workload runs against tracecore. + *Trigger:* M5 install/overhead benchmark harness produces RSS + measurements per enabled receiver, OR an operator reports + tuning friction. +- [ ] **10-run install-to-Ready median aggregate measurement.** + MILESTONES.md §M5b non-functional rubric asks for a median + "across 10 CI runs on single-node kind." The current workflow + asserts ≤300s on every run and records install-to-Ready in + `$GITHUB_OUTPUT`; the 10-run aggregate accumulates on `main` + post-merge. A scheduled nightly that pulls the recorded values, + computes median + p99, and gates on regression closes the + rubric explicitly. *Trigger:* M5 install-bench harness lands + (its `bench/results/*.json` already captures the shape), OR a + regression is reported. +- [ ] **`appArmorProfile` on the rendered DaemonSet.** Restricted PSS + *permits* AppArmor profile to be undefined, so the chart is + compliant today. Setting `RuntimeDefault` would harden against + clusters with stricter local policy and shave one item off + adopter security checklists. K8s 1.30+ uses + `pod.securityContext.appArmorProfile`; chart targets `>=1.28` + so this needs a version-gated template or values toggle. + *Trigger:* kubeVersion floor moves to >=1.30, or first adopter + asks. diff --git a/install/kubernetes/tracecore/.helmignore b/install/kubernetes/tracecore/.helmignore new file mode 100644 index 00000000..797da3f5 --- /dev/null +++ b/install/kubernetes/tracecore/.helmignore @@ -0,0 +1,13 @@ +.DS_Store +.git/ +.gitignore +*.swp +*.tmp +*.bak +*.orig +*.tgz +ci/ +policies/ +# Dockerfile lives here for the kind-install CI workflow; the packaged +# chart does not ship a container build context to end users. 
+Dockerfile diff --git a/install/kubernetes/tracecore/Chart.yaml b/install/kubernetes/tracecore/Chart.yaml new file mode 100644 index 00000000..77aab204 --- /dev/null +++ b/install/kubernetes/tracecore/Chart.yaml @@ -0,0 +1,48 @@ +apiVersion: v2 +name: tracecore +description: Minimal-privilege DaemonSet for the tracecore OpenTelemetry collector. +type: application +# version: chart-package version. Independent from appVersion; bumped on +# any chart change. Pre-1.0.0 while the install surface evolves. +version: 0.1.0 +# appVersion: the tracecore binary release the chart's default image tag +# points at. Held in sync with the upstream binary by the +# chart-appversion CI gate; do not edit by hand without bumping +# install/kubernetes/tracecore/.appversion-source and rerunning the +# gate. +appVersion: "0.0.0-dev" +kubeVersion: ">=1.28.0-0" +home: https://github.com/tracecoreai/tracecore +sources: + - https://github.com/tracecoreai/tracecore +maintainers: + - name: tracecore maintainers + url: https://github.com/tracecoreai/tracecore +keywords: + - opentelemetry + - observability + - gpu + - kernel-events + - daemonset +annotations: + category: Observability + licenses: Apache-2.0 + # Artifact Hub metadata. Spec: https://artifacthub.io/docs/topics/annotations/helm/ + # `prerelease: "true"` flags the chart as pre-1.0; drop when v0.1.0 ships. + artifacthub.io/license: Apache-2.0 + artifacthub.io/prerelease: "true" + artifacthub.io/links: | + - name: source + url: https://github.com/tracecoreai/tracecore + - name: chart-readme + url: https://github.com/tracecoreai/tracecore/blob/main/install/kubernetes/tracecore/README.md + - name: milestones + url: https://github.com/tracecoreai/tracecore/blob/main/MILESTONES.md + artifacthub.io/changes: | + - kind: added + description: Restricted Pod Security Standard DaemonSet with per-receiver toggles. + - kind: added + description: Bundled conftest policy blocking 13 distinct privilege-escalation paths. 
+ - kind: added + description: Kind-cluster install workflow with helm lint + chart-render + conftest + end-to-end install gates. + artifacthub.io/operator: "false" diff --git a/install/kubernetes/tracecore/Dockerfile b/install/kubernetes/tracecore/Dockerfile new file mode 100644 index 00000000..999ecabb --- /dev/null +++ b/install/kubernetes/tracecore/Dockerfile @@ -0,0 +1,23 @@ +# Reference Dockerfile for kind-cluster install CI. NOT the production +# release image — M3 (reproducible-build) owns the canonical +# `tracecore:<version>` builds. This file lives under the chart so the +# install workflow can build a self-contained image without depending +# on M3 having landed. + +# Image digests pinned for supply-chain integrity. Update by +# `crane digest <image>` (or `docker buildx imagetools inspect <image>`) +# whenever the tag is bumped. +FROM golang:1.26.3-alpine@sha256:91eda9776261207ea25fd06b5b7fed8d397dd2c0a283e77f2ab6e91bfa71079d AS build +WORKDIR /src +COPY . . +RUN apk add --no-cache git ca-certificates && \ + CGO_ENABLED=0 GOOS=linux go build \ + -trimpath \ + -ldflags "-s -w" \ + -o /out/tracecore \ + ./cmd/tracecore + +FROM gcr.io/distroless/static-debian12:nonroot@sha256:a9329520abc449e3b14d5bc3a6ffae065bdde0f02667fa10880c49b35c109fd1 +COPY --from=build /out/tracecore /usr/local/bin/tracecore +USER 65532:65532 +ENTRYPOINT ["/usr/local/bin/tracecore"] diff --git a/install/kubernetes/tracecore/README.md b/install/kubernetes/tracecore/README.md new file mode 100644 index 00000000..6f9d5f8f --- /dev/null +++ b/install/kubernetes/tracecore/README.md @@ -0,0 +1,273 @@ +# tracecore Helm chart + +Minimal-privilege DaemonSet for the [tracecore](https://github.com/tracecoreai/tracecore) +OpenTelemetry collector. Renders a `restricted`-class Pod Security Standard +pod spec by default; per-receiver toggles let an operator opt into +hardware-coupled receivers (DCGM, kernelevents) without changing the +template. 
+ +| Chart attribute | Value | +| --- | --- | +| `apiVersion` | v2 | +| `version` (chart) | 0.1.0 | +| `appVersion` (binary) | tracked to the tracecore release the chart was tested against | +| `kubeVersion` | `>=1.28.0-0` | + +## Install + +Add the chart from a local checkout: + +```bash +helm install tracecore install/kubernetes/tracecore \ + --namespace tracecore-system --create-namespace +``` + +Or render and apply manually for a dry-run review: + +```bash +helm template tracecore install/kubernetes/tracecore \ + --namespace tracecore-system \ + | kubectl apply --dry-run=server -f - +``` + +The default values enable the hardware-free `clockreceiver` paired with +the in-tree `stdoutexporter`; the DaemonSet boots cleanly on a no-GPU +cluster. To enable the GPU `dcgm` receiver or the host-kernel +`kernelevents` receiver, see `values.yaml` and the deviations table in +"Pod Security Standard compliance" below. + +## Upgrade + +The chart follows SemVer. Backwards-incompatible values changes carry a +MAJOR bump and a `BREAKING CHANGES.md` entry under the chart directory. +Patch and minor upgrades: + +```bash +helm upgrade tracecore install/kubernetes/tracecore \ + --namespace tracecore-system --reuse-values +``` + +To inspect the rendered config diff before applying: + +```bash +helm diff upgrade tracecore install/kubernetes/tracecore \ + --namespace tracecore-system +``` + +(requires the [helm-diff plugin](https://github.com/databus23/helm-diff).) + +## Uninstall + +```bash +helm uninstall tracecore --namespace tracecore-system +kubectl delete namespace tracecore-system # only if no other workloads live there +``` + +The chart does not create the namespace and will not delete it on +uninstall. ConfigMaps/ServiceAccounts owned by the release are removed +automatically; PersistentVolumeClaims (if any are added via the +`config:` override) are not. 
+ +## Values reference + +| Path | Type | Default | Purpose | +| --- | --- | --- | --- | +| `namespace` | string | `tracecore-system` | Target namespace for all chart objects. | +| `image.repository` | string | `ghcr.io/tracecoreai/tracecore` | Container image repository. | +| `image.tag` | string | `""` (falls back to `.Chart.AppVersion`) | Override for pinned digests or local kind-loaded images. | +| `image.pullPolicy` | string | `IfNotPresent` | Standard kubelet pull policy. | +| `serviceAccount.create` | bool | `true` | Render a ServiceAccount alongside the DaemonSet. | +| `serviceAccount.automount` | bool | `false` | The collector does not call the API server by default. | +| `podSecurityContext.runAsNonRoot` | bool | `true` | restricted-PSS gate. | +| `podSecurityContext.runAsUser` | int | `65532` | Non-zero UID. | +| `podSecurityContext.seccompProfile.type` | string | `RuntimeDefault` | restricted-PSS gate. | +| `containerSecurityContext.allowPrivilegeEscalation` | bool | `false` | restricted-PSS gate. | +| `containerSecurityContext.readOnlyRootFilesystem` | bool | `true` | tracecore writes only to `/tmp` (emptyDir). | +| `containerSecurityContext.capabilities.drop` | list | `[ALL]` | restricted-PSS gate. | +| `containerSecurityContext.capabilities.add` | list | `[]` | SYS_PTRACE is the only allowed addition; conftest rejects any other. | +| `telemetry.enabled` | bool | `true` | tracecore `/metrics`+`/healthz`+`/readyz` listener. | +| `telemetry.listen` | string | `0.0.0.0:8888` | Pod-IP listener; kubelet probes hit the pod IP. | +| `receivers.<name>.enabled` | bool | varies | Toggle per receiver. `clockreceiver` on by default. | +| `exporters.<name>.enabled` | bool | varies | Toggle per exporter. `stdoutexporter` on by default. | +| `pipelines.<signal>` | map | `metrics: {receivers:[clockreceiver], exporters:[stdoutexporter]}` | Pipeline wiring. References to disabled components are silently dropped at render time. 
| +| `config` | map | `{}` | Free-form override deep-merged INTO the rendered tracecore config last. Do NOT place credentials here; ConfigMaps are unencrypted in etcd. | +| `resources.requests` | map | `{cpu: 10m, memory: 32Mi}` | Conservative defaults; tune for receiver load. | +| `resources.limits` | map | `{cpu: 100m, memory: 128Mi}` | Conservative defaults; tune for receiver load. | +| `updateStrategy` | map | RollingUpdate / maxUnavailable=1 | DaemonSet rollout cadence. Bump `maxUnavailable` to a percentage (e.g. `10%`) on fleets >500 nodes to avoid multi-hour rollouts. | +| `priorityClassName` | string | `""` | PriorityClass for node-pressure eviction survival. Empty falls back to the cluster default; supply a name when tracecore is part of your incident-response surface. | +| `tolerations` | list | `[]` | Drop a `{operator: Exists}` entry to schedule on tainted nodes (control plane, GPU pools). | +| `probes.liveness.{initialDelaySeconds,periodSeconds,failureThreshold}` | map | `{10, 30, 3}` | Liveness probe timing (100s before kubelet restarts on `/healthz` failure). | +| `probes.readiness.{initialDelaySeconds,periodSeconds,failureThreshold}` | map | `{5, 10, 4}` | Readiness probe timing (45s grace window). | + +The chart's authoritative defaults live in +[`values.yaml`](./values.yaml); the table above is a narrative +companion, not the schema. If the two disagree, `values.yaml` wins — +file a bug against this README. + +## Common configurations + +A few worked examples for typical adopter overlays. Save each as a +file and pass with `-f <file>`; `--reuse-values` preserves anything +not overridden. 
+ +**Enable DCGM on every node (requires `nv-hostengine` reachable):** + +```yaml +# dcgm-overlay.yaml +receivers: + clockreceiver: + enabled: false + dcgm: + enabled: true + endpoint: localhost:5555 +pipelines: + metrics: + receivers: [dcgm] + exporters: [stdoutexporter] +``` + +Apply: `helm upgrade tracecore install/kubernetes/tracecore -n tracecore-system -f dcgm-overlay.yaml` + +**Route output to an OTLP backend via the `config` override:** + +```yaml +# otlp-overlay.yaml +exporters: + stdoutexporter: + enabled: false +config: + exporters: + otlphttp: + endpoint: https://collector.example.com:4318 + service: + pipelines: + metrics: + receivers: [clockreceiver] + exporters: [otlphttp] +``` + +**Run on every node including tainted ones (control plane, GPU pools):** + +```yaml +# all-nodes-overlay.yaml +tolerations: + - operator: Exists +``` + +## Troubleshooting + +**Pod stuck in `CrashLoopBackOff` after install.** Run `kubectl logs` +on the failing pod; the most common cause on first install is an +unreachable `image.repository`. The default tag is the chart's +`appVersion` — pre-release clusters need either an explicit +`--set image.tag=<tag>` or a `kind load docker-image` step. + +**`helm install` succeeds but `helm status` reports +`STATUS: deployed` with zero ready pods.** Either no nodes match the +default tolerations (the chart tolerates nothing by default — only +worker nodes are eligible) or the kubelet probe is failing. Inspect +`kubectl describe pod` for taint mismatches and `kubectl logs` for +listener bind errors. Override `tolerations: [{operator: Exists}]` to +schedule on control-plane and GPU-tainted nodes. + +**Receiver shows up in `tracecore validate --explain` but emits no +data.** The receiver is enabled but its hardware/kernel dependency is +unavailable. Check the per-receiver README under +`components/receivers/<name>/`; degraded mode is the documented +contract for missing dependencies. 
+ +**`helm lint` reports `[WARNING]`.** Treat WARNING as error — the CI +gate fails on any WARNING. The common cause is a stale `Chart.yaml` +`apiVersion` (must be v2) or a missing `kubeVersion` clause. + +**Conftest rejects the rendered DaemonSet.** The chart's own output +must pass the bundled policy; if it does not, you have changed the +template in a way that violates the minimum-privilege charter. Re-read +[`policies/conftest/tracecore.rego`](./policies/conftest/tracecore.rego) +and the fixture set under `policies/conftest/testdata/` before +patching the template. + +**`OOMKilled` after enabling `receivers.kernelevents`.** The kernel +event source can buffer large batches under journald load; the chart's +default `resources.limits.memory: 128Mi` is sized for the default +hardware-free configuration. Bump to `256Mi` or higher +(`--set resources.limits.memory=256Mi`) and monitor RSS with +`kubectl top pod`. + +**Rollout takes hours on fleets above ~500 nodes.** Default +`updateStrategy.rollingUpdate.maxUnavailable: 1` × per-node readiness +grace (~45s) serializes the rollout. Override with +`--set updateStrategy.rollingUpdate.maxUnavailable=10%` to parallelize. + +**ImagePullBackOff on first install.** The default image +(`ghcr.io/tracecoreai/tracecore`) is hosted on a public registry; air-gapped +clusters must mirror the image to an internal registry and set +`--set image.repository=<registry>/tracecore` (+ optional +`imagePullSecrets`). For pre-release builds, the M3 release stream +has not landed yet — use `kind load docker-image` against a local +build for evaluation clusters. + +**`helm upgrade --reuse-values` ignores a chart-level default I want.** +`--reuse-values` is intentionally additive: a chart that added a new +field (e.g. `priorityClassName` in chart `0.1.x`) keeps the operator's +old missing-field state. Re-render with explicit overrides or omit +`--reuse-values` to pick up new defaults — `helm diff upgrade` shows +exactly which fields would change. 
+ +**Cluster-wide PSS enforcement.** The chart renders pods that comply +with `restricted`; cluster-level enforcement is the operator's +responsibility. Label the target namespace once at install time: + +```bash +kubectl label namespace tracecore-system \ + pod-security.kubernetes.io/enforce=restricted \ + pod-security.kubernetes.io/audit=restricted \ + pod-security.kubernetes.io/warn=restricted +``` + +## Pod Security Standard compliance + +The chart targets the Kubernetes [`restricted`](https://kubernetes.io/docs/concepts/security/pod-security-standards/) +Pod Security Standard. Every restricted-profile assertion is enforced +by the bundled conftest policy and CI gate: + +| Assertion | Where enforced | +| --- | --- | +| `securityContext.runAsNonRoot: true` | values.yaml `podSecurityContext.runAsNonRoot` | +| `securityContext.runAsUser != 0` | values.yaml `podSecurityContext.runAsUser` (default 65532) | +| `seccompProfile.type: RuntimeDefault` | values.yaml `podSecurityContext.seccompProfile.type` | +| `allowPrivilegeEscalation: false` | values.yaml `containerSecurityContext.allowPrivilegeEscalation` | +| `readOnlyRootFilesystem: true` | values.yaml `containerSecurityContext.readOnlyRootFilesystem` + conftest deny | +| `capabilities.drop: [ALL]` | values.yaml `containerSecurityContext.capabilities.drop` | +| `hostPID: false` | DaemonSet template (not values-tunable) + conftest deny | +| `hostIPC: false` | DaemonSet template (not values-tunable) + conftest deny | +| `hostNetwork: false` | DaemonSet template (not values-tunable) + conftest deny | + +### Documented deviations + +The `restricted` profile permits the empty capability set only. The +chart's deviations from a literal reading of `restricted`: + +1. **`SYS_PTRACE` is allowed in `capabilities.add`.** Some receivers + (e.g. future host-process inspection receivers) need `ptrace` to + read `/proc/<pid>` of other processes for failure attribution. 
The + capability is in the conftest allowlist; any other addition rejects + the build. + +2. **Host-path mounts are required for some receivers.** Enabling + `receivers.kernelevents` requires `hostPath` mounts (`/dev/kmsg` + read-only, optionally `/var/log/journal` and + `/run/systemd/journal` for the journald source). The chart does + not render those mounts by default; operators opt in via the + `config:` override and accept the deviation. + +3. **DCGM standalone mode connects to `nv-hostengine`.** When + `receivers.dcgm.enabled=true` and `mode=standalone`, the + DaemonSet connects to an external DCGM endpoint specified in + values. The chart does not run `nv-hostengine` in-process and does + not add capabilities for it. Embedded mode is out of scope for the + default chart. + +Each deviation is bounded by the conftest policy: the policy only +permits SYS_PTRACE, never relaxes hostPID/hostIPC/hostNetwork, and +fails the CI gate on any privileged container. diff --git a/install/kubernetes/tracecore/ci/all-receivers-off-values.yaml b/install/kubernetes/tracecore/ci/all-receivers-off-values.yaml new file mode 100644 index 00000000..ff18c164 --- /dev/null +++ b/install/kubernetes/tracecore/ci/all-receivers-off-values.yaml @@ -0,0 +1,16 @@ +# Used by the chart-render CI gate. Disables every receiver and +# exporter; rendered config has no pipelines and only the telemetry +# block. tracecore validate exits 0 on this minimal config. +receivers: + clockreceiver: + enabled: false + dcgm: + enabled: false + kernelevents: + enabled: false + +exporters: + stdoutexporter: + enabled: false + +pipelines: {} diff --git a/install/kubernetes/tracecore/ci/one-receiver-on-values.yaml b/install/kubernetes/tracecore/ci/one-receiver-on-values.yaml new file mode 100644 index 00000000..ad631052 --- /dev/null +++ b/install/kubernetes/tracecore/ci/one-receiver-on-values.yaml @@ -0,0 +1,20 @@ +# Used by the chart-render CI gate. 
Enables only clockreceiver (the +# hardware-free default) and the stdoutexporter; the rendered config +# has exactly one pipeline. +receivers: + clockreceiver: + enabled: true + interval: 1s + dcgm: + enabled: false + kernelevents: + enabled: false + +exporters: + stdoutexporter: + enabled: true + +pipelines: + metrics: + receivers: [clockreceiver] + exporters: [stdoutexporter] diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/README.md b/install/kubernetes/tracecore/policies/conftest/testdata/README.md new file mode 100644 index 00000000..8fb2e8bc --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/README.md @@ -0,0 +1,44 @@ +# conftest fixtures + +Each `bad-*.yaml` violates exactly one rule in +[`../tracecore.rego`](../tracecore.rego) and asserts the chart's +minimum-privilege charter is testable in isolation. The chart CI gate +loops over `bad-*.yaml` and fails if any fixture does **not** deny. + +`good-baseline.yaml` and `good-sys-ptrace.yaml` are the inverse: they +must pass the policy. The chart's own `helm template` output is also +fed through the policy in CI. + +## Adding a fixture + +Every fixture must carry the pod-level securityContext the new rules +enforce so it fails on the rule it is meant to test, not on the policy +floor: + +```yaml +spec: + template: + spec: + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } +``` + +The deviating field is the one the rule under test denies. Run the +fixture locally before pushing: + +```bash +conftest test \ + --policy install/kubernetes/tracecore/policies/conftest/tracecore.rego \ + install/kubernetes/tracecore/policies/conftest/testdata/bad-<rule>.yaml +``` + +The output should show exactly one `FAIL` line naming your rule, and +the command should exit with code 1.
diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-allowprivilegeescalation.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-allowprivilegeescalation.yaml new file mode 100644 index 00000000..62cf98fb --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-allowprivilegeescalation.yaml @@ -0,0 +1,18 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-ape } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: true + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-disallowed-cap.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-disallowed-cap.yaml new file mode 100644 index 00000000..eb054b4c --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-disallowed-cap.yaml @@ -0,0 +1,20 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-cap } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: [ALL] + add: [NET_ADMIN] diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostipc.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostipc.yaml new file mode 100644 index 00000000..5199f03b --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostipc.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-hostipc } +spec: + selector: { matchLabels: { app: bad } } + template: + 
metadata: { labels: { app: bad } } + spec: + hostIPC: true + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostnetwork.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostnetwork.yaml new file mode 100644 index 00000000..e15bb8de --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostnetwork.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-hostnetwork } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + hostNetwork: true + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostpid.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostpid.yaml new file mode 100644 index 00000000..261122e0 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostpid.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-hostpid } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + hostPID: true + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostusers.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostusers.yaml new file mode 
100644 index 00000000..59acfdb5 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-hostusers.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-hostusers } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + hostUsers: true + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-missing-rorootfs.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-missing-rorootfs.yaml new file mode 100644 index 00000000..1092b4f8 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-missing-rorootfs.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-rorootfs } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-missing-seccomp.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-missing-seccomp.yaml new file mode 100644 index 00000000..a8c71dc8 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-missing-seccomp.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-seccomp } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + 
capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-privileged.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-privileged.yaml new file mode 100644 index 00000000..8dd0dda3 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-privileged.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-privileged } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + privileged: true + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-procmount.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-procmount.yaml new file mode 100644 index 00000000..d72e0ff8 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-procmount.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-procmount } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + procMount: Unmasked + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasgroup-0.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasgroup-0.yaml new file mode 100644 index 00000000..04a91c81 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasgroup-0.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-gid0 } +spec: + selector: { matchLabels: { app: 
bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + runAsGroup: 0 + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasroot.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasroot.yaml new file mode 100644 index 00000000..8940a210 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasroot.yaml @@ -0,0 +1,18 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-runasroot } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: false + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasuser-0.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasuser-0.yaml new file mode 100644 index 00000000..72b11330 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/bad-runasuser-0.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: bad-uid0 } +spec: + selector: { matchLabels: { app: bad } } + template: + metadata: { labels: { app: bad } } + spec: + securityContext: + runAsNonRoot: true + runAsUser: 0 + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: nginx + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/good-baseline.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/good-baseline.yaml new file 
mode 100644 index 00000000..6987fa6c --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/good-baseline.yaml @@ -0,0 +1,23 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: good-baseline } +spec: + selector: { matchLabels: { app: good } } + template: + metadata: { labels: { app: good } } + spec: + hostPID: false + hostIPC: false + hostNetwork: false + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: ghcr.io/tracecoreai/tracecore:0.0.0-dev + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: [ALL] + add: [] diff --git a/install/kubernetes/tracecore/policies/conftest/testdata/good-sys-ptrace.yaml b/install/kubernetes/tracecore/policies/conftest/testdata/good-sys-ptrace.yaml new file mode 100644 index 00000000..beed3e30 --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/testdata/good-sys-ptrace.yaml @@ -0,0 +1,23 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: { name: good-sysptrace } +spec: + selector: { matchLabels: { app: good } } + template: + metadata: { labels: { app: good } } + spec: + hostPID: false + hostIPC: false + hostNetwork: false + securityContext: + runAsNonRoot: true + seccompProfile: { type: RuntimeDefault } + containers: + - name: c + image: ghcr.io/tracecoreai/tracecore:0.0.0-dev + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: [ALL] + add: [SYS_PTRACE] diff --git a/install/kubernetes/tracecore/policies/conftest/tracecore.rego b/install/kubernetes/tracecore/policies/conftest/tracecore.rego new file mode 100644 index 00000000..8fea6aff --- /dev/null +++ b/install/kubernetes/tracecore/policies/conftest/tracecore.rego @@ -0,0 +1,134 @@ +# Conftest / Open Policy Agent policy for the tracecore Helm chart. 
+# Rejects manifests that violate the project's minimum-privilege
+# charter: privileged containers, host namespace sharing, missing
+# read-only root filesystem, or any Linux capability other than
+# SYS_PTRACE. Further rules in this file also reject a missing seccomp
+# profile, UID/GID 0, allowPrivilegeEscalation=true and
+# procMount=Unmasked.
+#
+# The policy is the canonical machine-readable interpretation of
+# `MILESTONES.md` §M5b functional rubric #6. Editing rules here is a
+# load-bearing change — keep the rubric updated in lockstep.
+
+package main
+
+import rego.v1
+
+# Pod template of the workload object under test. Undefined for
+# documents that have no spec.template.spec; every rule that references
+# pod_spec (or all_containers, which is built from it) then does not fire.
+pod_spec := input.spec.template.spec
+
+# initContainers followed by containers; a missing list counts as empty.
+# ephemeralContainers are not part of this list.
+all_containers := array.concat(
+    object.get(pod_spec, "initContainers", []),
+    object.get(pod_spec, "containers", []),
+)
+
+# Host namespace sharing. Each rule fires only when the field is
+# explicitly true; an omitted field is accepted.
+deny contains msg if {
+    input.spec.template.spec
+    pod_spec.hostPID == true
+    msg := sprintf("%s/%s sets hostPID=true; host PID namespace sharing is forbidden", [input.kind, input.metadata.name])
+}
+
+deny contains msg if {
+    input.spec.template.spec
+    pod_spec.hostIPC == true
+    msg := sprintf("%s/%s sets hostIPC=true; host IPC namespace sharing is forbidden", [input.kind, input.metadata.name])
+}
+
+deny contains msg if {
+    input.spec.template.spec
+    pod_spec.hostNetwork == true
+    msg := sprintf("%s/%s sets hostNetwork=true; host network namespace sharing is forbidden", [input.kind, input.metadata.name])
+}
+
+# hostUsers=true is rejected; an omitted hostUsers field is accepted.
+# NOTE(review): this comment previously read "Kubernetes 1.30+, GA in
+# 1.31: forbidden under restricted PSS". The upstream Pod Security
+# Standards page does not appear to list hostUsers as a restricted
+# control, and the GA version looks wrong — confirm both, and treat
+# this rule as project-level hardening until then.
+deny contains msg if {
+    input.spec.template.spec
+    pod_spec.hostUsers == true
+    msg := sprintf("%s/%s sets hostUsers=true; user-namespace sharing with the host is forbidden", [input.kind, input.metadata.name])
+}
+
+# Pod-level runAsNonRoot — restricted PSS requires this OR every container's
+# securityContext.runAsNonRoot=true. The chart sets it at pod level; the
+# policy enforces that contract so a values-override can't downgrade.
+#
+# Guarded with input.spec.template.spec so the rule only fires on
+# pod-bearing objects (Deployment, DaemonSet, StatefulSet, Job, …);
+# ConfigMap / ServiceAccount documents are exempt.
+deny contains msg if {
+    input.spec.template.spec
+    not pod_spec.securityContext.runAsNonRoot == true
+    msg := sprintf("%s/%s must set pod securityContext.runAsNonRoot=true", [input.kind, input.metadata.name])
+}
+
+# seccompProfile.type — restricted PSS requires RuntimeDefault or Localhost
+# at either pod or container level. Accept either placement; reject only
+# when neither carries a non-Unconfined profile.
+deny contains msg if {
+    input.spec.template.spec
+    not pod_has_seccomp
+    not all_containers_have_seccomp
+    msg := sprintf("%s/%s must set seccompProfile.type (RuntimeDefault or Localhost) at pod or every container", [input.kind, input.metadata.name])
+}
+
+# True when the pod-level seccompProfile.type is present and is anything
+# other than "Unconfined". A missing field leaves this undefined.
+pod_has_seccomp if {
+    t := pod_spec.securityContext.seccompProfile.type
+    t != "Unconfined"
+}
+
+# True when there is at least one container and every container (init
+# containers included) declares a seccompProfile.type other than
+# "Unconfined". A container without the field makes this undefined.
+all_containers_have_seccomp if {
+    count(all_containers) > 0
+    every c in all_containers {
+        t := c.securityContext.seccompProfile.type
+        t != "Unconfined"
+    }
+}
+
+# UID 0 / GID 0 forbidden — restricted PSS requires non-root. The chart
+# default sets runAsUser/Group to 65532; this rule rejects a values
+# override that downgrades to root.
+deny contains msg if {
+    input.spec.template.spec
+    pod_spec.securityContext.runAsUser == 0
+    msg := sprintf("%s/%s sets runAsUser=0; root execution is forbidden", [input.kind, input.metadata.name])
+}
+
+deny contains msg if {
+    input.spec.template.spec
+    pod_spec.securityContext.runAsGroup == 0
+    msg := sprintf("%s/%s sets runAsGroup=0; root group is forbidden", [input.kind, input.metadata.name])
+}
+
+# Container-level overrides. A container securityContext that re-declares
+# runAsNonRoot, runAsUser, runAsGroup or seccompProfile takes precedence
+# over the pod-level value, so the pod-level checks above are not enough
+# on their own: a values override of containerSecurityContext could
+# otherwise run a container as root or unconfined while the pod-level
+# block still looks compliant. Only explicit downgrades are rejected;
+# containers that omit these fields keep inheriting the pod-level value.
+deny contains msg if {
+    some c in all_containers
+    c.securityContext.runAsNonRoot == false
+    msg := sprintf("container %q sets runAsNonRoot=false; containers must not override the pod-level non-root requirement", [c.name])
+}
+
+deny contains msg if {
+    some c in all_containers
+    c.securityContext.runAsUser == 0
+    msg := sprintf("container %q sets runAsUser=0; root execution is forbidden", [c.name])
+}
+
+deny contains msg if {
+    some c in all_containers
+    c.securityContext.runAsGroup == 0
+    msg := sprintf("container %q sets runAsGroup=0; root group is forbidden", [c.name])
+}
+
+deny contains msg if {
+    some c in all_containers
+    c.securityContext.seccompProfile.type == "Unconfined"
+    msg := sprintf("container %q sets seccompProfile.type=Unconfined; only RuntimeDefault or Localhost is allowed", [c.name])
+}
+
+# procMount: restricted PSS requires Default. Reject Unmasked, which
+# exposes host /proc paths kubelet ordinarily masks.
+deny contains msg if { + some c in all_containers + c.securityContext.procMount == "Unmasked" + msg := sprintf("container %q sets procMount=Unmasked; only Default is allowed", [c.name]) +} + +deny contains msg if { + some c in all_containers + c.securityContext.privileged == true + msg := sprintf("container %q is privileged=true; privileged containers are forbidden", [c.name]) +} + +deny contains msg if { + some c in all_containers + c.securityContext.allowPrivilegeEscalation == true + msg := sprintf("container %q sets allowPrivilegeEscalation=true; must be false", [c.name]) +} + +deny contains msg if { + some c in all_containers + not c.securityContext.readOnlyRootFilesystem == true + msg := sprintf("container %q must set securityContext.readOnlyRootFilesystem=true", [c.name]) +} + +# SYS_PTRACE is the sole capability the chart's minimum-privilege +# charter permits to be added back after `drop: [ALL]`. Reject any +# other capability addition. +allowed_capabilities := {"SYS_PTRACE"} + +deny contains msg if { + some c in all_containers + some cap in object.get(c.securityContext.capabilities, "add", []) + not allowed_capabilities[cap] + msg := sprintf("container %q adds capability %q; only SYS_PTRACE is allowed", [c.name, cap]) +} diff --git a/install/kubernetes/tracecore/templates/NOTES.txt b/install/kubernetes/tracecore/templates/NOTES.txt new file mode 100644 index 00000000..b2d51bf2 --- /dev/null +++ b/install/kubernetes/tracecore/templates/NOTES.txt @@ -0,0 +1,39 @@ +tracecore has been installed. + + Release: {{ .Release.Name }} + Namespace: {{ .Values.namespace }} + Image: {{ include "tracecore.image" . }} + AppVersion: {{ .Chart.AppVersion }} + +Inspect the DaemonSet: + kubectl -n {{ .Values.namespace }} get daemonset {{ include "tracecore.fullname" . 
}} + +Tail logs from one pod: + kubectl -n {{ .Values.namespace }} logs -l app.kubernetes.io/instance={{ .Release.Name }} --tail=50 -f + +Validate the rendered tracecore config locally (run from a checkout +of the tracecore repository so the chart path resolves): + helm template {{ .Release.Name }} install/kubernetes/tracecore \ + --show-only templates/configmap.yaml \ + | yq '.data["config.yaml"]' > /tmp/tracecore.yaml + tracecore validate --config=/tmp/tracecore.yaml + +{{- if and .Values.exporters.stdoutexporter.enabled (eq .Values.image.repository "ghcr.io/tracecoreai/tracecore") }} + +Note: exporters.stdoutexporter is enabled by default and writes metrics +to pod stdout (visible via `kubectl logs`). Swap it for an OTLP/Datadog/ +ClickHouse exporter via the `config:` override before treating the +DaemonSet as a steady-state production deployment. +{{- end }} +{{- if not .Values.receivers.dcgm.enabled }} + +Note: receivers.dcgm is disabled by default. Enable it on hosts with an +nv-hostengine reachable on receivers.dcgm.endpoint. +{{- end }} +{{- if not .Values.receivers.kernelevents.enabled }} + +Note: receivers.kernelevents is disabled by default. Enabling it +requires hostPath volume mounts and SYS_PTRACE (kmsg) or journald +access; review the chart README's "Pod Security Standard compliance" +section before enabling. +{{- end }} diff --git a/install/kubernetes/tracecore/templates/_helpers.tpl b/install/kubernetes/tracecore/templates/_helpers.tpl new file mode 100644 index 00000000..afce2441 --- /dev/null +++ b/install/kubernetes/tracecore/templates/_helpers.tpl @@ -0,0 +1,125 @@ +{{/* +Common chart helpers — name, fullname, labels, selectorLabels. 
+*/}} + +{{- define "tracecore.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- define "tracecore.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{- define "tracecore.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- define "tracecore.labels" -}} +helm.sh/chart: {{ include "tracecore.chart" . }} +{{ include "tracecore.selectorLabels" . }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{- define "tracecore.selectorLabels" -}} +app.kubernetes.io/name: {{ include "tracecore.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{- define "tracecore.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} +{{- default (include "tracecore.fullname" .) .Values.serviceAccount.name -}} +{{- else -}} +{{- default "default" .Values.serviceAccount.name -}} +{{- end -}} +{{- end -}} + +{{- define "tracecore.image" -}} +{{- $tag := .Values.image.tag | default .Chart.AppVersion -}} +{{- printf "%s:%s" .Values.image.repository $tag -}} +{{- end -}} + +{{/* +tracecore.renderedConfig — assembles the tracecore YAML body that the +ConfigMap ships. Walks per-receiver enabled toggles, joins with +exporters/telemetry/pipelines, then deep-merges the free-form +.Values.config override on top so operators can land arbitrary blocks +(processors, extensions) without templating them through values. 
+
+The intermediate dict shape mirrors `internal/config/config.go`:
+  receivers: { <name>: <config> }
+  exporters: { <name>: <config> }
+  service.pipelines.<key>: { receivers: [...], exporters: [...] }
+  telemetry: { enabled: bool, listen: ..., paths: {metrics, healthz, readyz} }
+
+A pipeline's optional `processors` list (accepted by values.schema.json)
+is copied through verbatim. Processors have no per-component toggle in
+values, so unlike receivers/exporters the list cannot be filtered here;
+the referenced processors are expected to come from the `config:` override.
+*/}}
+{{- define "tracecore.renderedConfig" -}}
+{{- $built := dict -}}
+
+{{/* Receivers — include only enabled blocks; strip the `enabled` key. */}}
+{{- $recvs := dict -}}
+{{- range $name, $cfg := .Values.receivers -}}
+  {{- if $cfg.enabled -}}
+    {{- $body := omit $cfg "enabled" -}}
+    {{- $_ := set $recvs $name $body -}}
+  {{- end -}}
+{{- end -}}
+{{- if gt (len $recvs) 0 -}}
+  {{- $_ := set $built "receivers" $recvs -}}
+{{- end -}}
+
+{{/* Exporters — same shape. */}}
+{{- $exps := dict -}}
+{{- range $name, $cfg := .Values.exporters -}}
+  {{- if $cfg.enabled -}}
+    {{- $body := omit $cfg "enabled" -}}
+    {{- $_ := set $exps $name $body -}}
+  {{- end -}}
+{{- end -}}
+{{- if gt (len $exps) 0 -}}
+  {{- $_ := set $built "exporters" $exps -}}
+{{- end -}}
+
+{{/* Pipelines — drop refs to disabled components so partial overrides validate. */}}
+{{/* A pipeline left without any receiver or any exporter is omitted entirely. */}}
+{{- $pipes := dict -}}
+{{- range $key, $p := .Values.pipelines -}}
+  {{- $kept_r := list -}}
+  {{- range $r := $p.receivers -}}
+    {{- if hasKey $recvs $r -}}
+      {{- $kept_r = append $kept_r $r -}}
+    {{- end -}}
+  {{- end -}}
+  {{- $kept_e := list -}}
+  {{- range $e := $p.exporters -}}
+    {{- if hasKey $exps $e -}}
+      {{- $kept_e = append $kept_e $e -}}
+    {{- end -}}
+  {{- end -}}
+  {{- if and (gt (len $kept_r) 0) (gt (len $kept_e) 0) -}}
+    {{- $entry := dict "receivers" $kept_r "exporters" $kept_e -}}
+    {{/* Keep a non-empty processors list instead of silently discarding it. */}}
+    {{- with $p.processors -}}
+      {{- $_ := set $entry "processors" . -}}
+    {{- end -}}
+    {{- $_ := set $pipes $key $entry -}}
+  {{- end -}}
+{{- end -}}
+{{- if gt (len $pipes) 0 -}}
+  {{- $service := dict "pipelines" $pipes -}}
+  {{- $_ := set $built "service" $service -}}
+{{- end -}}
+
+{{/* Telemetry surface — opt-in but defaults to enabled in values.yaml.
*/}} +{{- if .Values.telemetry.enabled -}} + {{- $tele := dict "enabled" true "listen" .Values.telemetry.listen "paths" .Values.telemetry.paths -}} + {{- $_ := set $built "telemetry" $tele -}} +{{- end -}} + +{{/* Deep-merge free-form .Values.config last so operators can override. */}} +{{- $merged := mustMergeOverwrite $built (.Values.config | default dict) -}} +{{- toYaml $merged -}} +{{- end -}} diff --git a/install/kubernetes/tracecore/templates/configmap.yaml b/install/kubernetes/tracecore/templates/configmap.yaml new file mode 100644 index 00000000..dc6457d5 --- /dev/null +++ b/install/kubernetes/tracecore/templates/configmap.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "tracecore.fullname" . }}-config + namespace: {{ .Values.namespace }} + labels: {{- include "tracecore.labels" . | nindent 4 }} +data: + config.yaml: | +{{ include "tracecore.renderedConfig" . | indent 4 }} diff --git a/install/kubernetes/tracecore/templates/daemonset.yaml b/install/kubernetes/tracecore/templates/daemonset.yaml new file mode 100644 index 00000000..9c4c47d1 --- /dev/null +++ b/install/kubernetes/tracecore/templates/daemonset.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "tracecore.fullname" . }} + namespace: {{ .Values.namespace }} + labels: {{- include "tracecore.labels" . | nindent 4 }} +spec: + selector: + matchLabels: {{- include "tracecore.selectorLabels" . | nindent 6 }} + updateStrategy: {{- toYaml .Values.updateStrategy | nindent 4 }} + template: + metadata: + labels: {{- include "tracecore.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + annotations: + checksum/config: {{ include "tracecore.renderedConfig" . | sha256sum }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "tracecore.serviceAccountName" . 
}} + automountServiceAccountToken: {{ .Values.serviceAccount.automount }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: {{- toYaml . | nindent 8 }} + {{- end }} + hostNetwork: false + hostPID: false + hostIPC: false + {{- with .Values.priorityClassName }} + priorityClassName: {{ . | quote }} + {{- end }} + securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- with .Values.nodeSelector }} + nodeSelector: {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: tracecore + image: {{ include "tracecore.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + - collect + - --config=/etc/tracecore/config.yaml + securityContext: {{- toYaml .Values.containerSecurityContext | nindent 12 }} + {{- if .Values.telemetry.enabled }} + {{- $listen := .Values.telemetry.listen -}} + {{- $port := regexReplaceAll ".*:" $listen "" }} + ports: + - name: telemetry + containerPort: {{ $port | int }} + protocol: TCP + livenessProbe: + httpGet: + path: {{ .Values.telemetry.paths.healthz }} + port: telemetry + initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }} + periodSeconds: {{ .Values.probes.liveness.periodSeconds }} + failureThreshold: {{ .Values.probes.liveness.failureThreshold }} + readinessProbe: + httpGet: + path: {{ .Values.telemetry.paths.readyz }} + port: telemetry + initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }} + periodSeconds: {{ .Values.probes.readiness.periodSeconds }} + failureThreshold: {{ .Values.probes.readiness.failureThreshold }} + {{- end }} + resources: {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: config + mountPath: /etc/tracecore + readOnly: true + - name: tmp + mountPath: /tmp + volumes: + - name: config + configMap: + name: {{ include "tracecore.fullname" . 
}}-config + - name: tmp + emptyDir: {} diff --git a/install/kubernetes/tracecore/templates/serviceaccount.yaml b/install/kubernetes/tracecore/templates/serviceaccount.yaml new file mode 100644 index 00000000..fe229efe --- /dev/null +++ b/install/kubernetes/tracecore/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "tracecore.serviceAccountName" . }} + namespace: {{ .Values.namespace }} + labels: {{- include "tracecore.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automount }} +{{- end }} diff --git a/install/kubernetes/tracecore/values.schema.json b/install/kubernetes/tracecore/values.schema.json new file mode 100644 index 00000000..afd1cf8e --- /dev/null +++ b/install/kubernetes/tracecore/values.schema.json @@ -0,0 +1,188 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "tracecore Helm chart values", + "description": "Schema for install/kubernetes/tracecore/values.yaml. 
Validated by `helm install` / `helm template` when the chart is rendered; rejects malformed overlays before they reach the cluster.", + "type": "object", + "additionalProperties": false, + "required": ["namespace", "image", "podSecurityContext", "containerSecurityContext", "telemetry", "receivers", "exporters"], + "properties": { + "namespace": { "type": "string", "minLength": 1 }, + "nameOverride": { "type": "string" }, + "fullnameOverride": { "type": "string" }, + + "image": { + "type": "object", + "additionalProperties": false, + "required": ["repository", "pullPolicy"], + "properties": { + "repository": { "type": "string", "minLength": 1 }, + "pullPolicy": { "type": "string", "enum": ["Always", "IfNotPresent", "Never"] }, + "tag": { "type": "string" } + } + }, + + "imagePullSecrets": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["name"], + "properties": { "name": { "type": "string", "minLength": 1 } } + } + }, + + "serviceAccount": { + "type": "object", + "additionalProperties": false, + "properties": { + "create": { "type": "boolean" }, + "name": { "type": "string" }, + "annotations": { "type": "object" }, + "automount": { "type": "boolean" } + } + }, + + "podAnnotations": { "type": "object" }, + "podLabels": { "type": "object" }, + + "resources": { + "type": "object", + "additionalProperties": false, + "properties": { + "requests": { "type": "object" }, + "limits": { "type": "object" } + } + }, + + "podSecurityContext": { + "type": "object", + "additionalProperties": true, + "properties": { + "runAsNonRoot": { "type": "boolean" }, + "runAsUser": { "type": "integer", "minimum": 1 }, + "runAsGroup": { "type": "integer", "minimum": 1 }, + "fsGroup": { "type": "integer" }, + "seccompProfile": { + "type": "object", + "additionalProperties": false, + "required": ["type"], + "properties": { + "type": { "type": "string", "enum": ["RuntimeDefault", "Localhost"] }, + "localhostProfile": { "type": "string" } + } + 
} + } + }, + + "containerSecurityContext": { + "type": "object", + "additionalProperties": true, + "properties": { + "allowPrivilegeEscalation": { "type": "boolean", "const": false }, + "readOnlyRootFilesystem": { "type": "boolean", "const": true }, + "capabilities": { + "type": "object", + "additionalProperties": false, + "properties": { + "drop": { "type": "array", "items": { "type": "string" } }, + "add": { + "type": "array", + "items": { "type": "string", "enum": ["SYS_PTRACE"] } + } + } + } + } + }, + + "telemetry": { + "type": "object", + "additionalProperties": false, + "required": ["enabled"], + "properties": { + "enabled": { "type": "boolean" }, + "listen": { "type": "string" }, + "paths": { + "type": "object", + "additionalProperties": false, + "properties": { + "metrics": { "type": "string", "pattern": "^/" }, + "healthz": { "type": "string", "pattern": "^/" }, + "readyz": { "type": "string", "pattern": "^/" } + } + } + } + }, + + "receivers": { + "type": "object", + "additionalProperties": { + "type": "object", + "required": ["enabled"], + "properties": { "enabled": { "type": "boolean" } } + } + }, + + "exporters": { + "type": "object", + "additionalProperties": { + "type": "object", + "required": ["enabled"], + "properties": { "enabled": { "type": "boolean" } } + } + }, + + "pipelines": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": false, + "properties": { + "receivers": { "type": "array", "items": { "type": "string" } }, + "processors": { "type": "array", "items": { "type": "string" } }, + "exporters": { "type": "array", "items": { "type": "string" } } + } + } + }, + + "config": { "type": "object" }, + + "updateStrategy": { + "type": "object", + "additionalProperties": true, + "properties": { + "type": { "type": "string", "enum": ["RollingUpdate", "OnDelete"] }, + "rollingUpdate": { "type": "object" } + } + }, + + "priorityClassName": { "type": "string" }, + + "tolerations": { "type": "array", "items": 
{ "type": "object" } }, + "nodeSelector": { "type": "object" }, + "affinity": { "type": "object" }, + + "probes": { + "type": "object", + "additionalProperties": false, + "required": ["liveness", "readiness"], + "properties": { + "liveness": { "$ref": "#/$defs/probe" }, + "readiness": { "$ref": "#/$defs/probe" } + } + } + }, + + "$defs": { + "probe": { + "type": "object", + "additionalProperties": true, + "properties": { + "initialDelaySeconds": { "type": "integer", "minimum": 0 }, + "periodSeconds": { "type": "integer", "minimum": 1 }, + "failureThreshold": { "type": "integer", "minimum": 1 }, + "timeoutSeconds": { "type": "integer", "minimum": 1 }, + "successThreshold": { "type": "integer", "minimum": 1 } + } + } + } +} diff --git a/install/kubernetes/tracecore/values.yaml b/install/kubernetes/tracecore/values.yaml new file mode 100644 index 00000000..3ecbdeed --- /dev/null +++ b/install/kubernetes/tracecore/values.yaml @@ -0,0 +1,153 @@ +# tracecore Helm chart — operator-facing knobs. Anything not exposed +# here can be overridden via the free-form `config:` block defined +# below, which is deep-merged INTO the rendered tracecore config last. + +# Namespace the chart objects land in. The chart does NOT create the +# namespace; deploy with `helm install --create-namespace` if it does +# not already exist. +namespace: tracecore-system + +nameOverride: "" +fullnameOverride: "" + +image: + repository: ghcr.io/tracecoreai/tracecore + pullPolicy: IfNotPresent + # tag defaults to .Chart.AppVersion when empty. Override for local + # kind-loaded images (e.g. "kind-loaded") or for pinning by digest. + tag: "" + +imagePullSecrets: [] + +serviceAccount: + create: true + # name defaults to the fullname template when empty. + name: "" + annotations: {} + # automount is false because the collector does not call the + # kube-apiserver.
The SA is still created so future receivers + # (k8s events, in-cluster discovery) have a stable identity to + # attach RBAC to without a chart-shape break. + automount: false + +podAnnotations: {} +podLabels: {} + +resources: + requests: + cpu: 10m + memory: 32Mi + limits: + cpu: 100m + memory: 128Mi + +# Pod-level security context. Defaults satisfy the Kubernetes restricted +# Pod Security Standard. The DaemonSet template also pins +# `hostPID: false`, `hostIPC: false`, and `hostNetwork: false` at the +# pod spec level — those are not values-tunable, by design. +podSecurityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault + +# Container-level security context. The capabilities allowlist is +# bounded to SYS_PTRACE by the chart's conftest policy; any other +# capability fails the build. Default is an empty `add` list (drop ALL, +# add none) — receivers that need SYS_PTRACE must opt in explicitly. +containerSecurityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: [ALL] + add: [] + +# tracecore self-telemetry surface (Pod /metrics, /healthz, /readyz). +# Defaults to enabled because kubelet probes need a working listener. +# Set telemetry.enabled=false to skip the surface entirely; probes are +# then omitted from the DaemonSet (no listener => no kubelet probe). +telemetry: + enabled: true + listen: "0.0.0.0:8888" + paths: + metrics: /metrics + healthz: /healthz + readyz: /readyz + +# Per-receiver toggles. Only enabled receivers render into the +# tracecore config. clockreceiver is the hardware-free default so a +# fresh install on a no-GPU cluster boots and emits metrics +# immediately. +receivers: + clockreceiver: + enabled: true + interval: 1s + dcgm: + enabled: false + mode: standalone + # endpoint must point at a reachable nv-hostengine on every node the + # DaemonSet schedules on; otherwise the pod enters degraded mode + # permanently.
Verify with `nv-hostengine --version` on the host + # before enabling, and restrict scheduling via affinity/nodeSelector + # if only a subset of the fleet runs hostengine. + endpoint: localhost:5555 + collection_interval: 15s + kernelevents: + enabled: false + min_severity: info + +exporters: + stdoutexporter: + enabled: true + +# service.pipelines wiring. The defaults pair clockreceiver with +# stdoutexporter. Override to wire dcgm + kernelevents + custom +# exporters; entries whose components are not enabled above are +# silently dropped at render time so a partial override still +# validates. +pipelines: + metrics: + receivers: [clockreceiver] + exporters: [stdoutexporter] + +# Free-form override deep-merged INTO the rendered tracecore config +# last. Use for fields the structured values above don't expose +# (processors, extensions, additional pipeline names). +# +# DO NOT inject credentials here. The block lands in a Kubernetes +# ConfigMap, which is stored unencrypted in etcd by default. For +# secrets (API keys, exporter auth), mount a `Secret` via `envFrom` +# in a custom DaemonSet patch and reference the env var from the +# tracecore config. +config: {} + +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + +# priorityClassName: set when the pods must survive node-pressure +# evictions (incident-response surface). Empty string falls back to +# the cluster default. `system-node-critical` requires the namespace +# to be a `kube-system`-class privileged namespace; most operators +# will want a custom PriorityClass. +priorityClassName: "" + +tolerations: [] +nodeSelector: {} +affinity: {} + +# Probe timing — readiness has a ~45s grace window +# (initialDelaySeconds + periodSeconds * failureThreshold) so a slow +# kubelet or congested disk does not flip pods NotReady mid-rollout.
+probes:  # values schema requires both liveness and readiness
+  liveness:  # same formula as readiness: 10 + 30 * 3 = 100s
+    initialDelaySeconds: 10  # schema: integer >= 0
+    periodSeconds: 30  # schema: integer >= 1
+    failureThreshold: 3  # schema: integer >= 1
+  readiness:  # 5 + 10 * 4 = 45s, the grace window noted above
+    initialDelaySeconds: 5
+    periodSeconds: 10
+    failureThreshold: 4  # timeoutSeconds/successThreshold not set here