Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 263 additions & 0 deletions .github/workflows/chart.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
name: chart

# Triggers: pushes to main and all pull requests, limited to changes in
# the chart itself, this workflow file, or the Go sources/modules that
# the render job compiles into the `tracecore` binary.
on:
  push:
    branches:
      - main
    paths:
      - 'install/kubernetes/tracecore/**'
      - '.github/workflows/chart.yml'
      - 'cmd/tracecore/**'
      - 'internal/**'
      - 'components/**'
      - 'go.mod'
      - 'go.sum'
  pull_request:
    paths:
      - 'install/kubernetes/tracecore/**'
      - '.github/workflows/chart.yml'
      - 'cmd/tracecore/**'
      - 'internal/**'
      - 'components/**'
      - 'go.mod'
      - 'go.sum'

# The workflow token is restricted to read-only repository contents.
permissions:
  contents: read

jobs:
  render:
    # Cluster-free gates: helm lint, helm template, `tracecore validate`,
    # conftest policy checks and yq field assertions (~30s wall-clock).
    # These run in this workflow instead of `make ci` so they do not
    # count against the 60-second `make ci` budget.
    name: render
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-go@v6
        with:
          go-version-file: go.mod
          cache: true
      - name: Install helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
        with:
          version: v3.16.4
      - name: Install yq + conftest
        # Both tools are fetched with `go install`, i.e. through the Go
        # module proxy and checksum database. That path is
        # integrity-checked, unlike a plain curl of the GitHub release
        # binary, which currently has no signature verification.
        run: |
          go install github.com/mikefarah/yq/v4@v4.44.5
          go install github.com/open-policy-agent/conftest@v0.62.0
          yq --version
          conftest --version
      - name: Build tracecore (for validate gate)
        # The binary is used by the `tracecore validate` steps below.
        run: go build -o tracecore ./cmd/tracecore
- name: helm lint (must exit 0 with zero WARNINGs)
  # Fails when `helm lint` exits non-zero OR prints any [WARNING] line.
  #
  # The exit status is captured with `|| rc=$?` on purpose: under
  # `set -e` a bare `out=$(helm lint ...)` aborts the script at the
  # assignment, so the lint output would never be echoed and the job
  # log would show a failure with no explanation.
  #
  # The warning scan reads from a here-string rather than
  # `echo "$out" | grep -q`: with pipefail, grep -q exiting early can
  # make the pipeline report failure (SIGPIPE on echo), which the `if`
  # would treat as "no warning found".
  run: |
    set -eo pipefail
    rc=0
    out=$(helm lint install/kubernetes/tracecore 2>&1) || rc=$?
    echo "$out"
    if [ "$rc" -ne 0 ]; then
      echo "::error::helm lint exited with status $rc"
      exit "$rc"
    fi
    if grep -q '^\[WARNING\]' <<<"$out"; then
      echo "::error::helm lint emitted WARNING; failing per M5b rubric"
      exit 1
    fi
- name: Render config — all-receivers-off
  # Extracts the collector config embedded in the rendered ConfigMap.
  # `set -o pipefail` is required because the default `run:` shell is
  # `bash -e` without pipefail: a failing `helm template` would
  # otherwise be hidden by yq's exit status. `yq -e` exits non-zero
  # when `.data["config.yaml"]` is missing, instead of writing the
  # literal string "null" into the rendered file.
  run: |
    set -eo pipefail
    helm template demo install/kubernetes/tracecore \
      -f install/kubernetes/tracecore/ci/all-receivers-off-values.yaml \
      --show-only templates/configmap.yaml \
      | yq -e '.data["config.yaml"]' > rendered-all-off.yaml
    cat rendered-all-off.yaml
- name: tracecore validate — all-off
  # Uses the binary built earlier in this job.
  run: ./tracecore validate --config=rendered-all-off.yaml
- name: Render config — one-receiver-on
  # Same extraction as above, using the one-receiver-on CI values file.
  run: |
    set -eo pipefail
    helm template demo install/kubernetes/tracecore \
      -f install/kubernetes/tracecore/ci/one-receiver-on-values.yaml \
      --show-only templates/configmap.yaml \
      | yq -e '.data["config.yaml"]' > rendered-one-on.yaml
    cat rendered-one-on.yaml
- name: tracecore validate — one-on
  run: ./tracecore validate --config=rendered-one-on.yaml
- name: yq — DaemonSet kind assertion
  # The workload template must render a DaemonSet.
  run: |
    rendered_kind=$(helm template demo install/kubernetes/tracecore \
      --show-only templates/daemonset.yaml | yq '.kind')
    if [ "$rendered_kind" != "DaemonSet" ]; then
      echo "expected DaemonSet, got $rendered_kind"
      exit 1
    fi
- name: yq — securityContext field assertions
  # Renders the DaemonSet once with default values, then compares each
  # listed yq path with its expected scalar. The first mismatch fails
  # the step.
  run: |
    manifest=$(helm template demo install/kubernetes/tracecore --show-only templates/daemonset.yaml)
    expect_field() {
      local expr="$1" want="$2" actual
      actual=$(yq "$expr" <<<"$manifest")
      [ "$actual" = "$want" ] || {
        echo "::error::$expr = $actual (expected $want)"
        exit 1
      }
      echo "ok: $expr = $actual"
    }
    # Pod-level security context.
    expect_field '.spec.template.spec.securityContext.runAsNonRoot' 'true'
    expect_field '.spec.template.spec.securityContext.runAsUser' '65532'
    expect_field '.spec.template.spec.securityContext.seccompProfile.type' 'RuntimeDefault'
    # Host namespaces must be explicitly disabled.
    expect_field '.spec.template.spec.hostPID' 'false'
    expect_field '.spec.template.spec.hostIPC' 'false'
    expect_field '.spec.template.spec.hostNetwork' 'false'
    # Security context of the first container.
    expect_field '.spec.template.spec.containers[0].securityContext.allowPrivilegeEscalation' 'false'
    expect_field '.spec.template.spec.containers[0].securityContext.readOnlyRootFilesystem' 'true'
    expect_field '.spec.template.spec.containers[0].securityContext.capabilities.drop[0]' 'ALL'
- name: priorityClassName + telemetry-off render correctness
  # Two value-conditional template paths that the default render cannot
  # exercise: priorityClassName must appear when set, and the probes
  # must disappear when telemetry is off.
  #
  # Both livenessProbe and readinessProbe are checked. Checking only
  # livenessProbe would let a leftover readinessProbe slip through even
  # though the success message claims probe omission was verified.
  # NOTE(review): this assumes the chart gates both probes on
  # telemetry.enabled — confirm against templates/daemonset.yaml.
  run: |
    # pipefail so a failing `helm template` is not hidden by yq.
    set -eo pipefail
    pc=$(helm template demo install/kubernetes/tracecore \
      --set priorityClassName=tracecore-high \
      --show-only templates/daemonset.yaml \
      | yq '.spec.template.spec.priorityClassName')
    test "$pc" = "tracecore-high" \
      || { echo "::error::priorityClassName rendered '$pc' (expected tracecore-high)"; exit 1; }
    telemetry_off=$(helm template demo install/kubernetes/tracecore \
      --set telemetry.enabled=false \
      --show-only templates/daemonset.yaml)
    for probe in livenessProbe readinessProbe; do
      present=$(yq ".spec.template.spec.containers[0] | has(\"$probe\")" <<<"$telemetry_off")
      test "$present" = "false" \
        || { echo "::error::$probe rendered when telemetry.enabled=false"; exit 1; }
    done
    echo "ok: priorityClassName injection + telemetry-off probe omission both verified"
- name: Probe paths match rendered telemetry.paths
  # Without this gate, a future template edit could wire a probe to
  # /healthz while the config exposes /health — the pod would
  # CrashLoopBackOff on a kubelet 404 that `helm template` alone does
  # not reveal.
  #
  # Every extracted value must be a real path before the comparison.
  # yq prints "null" for a missing key, so if both the probe and the
  # config key were renamed or removed, "null" = "null" would pass.
  run: |
    set -eo pipefail
    ds_render=$(helm template demo install/kubernetes/tracecore --show-only templates/daemonset.yaml)
    # Embedded collector config, extracted once from the ConfigMap.
    cfg=$(helm template demo install/kubernetes/tracecore --show-only templates/configmap.yaml \
      | yq '.data["config.yaml"]')
    probe_healthz=$(yq '.spec.template.spec.containers[0].livenessProbe.httpGet.path' <<<"$ds_render")
    probe_readyz=$(yq '.spec.template.spec.containers[0].readinessProbe.httpGet.path' <<<"$ds_render")
    cfg_healthz=$(yq '.telemetry.paths.healthz' <<<"$cfg")
    cfg_readyz=$(yq '.telemetry.paths.readyz' <<<"$cfg")
    for name in probe_healthz probe_readyz cfg_healthz cfg_readyz; do
      value=${!name}
      if [ -z "$value" ] || [ "$value" = "null" ]; then
        echo "::error::$name did not resolve to a path (got '$value')"
        exit 1
      fi
    done
    test "$probe_healthz" = "$cfg_healthz" \
      || { echo "::error::probe healthz=$probe_healthz != config healthz=$cfg_healthz"; exit 1; }
    test "$probe_readyz" = "$cfg_readyz" \
      || { echo "::error::probe readyz=$probe_readyz != config readyz=$cfg_readyz"; exit 1; }
    echo "ok: probe paths match config — healthz=$probe_healthz readyz=$probe_readyz"
- name: Chart.yaml — apiVersion v2 + SemVer + non-empty appVersion
  # Reads three fields from Chart.yaml and checks them:
  #   apiVersion  must be exactly "v2"
  #   version     must look like MAJOR.MINOR.PATCH[-prerelease][+build]
  #   appVersion  must be present and non-empty
  # The bracket expressions use a trailing `-` for the literal hyphen.
  # In a POSIX bracket expression a backslash is not an escape, so the
  # previous `.\-` also accepted a literal backslash in the version.
  run: |
    chart=install/kubernetes/tracecore/Chart.yaml
    api=$(yq '.apiVersion' "$chart")
    ver=$(yq '.version' "$chart")
    app=$(yq '.appVersion' "$chart")
    [ "$api" = "v2" ] || { echo "apiVersion must be v2 (got $api)"; exit 1; }
    grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+(-[0-9A-Za-z.-]+)?(\+[0-9A-Za-z.-]+)?$' <<<"$ver" \
      || { echo "version is not SemVer: $ver"; exit 1; }
    # yq prints the string "null" when the key is absent.
    if [ -z "$app" ] || [ "$app" = "null" ]; then
      echo "appVersion empty"
      exit 1
    fi
- name: README — required H2 sections
  # Each heading must appear as its own line in the chart README.
  # The match is anchored to the whole line (trailing whitespace is
  # tolerated). A substring match would accept "### Install notes" or
  # a sentence that merely mentions "## Install".
  # The heading strings contain no regex metacharacters, so they are
  # safe to interpolate into the pattern.
  run: |
    readme=install/kubernetes/tracecore/README.md
    required=("## Install" "## Upgrade" "## Uninstall" "## Values reference" "## Troubleshooting")
    for h2 in "${required[@]}"; do
      grep -Eq "^${h2}[[:space:]]*\$" "$readme" \
        || { echo "missing H2 in chart README: $h2"; exit 1; }
    done
- name: conftest — deny fixtures (each must fail)
  # Runs the policy against every bad-*.yaml fixture and requires
  # conftest to exit non-zero for each one. All fixtures are evaluated
  # before the step fails, so a single run reports every gap.
  #
  # nullglob plus the explicit count guard against a false pass. With
  # an unmatched glob the loop would otherwise run once on the literal
  # pattern, conftest would error on the missing file, and that error
  # would be reported as "denied".
  run: |
    shopt -s nullglob
    policy=install/kubernetes/tracecore/policies/conftest/tracecore.rego
    fixtures=(install/kubernetes/tracecore/policies/conftest/testdata/bad-*.yaml)
    if [ "${#fixtures[@]}" -eq 0 ]; then
      echo "::error::no bad-*.yaml deny fixtures found; nothing was tested"
      exit 1
    fi
    fail=0
    for f in "${fixtures[@]}"; do
      # A command tested by `if` does not trigger `set -e`, so no
      # `set +e` is needed.
      if conftest test --policy "$policy" "$f" > /tmp/conftest.out 2>&1; then
        echo "::error::policy did NOT deny $f (expected failure)"
        cat /tmp/conftest.out
        fail=1
      else
        echo "ok: $f denied"
        # Print the reason so a reviewer can tell a policy denial from
        # an unrelated conftest error.
        cat /tmp/conftest.out
      fi
    done
    exit $fail
- name: conftest — good fixtures + chart render (must pass)
  # The policy must accept both known-good fixtures and the chart as
  # rendered with default values.
  run: |
    policy=install/kubernetes/tracecore/policies/conftest/tracecore.rego
    testdata=install/kubernetes/tracecore/policies/conftest/testdata
    conftest test --policy "$policy" \
      "$testdata/good-baseline.yaml" \
      "$testdata/good-sys-ptrace.yaml"
    helm template demo install/kubernetes/tracecore > /tmp/chart-render.yaml
    conftest test --policy "$policy" /tmp/chart-render.yaml

install:
  # End-to-end install on a kind cluster: build a local Docker image,
  # load it into kind, `helm install`, wait for the DaemonSet to become
  # Ready, check the rollout status, then `helm uninstall`. The
  # install-to-Ready wall-clock time is recorded so the M5b ≤5 min
  # rubric can be checked on every run.
  name: install (kind)
  runs-on: ubuntu-latest
  # Only runs after the cluster-free render gates have passed.
  needs: render
  steps:
    - uses: actions/checkout@v6
    - uses: actions/setup-go@v6
      with:
        go-version-file: go.mod
        cache: true
    - name: Install helm
      uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
      with:
        version: v3.16.4
    - name: Build tracecore image
      # Build context is the repository root; the Dockerfile lives
      # inside the chart directory.
      run: |
        docker build \
          -f install/kubernetes/tracecore/Dockerfile \
          -t tracecore:ci \
          .
    - name: Create kind cluster
      uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
      with:
        version: v0.25.0
        node_image: kindest/node:v1.32.0
        cluster_name: tracecore-m5b
    - name: Load image into kind
      # --name must match cluster_name above.
      run: kind load docker-image tracecore:ci --name tracecore-m5b
- name: helm install + measure install-to-Ready
  # Installs the chart with the locally built image, waits for the
  # DaemonSet rollout, and fails if the elapsed time exceeds 300s.
  #
  # The step has an `id` so the value written to $GITHUB_OUTPUT can be
  # referenced as steps.install.outputs.install_to_ready_seconds;
  # without an id a step output cannot be addressed. The duration also
  # goes to the step summary so it shows on the run page.
  id: install
  run: |
    set -eo pipefail
    start=$(date +%s)
    # pullPolicy=Never: the image only exists inside the kind node
    # (loaded by the previous step), not in any registry.
    helm install tracecore install/kubernetes/tracecore \
      --namespace tracecore-system --create-namespace \
      --set image.repository=tracecore \
      --set image.tag=ci \
      --set image.pullPolicy=Never \
      --wait --timeout 5m
    ds=$(kubectl -n tracecore-system get daemonset \
      -l app.kubernetes.io/instance=tracecore \
      -o jsonpath='{.items[0].metadata.name}')
    kubectl -n tracecore-system rollout status "daemonset/$ds" --timeout=2m
    end=$(date +%s)
    dur=$((end - start))
    echo "install_to_ready_seconds=$dur" >> "$GITHUB_OUTPUT"
    echo "install-to-Ready: ${dur}s (rubric: ≤300s)" >> "$GITHUB_STEP_SUMMARY"
    echo "::notice::install-to-Ready: ${dur}s (rubric: ≤300s)"
    test "$dur" -le 300 \
      || { echo "::error::install-to-Ready ${dur}s exceeds 300s rubric"; exit 1; }
- name: "helm status — STATUS: deployed"
  # Takes the second field of the `STATUS:` line printed by
  # `helm status` and requires it to be "deployed".
  run: |
    status=$(helm status tracecore --namespace tracecore-system \
      | awk '/^STATUS:/ {print $2}')
    if [ "$status" != "deployed" ]; then
      echo "expected STATUS: deployed, got $status"
      exit 1
    fi
- name: Probe — collector readiness via port-forward
  # Forwards local port 8888 to a pod of the DaemonSet and requires
  # both /readyz and /healthz to answer with a success status.
  #
  # curl retries, including on connection-refused, replace the fixed
  # `sleep 3`. The port-forward runs in the background and may take
  # more or less than three seconds to start listening, so a fixed
  # sleep is either flaky or wasted time.
  run: |
    ds=$(kubectl -n tracecore-system get daemonset \
      -l app.kubernetes.io/instance=tracecore \
      -o jsonpath='{.items[0].metadata.name}')
    kubectl -n tracecore-system port-forward "daemonset/$ds" 8888:8888 &
    pf=$!
    # Always stop the background port-forward when the step ends.
    trap 'kill "$pf" 2>/dev/null || true' EXIT
    for endpoint in readyz healthz; do
      curl -fsS --retry 10 --retry-delay 1 --retry-connrefused \
        "http://localhost:8888/$endpoint"
    done
- name: helm uninstall
  # Best-effort cleanup. It runs even when an earlier step failed, and
  # never fails the job itself.
  if: always()
  run: helm uninstall tracecore --namespace tracecore-system || true
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Pre-alpha. The CLI runs the M1 pipeline runtime end-to-end via factory-based ass

### Added

- **M5b — Helm chart + minimal-privilege pod spec** — `install/kubernetes/tracecore/` ships a `restricted` Pod Security Standard DaemonSet (`runAsNonRoot`, `runAsUser: 65532`, `seccompProfile: RuntimeDefault`, `allowPrivilegeEscalation: false`, `readOnlyRootFilesystem: true`, `capabilities.drop: [ALL]`), per-receiver `receivers.<name>.enabled` toggles, free-form `config:` override, and a bundled conftest policy that rejects `privileged`, `hostPID`, `hostIPC`, `hostNetwork`, `hostUsers`, `runAsUser: 0`, `runAsGroup: 0`, `procMount: Unmasked`, missing pod `runAsNonRoot`, missing `seccompProfile`, missing `readOnlyRootFilesystem`, `allowPrivilegeEscalation: true`, and capability additions other than `SYS_PTRACE`. Values exposes `priorityClassName`, `tolerations`, `nodeSelector`, `affinity`, `updateStrategy`, and `probes.{liveness,readiness}` so operators tune rollout cadence and scheduling without forking the chart. New `chart` workflow renders the chart, validates rendered configs (`all-receivers-off` + `one-on`) via `tracecore validate`, exercises the conftest policy, asserts probe paths match rendered telemetry paths, and installs the chart end-to-end on a single-node kind cluster.
- **M1 keystone** — `internal/pipeline` package with `Component`, `Host`, per-signal `Factory` interfaces; `Runtime` with two-phase shutdown (1s receivers + operator-configurable drain budget); `pipelinetest.New(t)` test fixture; first-data instrumentation wrappers. See [RFC-0003](docs/rfcs/0003-pipeline-runtime-and-component-contract.md).
- **`internal/consumer` push interfaces** — per-signal `Metrics`/`Traces`/`Logs` interfaces with `Capabilities() Capabilities` (MutatesData flag), mirroring OTel Collector v0.152.0.
- **`internal/safe.Call`** — cgo / vendor-SDK panic-wrapper with named-op error tagging and ctx-respect.
Expand Down
47 changes: 47 additions & 0 deletions docs/FOLLOWUPS.md
Original file line number Diff line number Diff line change
Expand Up @@ -732,3 +732,50 @@ deferred are phased here.
the second use. Rare edge case in operator-owned YAML; the
regex-based redaction catches the common case. Revisit if an
operator reports it.

## M5b chart — opportunistic deferrals

Items surfaced during PR #29 review that were explicitly held out of
M5b scope. Order is roughly highest-leverage first.

- [ ] **NetworkPolicy template.** Ship a `templates/networkpolicy.yaml`
gated by `networkPolicy.enabled` (default `false`) that allows
egress to operator-configured exporter endpoints and ingress for
kubelet probe traffic. Zero-trust adopters add their own today; the
chart should ship a baseline they can opt into. *Trigger:* the first
adopter request, or when an OTLP exporter lands (M10+) and the
egress shape stops being guesswork.
- [ ] **Image scanning + SBOM gate on the chart container image.** The
`install/kubernetes/tracecore/Dockerfile` is reference-only for
the kind-install CI workflow; M3 owns the canonical release
image and is the right place to wire `trivy` / `syft` /
`cosign`. When M3 lands, replumb `chart.yml` to reuse M3's
scanned image instead of building locally. *Trigger:* M3
reproducible-build CI lands and exposes a reusable image
reference.
- [ ] **Per-receiver resource guidance docs.** README's "Troubleshooting"
currently has one OOMKilled entry for kernelevents. Replace with
a measured table (kernelevents under journald load, dcgm scrape
cardinality) once a real workload runs against tracecore.
*Trigger:* M5 install/overhead benchmark harness produces RSS
measurements per enabled receiver, OR an operator reports
tuning friction.
- [ ] **10-run install-to-Ready median aggregate measurement.**
MILESTONES.md §M5b non-functional rubric asks for a median
"across 10 CI runs on single-node kind." The current workflow
asserts ≤300s on every run and records install-to-Ready in
`$GITHUB_OUTPUT`; the 10-run aggregate accumulates on `main`
post-merge. A scheduled nightly that pulls the recorded values,
computes median + p99, and gates on regression closes the
rubric explicitly. *Trigger:* M5 install-bench harness lands
(its `bench/results/*.json` already captures the shape), OR a
regression is reported.
- [ ] **`appArmorProfile` on the rendered DaemonSet.** Restricted PSS
*permits* the AppArmor profile to be undefined, so the chart is
compliant today. Setting `RuntimeDefault` would harden against
clusters with stricter local policy and shave one item off
adopter security checklists. K8s 1.30+ exposes this as the
`securityContext.appArmorProfile` field, while earlier versions
only support the
`container.apparmor.security.beta.kubernetes.io/<container>`
annotation; the chart targets `>=1.28`, so this needs a
version-gated template or a values toggle.
*Trigger:* kubeVersion floor moves to >=1.30, or first adopter
asks.
13 changes: 13 additions & 0 deletions install/kubernetes/tracecore/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Patterns excluded from the packaged chart (`helm package`).

# Version-control metadata.
.git/
.gitignore

# OS, editor and patch leftovers.
.DS_Store
*.swp
*.tmp
*.bak
*.orig

# Previously packaged chart archives.
*.tgz

# CI-only inputs: values files rendered by the chart workflow, and the
# conftest policy with its fixtures.
ci/
policies/

# The Dockerfile exists only for the kind-install CI workflow; the
# packaged chart does not ship a container build context to end users.
Dockerfile
48 changes: 48 additions & 0 deletions install/kubernetes/tracecore/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
apiVersion: v2
name: tracecore
description: Minimal-privilege DaemonSet for the tracecore OpenTelemetry collector.
type: application

# Chart package version. It is independent of appVersion and is bumped
# on every chart change; it stays below 1.0.0 while the install surface
# is still evolving.
version: 0.1.0

# Release of the tracecore binary that the chart's default image tag
# points at. The chart-appversion CI gate keeps it in sync with the
# upstream binary: do not edit by hand without also bumping
# install/kubernetes/tracecore/.appversion-source and rerunning the
# gate.
appVersion: "0.0.0-dev"

kubeVersion: ">=1.28.0-0"

home: https://github.com/tracecoreai/tracecore
sources:
  - https://github.com/tracecoreai/tracecore
maintainers:
  - name: tracecore maintainers
    url: https://github.com/tracecoreai/tracecore
keywords:
  - opentelemetry
  - observability
  - gpu
  - kernel-events
  - daemonset

annotations:
  category: Observability
  licenses: Apache-2.0
  # Artifact Hub metadata.
  # Spec: https://artifacthub.io/docs/topics/annotations/helm/
  # `prerelease: "true"` flags the chart as pre-1.0.
  # NOTE(review): the original note said to drop it "when v0.1.0
  # ships", but the chart version above is already 0.1.0 — confirm
  # which release is meant.
  artifacthub.io/license: Apache-2.0
  artifacthub.io/prerelease: "true"
  artifacthub.io/links: |
    - name: source
      url: https://github.com/tracecoreai/tracecore
    - name: chart-readme
      url: https://github.com/tracecoreai/tracecore/blob/main/install/kubernetes/tracecore/README.md
    - name: milestones
      url: https://github.com/tracecoreai/tracecore/blob/main/MILESTONES.md
  artifacthub.io/changes: |
    - kind: added
      description: Restricted Pod Security Standard DaemonSet with per-receiver toggles.
    - kind: added
      description: Bundled conftest policy blocking 13 distinct privilege-escalation paths.
    - kind: added
      description: Kind-cluster install workflow with helm lint + chart-render + conftest + end-to-end install gates.
  artifacthub.io/operator: "false"
Loading
Loading