From b73ba676eadcc09712db7f9f34b60ed870c2b5ab Mon Sep 17 00:00:00 2001 From: Tri Lam Date: Sun, 31 May 2026 22:18:07 -0700 Subject: [PATCH 1/2] ci(policy): kind matrix (PSA + Kyverno + Gatekeeper) (closes #138) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adopt upstream curated policy bundles to gate the tracecore Helm chart against three enterprise-shape admission engines on every chart edit: - PSA-restricted via namespace labels (KEP-2579 GA) - kyverno/policies pod-security baseline + restricted (Enforce mode) - open-policy-agent/gatekeeper-library PSP constraint templates The assertion is `helm install --dry-run=server` — the kind API server runs each engine's admission webhook(s) against the rendered chart and rejects it on policy violation. No hand-rolled Rego or Kyverno YAML; bundle versions are pinned in the helper script. Signed-off-by: Tri Lam --- .github/workflows/policy-matrix.yml | 97 +++++++++++ scripts/policy-matrix-smoke.sh | 242 ++++++++++++++++++++++++++++ 2 files changed, 339 insertions(+) create mode 100644 .github/workflows/policy-matrix.yml create mode 100755 scripts/policy-matrix-smoke.sh diff --git a/.github/workflows/policy-matrix.yml b/.github/workflows/policy-matrix.yml new file mode 100644 index 00000000..790d3ee2 --- /dev/null +++ b/.github/workflows/policy-matrix.yml @@ -0,0 +1,97 @@ +name: policy-matrix + +# Roadmap A12 / closes #138 — kind-cluster gate that asserts the +# tracecore Helm chart passes enterprise-shape policy bundles: +# +# - PSA (Pod Security Admission, restricted profile, namespace-labeled) +# - Kyverno (kyverno/policies pod-security baseline + restricted) +# - Gatekeeper (open-policy-agent/gatekeeper-library PSP templates) +# +# The assertion is `helm install --dry-run=server`: the kind API +# server runs the engine's admission webhook(s) and rejects the +# install on policy violation. We adopt curated upstream bundles +# verbatim — no hand-rolled Rego or Kyverno YAML — per the +# `adopt-over-build` repo policy. +# +# This workflow sits OUTSIDE the `make ci` 60s budget (kind takes +# 2-3 min per row to come up). It runs only when chart shape or +# this workflow itself changes. + +on: + pull_request: + paths: + - 'install/kubernetes/tracecore/**' + - 'scripts/policy-matrix-smoke.sh' + - '.github/workflows/policy-matrix.yml' + push: + branches: [main] + paths: + - 'install/kubernetes/tracecore/**' + - 'scripts/policy-matrix-smoke.sh' + - '.github/workflows/policy-matrix.yml' + +permissions: + contents: read + +concurrency: + group: policy-matrix-${{ github.ref }} + cancel-in-progress: true + +jobs: + policy-matrix: + name: ${{ matrix.engine }} + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + # We want all three rows to run even if one fails — operators + # want to know "which gate did we fail?" not "we failed". + fail-fast: false + matrix: + include: + - engine: psa-restricted + policy_engine: psa + - engine: kyverno-baseline-restricted + policy_engine: kyverno + - engine: gatekeeper-restricted + policy_engine: gatekeeper + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Install helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + with: + version: v3.16.4 + - name: Create kind cluster + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + with: + version: v0.25.0 + node_image: kindest/node:v1.32.0 + cluster_name: tracecore-policy-${{ matrix.policy_engine }} + - name: Sanity — kubectl reaches the cluster + run: | + kubectl cluster-info + kubectl version + - name: Smoke — install policy engine + helm dry-run tracecore chart + env: + POLICY_ENGINE: ${{ matrix.policy_engine }} + run: bash scripts/policy-matrix-smoke.sh + - name: Collect engine logs on failure + if: failure() + run: | + echo "::group::events (all-namespaces)" + kubectl get events -A --sort-by=.lastTimestamp | tail -100 || true + echo "::endgroup::" + echo "::group::kyverno logs" + kubectl -n kyverno logs -l app.kubernetes.io/component=admission-controller --tail=200 || true + echo "::endgroup::" + echo "::group::gatekeeper logs" + kubectl -n gatekeeper-system logs -l control-plane=controller-manager --tail=200 || true + echo "::endgroup::" + echo "::group::constraints" + kubectl get constraints -A || true + echo "::endgroup::" + - name: Tear down kind cluster + # helm/kind-action's post-step has known cleanup gaps on + # cancellation; explicit teardown matches the install-bench + # workflow's pattern (V4 reviewer note). + if: always() + run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}" || true diff --git a/scripts/policy-matrix-smoke.sh b/scripts/policy-matrix-smoke.sh new file mode 100755 index 00000000..c9fe72a9 --- /dev/null +++ b/scripts/policy-matrix-smoke.sh @@ -0,0 +1,242 @@ +#!/usr/bin/env bash +# policy-matrix-smoke.sh — install the tracecore chart against a +# pre-provisioned kind cluster running ONE of: +# +# POLICY_ENGINE=psa Pod Security Admission (restricted) +# POLICY_ENGINE=kyverno Kyverno + kyverno/policies baseline+restricted +# POLICY_ENGINE=gatekeeper OPA Gatekeeper + gatekeeper-library PSP set +# +# The assertion model is `helm install --dry-run=server`: the API server +# returns admission errors when a policy engine rejects the rendered +# manifests. A non-zero exit means tracecore fails enterprise-shape +# gates and needs a chart-side fix (we don't relax the policy bundles). +# +# Bundle versions are pinned in env vars below so reproductions are +# byte-identical to CI. +# +# Roadmap A12 / closes #138. +set -euo pipefail + +: "${POLICY_ENGINE:?POLICY_ENGINE must be one of: psa | kyverno | gatekeeper}" +: "${CHART_PATH:=install/kubernetes/tracecore}" +: "${RELEASE_NAME:=tracecore}" +: "${TARGET_NAMESPACE:=tracecore-system}" + +# Pinned upstream policy-bundle versions. Bumping these is an explicit +# code change reviewed in PR — we do NOT chase "latest" silently. +: "${KYVERNO_HELM_VERSION:=3.4.6}" # kyverno/kyverno chart (app v1.14.x) +: "${KYVERNO_POLICIES_REF:=main}" # kyverno/policies (PolicySet) +: "${GATEKEEPER_VERSION:=v3.18.2}" # open-policy-agent/gatekeeper release +: "${GATEKEEPER_LIBRARY_REF:=master}" # gatekeeper-library snapshot + +log() { printf '[policy-matrix] %s\n' "$*"; } +fail() { printf '::error::%s\n' "$*"; exit 1; } + +require() { + command -v "$1" >/dev/null 2>&1 || fail "missing required binary: $1" +} + +require kubectl +require helm + +# ---------------------------------------------------------------- PSA +install_psa() { + log "PSA-restricted: labeling namespace ${TARGET_NAMESPACE} with restricted enforce" + kubectl create namespace "${TARGET_NAMESPACE}" --dry-run=client -o yaml \ + | kubectl apply -f - + # PSA labels are the upstream-blessed mechanism (KEP-2579, GA 1.25+). + # `enforce` is the gate; warn + audit are for observability. + kubectl label --overwrite namespace "${TARGET_NAMESPACE}" \ + pod-security.kubernetes.io/enforce=restricted \ + pod-security.kubernetes.io/enforce-version=latest \ + pod-security.kubernetes.io/warn=restricted \ + pod-security.kubernetes.io/audit=restricted +} + +# ---------------------------------------------------------------- Kyverno +install_kyverno() { + log "Kyverno: helm install chart ${KYVERNO_HELM_VERSION}" + helm repo add kyverno https://kyverno.github.io/kyverno/ >/dev/null + helm repo update kyverno >/dev/null + helm install kyverno kyverno/kyverno \ + --namespace kyverno --create-namespace \ + --version "${KYVERNO_HELM_VERSION}" \ + --wait --timeout 5m + # Wait for the validating webhooks to register; without this a fast + # `helm install --dry-run=server` races the webhook config and + # admits manifests the engine would otherwise reject. + kubectl -n kyverno wait --for=condition=Ready pod \ + -l app.kubernetes.io/component=admission-controller --timeout=3m + + log "Kyverno: applying baseline + restricted policy bundles (ref=${KYVERNO_POLICIES_REF})" + # kustomization.yaml in each bundle dir aggregates every policy file. + # kubectl kustomize handles the remote ref so we don't shell out to kustomize. + kubectl apply -k \ + "https://github.com/kyverno/policies/pod-security/baseline?ref=${KYVERNO_POLICIES_REF}" + kubectl apply -k \ + "https://github.com/kyverno/policies/pod-security/restricted?ref=${KYVERNO_POLICIES_REF}" + + # Default policy install action is `audit`. Flip everything to enforce + # so violations actually block admission (the whole point of this gate). + log "Kyverno: switching all ClusterPolicies to validationFailureAction=Enforce" + for p in $(kubectl get clusterpolicies -o name); do + kubectl patch "$p" --type merge \ + -p '{"spec":{"validationFailureAction":"Enforce"}}' >/dev/null + done + kubectl get clusterpolicies \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.validationFailureAction}{"\n"}{end}' +} + +# ---------------------------------------------------------------- Gatekeeper +install_gatekeeper() { + log "Gatekeeper: applying upstream release manifest (${GATEKEEPER_VERSION})" + kubectl apply -f \ + "https://raw.githubusercontent.com/open-policy-agent/gatekeeper/${GATEKEEPER_VERSION}/deploy/gatekeeper.yaml" + kubectl -n gatekeeper-system wait --for=condition=Available deploy \ + --all --timeout=5m + + log "Gatekeeper: installing pod-security-policy constraint templates (ref=${GATEKEEPER_LIBRARY_REF})" + # The minimum-viable set that exercises tracecore's pod shape: + # privileged, host-namespaces, capabilities, allow-privilege-escalation, + # read-only-root-filesystem, host-filesystem, users (runAsNonRoot). + # NOTE: paths use `template.yaml` per gatekeeper-library v3 layout. + local base="https://raw.githubusercontent.com/open-policy-agent/gatekeeper-library/${GATEKEEPER_LIBRARY_REF}/library/pod-security-policy" + local templates=( + "privileged-containers/template.yaml" + "host-namespaces/template.yaml" + "capabilities/template.yaml" + "allow-privilege-escalation/template.yaml" + "read-only-root-filesystem/template.yaml" + "host-filesystem/template.yaml" + "users/template.yaml" + ) + for t in "${templates[@]}"; do + kubectl apply -f "${base}/${t}" + done + + # Constraint templates compile to CRDs asynchronously; wait for the + # last one to register before applying constraints. + log "Gatekeeper: waiting for constraint CRDs" + for k in K8sPSPPrivilegedContainer K8sPSPHostNamespace K8sPSPCapabilities \ + K8sPSPAllowPrivilegeEscalationContainer K8sPSPReadOnlyRootFilesystem \ + K8sPSPHostFilesystem K8sPSPAllowedUsers; do + for _ in $(seq 1 30); do + if kubectl get crd "$(echo "$k" | tr '[:upper:]' '[:lower:]').constraints.gatekeeper.sh" >/dev/null 2>&1; then + break + fi + sleep 2 + done + done + + log "Gatekeeper: applying minimum enforce constraints" + cat <<'EOF' | kubectl apply -f - +apiVersion: constraints.gatekeeper.sh/v1beta1 +kind: K8sPSPPrivilegedContainer +metadata: + name: tracecore-psp-privileged +spec: + enforcementAction: deny + match: + kinds: + - apiGroups: ["apps"] + kinds: ["DaemonSet", "Deployment", "StatefulSet"] + - apiGroups: [""] + kinds: ["Pod"] +--- +apiVersion: constraints.gatekeeper.sh/v1beta1 +kind: K8sPSPHostNamespace +metadata: + name: tracecore-psp-host-namespace +spec: + enforcementAction: deny + match: + kinds: + - apiGroups: ["apps"] + kinds: ["DaemonSet", "Deployment", "StatefulSet"] + - apiGroups: [""] + kinds: ["Pod"] +--- +apiVersion: constraints.gatekeeper.sh/v1beta1 +kind: K8sPSPCapabilities +metadata: + name: tracecore-psp-capabilities +spec: + enforcementAction: deny + match: + kinds: + - apiGroups: ["apps"] + kinds: ["DaemonSet", "Deployment", "StatefulSet"] + - apiGroups: [""] + kinds: ["Pod"] + parameters: + requiredDropCapabilities: ["ALL"] + allowedCapabilities: ["SYS_PTRACE"] +--- +apiVersion: constraints.gatekeeper.sh/v1beta1 +kind: K8sPSPAllowPrivilegeEscalationContainer +metadata: + name: tracecore-psp-allow-privilege-escalation +spec: + enforcementAction: deny + match: + kinds: + - apiGroups: ["apps"] + kinds: ["DaemonSet", "Deployment", "StatefulSet"] + - apiGroups: [""] + kinds: ["Pod"] +--- +apiVersion: constraints.gatekeeper.sh/v1beta1 +kind: K8sPSPReadOnlyRootFilesystem +metadata: + name: tracecore-psp-read-only-root +spec: + enforcementAction: deny + match: + kinds: + - apiGroups: ["apps"] + kinds: ["DaemonSet", "Deployment", "StatefulSet"] + - apiGroups: [""] + kinds: ["Pod"] +EOF + + # Webhook readiness — same race as Kyverno. + kubectl -n gatekeeper-system wait --for=condition=Ready pod \ + -l control-plane=controller-manager --timeout=2m + # Constraints need ~30s to compile+sync after apply. Poll the audit + # status field rather than `sleep`. + for _ in $(seq 1 30); do + if kubectl get k8spspprivilegedcontainer tracecore-psp-privileged \ + -o jsonpath='{.status.byPod[*].enforced}' 2>/dev/null | grep -q true; then + break + fi + sleep 2 + done +} + +# ---------------------------------------------------------------- Smoke +smoke_install() { + log "helm install --dry-run=server (engine=${POLICY_ENGINE})" + # --dry-run=server hits the API server's admission chain — that's the + # whole point: it exercises Kyverno / Gatekeeper / PSA in the same + # path a real install would, without actually scheduling pods. + helm install "${RELEASE_NAME}" "${CHART_PATH}" \ + --namespace "${TARGET_NAMESPACE}" \ + --create-namespace \ + --dry-run=server \ + --debug \ + > /tmp/policy-matrix-dryrun.out 2>&1 || { + log "helm dry-run FAILED for engine=${POLICY_ENGINE}:" + sed 's/^/ /' /tmp/policy-matrix-dryrun.out + fail "tracecore chart violates ${POLICY_ENGINE} policy gates" + } + log "OK: chart admitted under ${POLICY_ENGINE}" +} + +case "${POLICY_ENGINE}" in + psa) install_psa ;; + kyverno) install_kyverno ;; + gatekeeper) install_gatekeeper ;; + *) fail "unknown POLICY_ENGINE=${POLICY_ENGINE} (want psa|kyverno|gatekeeper)" ;; +esac + +smoke_install From b5e81d3218f64b2203634241e8f60f0c2d53379b Mon Sep 17 00:00:00 2001 From: Tri Lam Date: Sun, 31 May 2026 22:24:49 -0700 Subject: [PATCH 2/2] ci(policy): pin upstream bundle SHAs + add 2 missing constraints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per fresh-context review of #289: - KYVERNO_POLICIES_REF + GATEKEEPER_LIBRARY_REF were 'main'/'master' branches; upstream has no tagged releases, so byte-reproducibility required commit-SHA pins. Refreshed 2026-05-31: - kyverno/policies: 76be98a25d49ae01278a94ecde8f50f9e08577ef - gatekeeper-library: 53684fab133fd52d77aa42f632bc2ecd52f0447c - Heredoc was missing constraint resources for K8sPSPHostFilesystem and K8sPSPAllowedUsers — the templates were wait-applied but no constraints fired against them. Added both with deny enforcement + parameters that gate the chart's existing hostPath-free, runAsNonRoot+nonRoot-group posture. Strictly more regression coverage; chart already conforms. Signed-off-by: Tri Lam --- scripts/policy-matrix-smoke.sh | 50 ++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/scripts/policy-matrix-smoke.sh b/scripts/policy-matrix-smoke.sh index c9fe72a9..66bbe145 100755 --- a/scripts/policy-matrix-smoke.sh +++ b/scripts/policy-matrix-smoke.sh @@ -25,9 +25,12 @@ set -euo pipefail # Pinned upstream policy-bundle versions. Bumping these is an explicit # code change reviewed in PR — we do NOT chase "latest" silently. : "${KYVERNO_HELM_VERSION:=3.4.6}" # kyverno/kyverno chart (app v1.14.x) -: "${KYVERNO_POLICIES_REF:=main}" # kyverno/policies (PolicySet) +# kyverno/policies + gatekeeper-library have no tagged releases; pin +# by commit SHA so CI is byte-reproducible. Bump SHAs in a reviewed +# PR (refreshed 2026-05-31). +: "${KYVERNO_POLICIES_REF:=76be98a25d49ae01278a94ecde8f50f9e08577ef}" : "${GATEKEEPER_VERSION:=v3.18.2}" # open-policy-agent/gatekeeper release -: "${GATEKEEPER_LIBRARY_REF:=master}" # gatekeeper-library snapshot +: "${GATEKEEPER_LIBRARY_REF:=53684fab133fd52d77aa42f632bc2ecd52f0447c}" log() { printf '[policy-matrix] %s\n' "$*"; } fail() { printf '::error::%s\n' "$*"; exit 1; } @@ -197,6 +200,49 @@ spec: kinds: ["DaemonSet", "Deployment", "StatefulSet"] - apiGroups: [""] kinds: ["Pod"] +--- +apiVersion: constraints.gatekeeper.sh/v1beta1 +kind: K8sPSPHostFilesystem +metadata: + name: tracecore-psp-host-filesystem +spec: + enforcementAction: deny + match: + kinds: + - apiGroups: ["apps"] + kinds: ["DaemonSet", "Deployment", "StatefulSet"] + - apiGroups: [""] + kinds: ["Pod"] + parameters: + allowedHostPaths: [] +--- +apiVersion: constraints.gatekeeper.sh/v1beta1 +kind: K8sPSPAllowedUsers +metadata: + name: tracecore-psp-allowed-users +spec: + enforcementAction: deny + match: + kinds: + - apiGroups: ["apps"] + kinds: ["DaemonSet", "Deployment", "StatefulSet"] + - apiGroups: [""] + kinds: ["Pod"] + parameters: + runAsUser: + rule: MustRunAsNonRoot + runAsGroup: + rule: MayRunAs + ranges: + - {min: 1, max: 65535} + fsGroup: + rule: MayRunAs + ranges: + - {min: 1, max: 65535} + supplementalGroups: + rule: MayRunAs + ranges: + - {min: 1, max: 65535} EOF # Webhook readiness — same race as Kyverno.