diff --git a/.github/workflows/policy-matrix.yml b/.github/workflows/policy-matrix.yml index 790d3ee2..906ea911 100644 --- a/.github/workflows/policy-matrix.yml +++ b/.github/workflows/policy-matrix.yml @@ -39,21 +39,47 @@ concurrency: jobs: policy-matrix: - name: ${{ matrix.engine }} + name: ${{ matrix.engine }} / ${{ matrix.values_profile }} runs-on: ubuntu-latest timeout-minutes: 20 strategy: - # We want all three rows to run even if one fails — operators + # We want every row to run even if one fails — operators # want to know "which gate did we fail?" not "we failed". fail-fast: false matrix: + # Two-dimensional matrix: + # policy engine × values profile + # The `default` profile uses the chart's `values.yaml` straight + # off the shelf — the smallest install an operator could do. + # The `production` profile layers + # `install/kubernetes/tracecore/values-production.yaml` on top + # — the v1.0-rc1 cut-criteria-10 preset that adds + # NetworkPolicy, PDB, ServiceMonitor, hardened gracePeriod, + # and pinned image policy. The A+ requirement from the #138 + # task brief is explicit: validate production values against + # real policy engines, not just defaults. include: - engine: psa-restricted policy_engine: psa + values_profile: default - engine: kyverno-baseline-restricted policy_engine: kyverno + values_profile: default - engine: gatekeeper-restricted policy_engine: gatekeeper + values_profile: default + - engine: psa-restricted + policy_engine: psa + values_profile: production + values_file: install/kubernetes/tracecore/values-production.yaml + - engine: kyverno-baseline-restricted + policy_engine: kyverno + values_profile: production + values_file: install/kubernetes/tracecore/values-production.yaml + - engine: gatekeeper-restricted + policy_engine: gatekeeper + values_profile: production + values_file: install/kubernetes/tracecore/values-production.yaml steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install helm @@ -65,7 +91,7 @@ jobs: with: version: v0.25.0 node_image: kindest/node:v1.32.0 - cluster_name: tracecore-policy-${{ matrix.policy_engine }} + cluster_name: tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }} - name: Sanity — kubectl reaches the cluster run: | kubectl cluster-info @@ -73,6 +99,7 @@ jobs: - name: Smoke — install policy engine + helm dry-run tracecore chart env: POLICY_ENGINE: ${{ matrix.policy_engine }} + VALUES_FILE: ${{ matrix.values_file }} run: bash scripts/policy-matrix-smoke.sh - name: Collect engine logs on failure if: failure() @@ -94,4 +121,108 @@ jobs: # cancellation; explicit teardown matches the install-bench # workflow's pattern (V4 reviewer note). if: always() - run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}" || true + run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }}" || true + + policy-matrix-mutation: + # Falsifier for the whole policy-matrix gate. A green + # `policy-matrix` row tells us nothing on its own: if the policy + # bundle is no-op (forgot to switch Kyverno to Enforce, forgot to + # apply the Gatekeeper constraints, forgot the PSA label), the + # chart admits clean and CI lies. This job takes a known-bad + # DaemonSet (`bad-allowprivilegeescalation.yaml` from the + # chart's conftest testdata — `allowPrivilegeEscalation: true`), + # applies it with `kubectl apply --dry-run=server` against a + # namespace governed by the engine under test, and asserts the + # API server admission chain rejects it. If any engine admits + # the mutated manifest, the bundle is broken — fail loudly. + # + # We bypass `helm` entirely here: the chart's + # `values.schema.json` pins `allowPrivilegeEscalation: const false`, + # so helm itself would reject the mutated values before the API + # server saw the manifest. The point of the mutation gate is to + # exercise the API server's policy engine, not the chart schema + # (the conftest gate in `chart.yml` already covers that). + name: mutation / ${{ matrix.policy_engine }} + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - policy_engine: psa + - policy_engine: kyverno + - policy_engine: gatekeeper + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Install helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + with: + version: v3.16.4 + - name: Create kind cluster + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + with: + version: v0.25.0 + node_image: kindest/node:v1.32.0 + cluster_name: tracecore-policy-mutation-${{ matrix.policy_engine }} + - name: Provision policy engine + restricted namespace + env: + POLICY_ENGINE: ${{ matrix.policy_engine }} + # SKIP_SMOKE skips the helm dry-run smoke at the end of the + # script. This job only needs the engine installed; the + # actual mutation check runs in the next step. + SKIP_SMOKE: "1" + run: bash scripts/policy-matrix-smoke.sh + - name: Assert engine REJECTS the known-bad DaemonSet + # Reuses the conftest testdata fixture so we have ONE + # source-of-truth bad manifest. The denial message contains + # `allowPrivilegeEscalation` in every engine's vocabulary + # (PSA's field name, Kyverno's policy rule, Gatekeeper's + # constraint template name); grep for that substring rather + # than engine-specific text. + run: | + set -eo pipefail + fixture=install/kubernetes/tracecore/policies/conftest/testdata/bad-allowprivilegeescalation.yaml + test -f "$fixture" || { echo "::error::mutation fixture not found: $fixture"; exit 1; } + # install_psa() creates tracecore-system; install_kyverno / + # install_gatekeeper do not. Ensure it exists in all three + # engines so the dry-run sees the same namespace shape. + # PSA label is namespace-scoped and only present when + # POLICY_ENGINE=psa — the other two engines gate via + # ClusterPolicy / Constraint match selectors on workload + # kinds, which match regardless of namespace. + kubectl create namespace tracecore-system --dry-run=client -o yaml | kubectl apply -f - + rc=0 + kubectl apply --dry-run=server \ + -n tracecore-system \ + -f "$fixture" \ + > /tmp/mutation.out 2>&1 || rc=$? + echo "::group::dry-run output" + cat /tmp/mutation.out + echo "::endgroup::" + if [ "$rc" -eq 0 ]; then + echo "::error::engine '${{ matrix.policy_engine }}' admitted bad-allowprivilegeescalation.yaml — policy bundle is no-op" + exit 1 + fi + if ! grep -qE '[Aa]llow[Pp]rivilege[Ee]scalation|privilege-escalation|privilegeEscalation' /tmp/mutation.out; then + echo "::error::engine '${{ matrix.policy_engine }}' rejected but denial message did not name allowPrivilegeEscalation" + exit 1 + fi + echo "ok: engine '${{ matrix.policy_engine }}' rejected the mutated manifest with the expected field-name in the denial" + - name: Collect engine logs on failure + if: failure() + run: | + echo "::group::events (all-namespaces)" + kubectl get events -A --sort-by=.lastTimestamp | tail -100 || true + echo "::endgroup::" + echo "::group::kyverno logs" + kubectl -n kyverno logs -l app.kubernetes.io/component=admission-controller --tail=200 || true + echo "::endgroup::" + echo "::group::gatekeeper logs" + kubectl -n gatekeeper-system logs -l control-plane=controller-manager --tail=200 || true + echo "::endgroup::" + echo "::group::constraints" + kubectl get constraints -A || true + echo "::endgroup::" + - name: Tear down kind cluster + if: always() + run: kind delete cluster --name "tracecore-policy-mutation-${{ matrix.policy_engine }}" || true diff --git a/install/kubernetes/tracecore/README.md b/install/kubernetes/tracecore/README.md index e2dd75cc..d7fd8238 100644 --- a/install/kubernetes/tracecore/README.md +++ b/install/kubernetes/tracecore/README.md @@ -586,3 +586,50 @@ chart's deviations from a literal reading of `restricted`: Each deviation is bounded by the conftest policy: the policy only permits SYS_PTRACE, never relaxes hostPID/hostIPC/hostNetwork, and fails the CI gate on any privileged container. + +### Live-cluster policy validation + +In addition to the chart-side conftest gate, the +[`.github/workflows/policy-matrix.yml`](../../../.github/workflows/policy-matrix.yml) +workflow gates every chart edit against three enterprise-shape +admission engines running on a real `kind` cluster: + +| Engine | Bundle | What it asserts | +| --- | --- | --- | +| Pod Security Admission (`restricted`) | namespace label `pod-security.kubernetes.io/enforce=restricted` | KEP-2579 in-tree restricted profile | +| Kyverno | `kyverno/policies` `pod-security/baseline` + `pod-security/restricted` (Enforce mode) | Upstream curated PSS bundle | +| OPA Gatekeeper | `open-policy-agent/gatekeeper-library` PSP constraint templates | Privileged, host-namespace, capabilities, allow-privilege-escalation, read-only-rootfs, host-filesystem, runAsNonRoot | + +Each matrix row runs `helm install --dry-run=server` so the API +server's admission chain — not a local linter — decides whether +the chart admits cleanly. The matrix runs across two values +profiles: + +- `default` — chart `values.yaml` straight off the shelf. +- `production` — `values-production.yaml` layered on top + (v1.0-rc1 cut-criteria-10 preset; NetworkPolicy, PDB, + ServiceMonitor, hardened gracePeriod). + +A separate `policy-matrix-mutation` job applies a deliberately +broken values overlay (`allowPrivilegeEscalation: true`) and +asserts every engine rejects it — that's the falsifier that proves +the policy bundles are actually enforcing, not silently no-op. + +Engine and bundle versions are pinned in +[`scripts/policy-matrix-smoke.sh`](../../../scripts/policy-matrix-smoke.sh); +bumping them is an explicit, reviewed code change. + +**Reproducing a failure locally.** When the CI gate trips, repro +against a pre-provisioned `kind` cluster: + +```bash +# Pick one engine (no need to install all three locally). +export POLICY_ENGINE=kyverno # or psa | gatekeeper +# Optional: layer the production preset on top of chart defaults. +export VALUES_FILE=install/kubernetes/tracecore/values-production.yaml +bash scripts/policy-matrix-smoke.sh +``` + +The script prints the engine-level denial verbatim before exiting +non-zero; the field path in the denial names the chart values key +that needs the fix. diff --git a/scripts/policy-matrix-smoke.sh b/scripts/policy-matrix-smoke.sh index 66bbe145..bebac51b 100755 --- a/scripts/policy-matrix-smoke.sh +++ b/scripts/policy-matrix-smoke.sh @@ -21,6 +21,23 @@ set -euo pipefail : "${CHART_PATH:=install/kubernetes/tracecore}" : "${RELEASE_NAME:=tracecore}" : "${TARGET_NAMESPACE:=tracecore-system}" +# Optional values overlay. Empty = chart defaults; set to a path +# (e.g. `install/kubernetes/tracecore/values-production.yaml`) to +# layer the production preset on top of chart defaults before the +# dry-run hits the admission chain. This is the falsifier for the +# A+ requirement in the original #138 task brief: the chart MUST +# admit clean under the production-hardened preset, not just the +# default values. +: "${VALUES_FILE:=}" +# Skip the final helm-dry-run smoke step. Used by the +# `policy-matrix-mutation` workflow job, which provisions the engine +# via this script and then runs its own `kubectl apply --dry-run=server` +# against a known-bad fixture. The smoke step is not useful there +# because the chart's `values.schema.json` pins +# `allowPrivilegeEscalation: const false`, so a helm-shaped mutation +# would be rejected by the chart schema before reaching the API +# server's policy chain. +: "${SKIP_SMOKE:=}" # Pinned upstream policy-bundle versions. Bumping these is an explicit # code change reviewed in PR — we do NOT chase "latest" silently. @@ -261,7 +278,14 @@ EOF # ---------------------------------------------------------------- Smoke smoke_install() { - log "helm install --dry-run=server (engine=${POLICY_ENGINE})" + local values_arg=() + local profile_label="default" + if [ -n "${VALUES_FILE}" ]; then + [ -f "${VALUES_FILE}" ] || fail "VALUES_FILE not found: ${VALUES_FILE}" + values_arg=(-f "${VALUES_FILE}") + profile_label="$(basename "${VALUES_FILE}")" + fi + log "helm install --dry-run=server (engine=${POLICY_ENGINE} profile=${profile_label})" # --dry-run=server hits the API server's admission chain — that's the # whole point: it exercises Kyverno / Gatekeeper / PSA in the same # path a real install would, without actually scheduling pods. @@ -270,12 +294,13 @@ smoke_install() { --create-namespace \ --dry-run=server \ --debug \ + "${values_arg[@]}" \ > /tmp/policy-matrix-dryrun.out 2>&1 || { - log "helm dry-run FAILED for engine=${POLICY_ENGINE}:" + log "helm dry-run FAILED for engine=${POLICY_ENGINE} profile=${profile_label}:" sed 's/^/ /' /tmp/policy-matrix-dryrun.out - fail "tracecore chart violates ${POLICY_ENGINE} policy gates" + fail "tracecore chart violates ${POLICY_ENGINE} policy gates (profile=${profile_label})" } - log "OK: chart admitted under ${POLICY_ENGINE}" + log "OK: chart admitted under ${POLICY_ENGINE} (profile=${profile_label})" } case "${POLICY_ENGINE}" in @@ -285,4 +310,9 @@ case "${POLICY_ENGINE}" in *) fail "unknown POLICY_ENGINE=${POLICY_ENGINE} (want psa|kyverno|gatekeeper)" ;; esac +if [ -n "${SKIP_SMOKE}" ]; then + log "SKIP_SMOKE set; engine installed, skipping helm dry-run smoke" + exit 0 +fi + smoke_install