Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 135 additions & 4 deletions .github/workflows/policy-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,47 @@ concurrency:

jobs:
policy-matrix:
name: ${{ matrix.engine }}
name: ${{ matrix.engine }} / ${{ matrix.values_profile }}
runs-on: ubuntu-latest
timeout-minutes: 20
strategy:
# We want all three rows to run even if one fails — operators
# We want every row to run even if one fails — operators
# want to know "which gate did we fail?" not "we failed".
fail-fast: false
matrix:
# Two-dimensional matrix:
# policy engine × values profile
# The `default` profile uses the chart's `values.yaml` straight
# off the shelf — the smallest install an operator could do.
# The `production` profile layers
# `install/kubernetes/tracecore/values-production.yaml` on top
# — the v1.0-rc1 cut-criteria-10 preset that adds
# NetworkPolicy, PDB, ServiceMonitor, hardened gracePeriod,
# and pinned image policy. The A+ requirement from the #138
# task brief is explicit: validate production values against
# real policy engines, not just defaults.
include:
- engine: psa-restricted
policy_engine: psa
values_profile: default
- engine: kyverno-baseline-restricted
policy_engine: kyverno
values_profile: default
- engine: gatekeeper-restricted
policy_engine: gatekeeper
values_profile: default
- engine: psa-restricted
policy_engine: psa
values_profile: production
values_file: install/kubernetes/tracecore/values-production.yaml
- engine: kyverno-baseline-restricted
policy_engine: kyverno
values_profile: production
values_file: install/kubernetes/tracecore/values-production.yaml
- engine: gatekeeper-restricted
policy_engine: gatekeeper
values_profile: production
values_file: install/kubernetes/tracecore/values-production.yaml
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Install helm
Expand All @@ -65,14 +91,15 @@ jobs:
with:
version: v0.25.0
node_image: kindest/node:v1.32.0
cluster_name: tracecore-policy-${{ matrix.policy_engine }}
cluster_name: tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }}
- name: Sanity — kubectl reaches the cluster
run: |
kubectl cluster-info
kubectl version
- name: Smoke — install policy engine + helm dry-run tracecore chart
env:
POLICY_ENGINE: ${{ matrix.policy_engine }}
VALUES_FILE: ${{ matrix.values_file }}
run: bash scripts/policy-matrix-smoke.sh
- name: Collect engine logs on failure
if: failure()
Expand All @@ -94,4 +121,108 @@ jobs:
# cancellation; explicit teardown matches the install-bench
# workflow's pattern (V4 reviewer note).
if: always()
run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}" || true
run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }}" || true

policy-matrix-mutation:
# Falsifier for the whole policy-matrix gate. A green
# `policy-matrix` row tells us nothing on its own: if the policy
# bundle is a no-op (forgot to switch Kyverno to Enforce, forgot to
# apply the Gatekeeper constraints, forgot the PSA label), the
# chart admits clean and CI lies. This job takes a known-bad
# DaemonSet (`bad-allowprivilegeescalation.yaml` from the
# chart's conftest testdata — `allowPrivilegeEscalation: true`),
# applies it with `kubectl apply --dry-run=server` against a
# namespace governed by the engine under test, and asserts the
# API server admission chain rejects it. If any engine admits
# the mutated manifest, the bundle is broken — fail loudly.
#
# The mutation check itself bypasses `helm` entirely: the chart's
# `values.schema.json` pins `allowPrivilegeEscalation: const false`,
# so helm itself would reject the mutated values before the API
# server saw the manifest. The point of the mutation gate is to
# exercise the API server's policy engine, not the chart schema
# (the conftest gate in `chart.yml` already covers that).
name: mutation / ${{ matrix.policy_engine }}
runs-on: ubuntu-latest
timeout-minutes: 20
strategy:
# One row per engine; fail-fast is off so every engine reports
# its own verdict even when another row fails.
fail-fast: false
matrix:
include:
- policy_engine: psa
- policy_engine: kyverno
- policy_engine: gatekeeper
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Install helm
# NOTE(review): the mutation check below never calls helm;
# presumably scripts/policy-matrix-smoke.sh needs it to install
# the policy engines — confirm against the script.
uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0
with:
version: v3.16.4
- name: Create kind cluster
uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0
with:
version: v0.25.0
node_image: kindest/node:v1.32.0
# Per-engine cluster name; the teardown step at the end of this
# job deletes the cluster by this same name.
cluster_name: tracecore-policy-mutation-${{ matrix.policy_engine }}
- name: Provision policy engine + restricted namespace
env:
POLICY_ENGINE: ${{ matrix.policy_engine }}
# SKIP_SMOKE skips the helm dry-run smoke at the end of the
# script. This job only needs the engine installed; the
# actual mutation check runs in the next step.
SKIP_SMOKE: "1"
run: bash scripts/policy-matrix-smoke.sh
- name: Assert engine REJECTS the known-bad DaemonSet
# Reuses the conftest testdata fixture so we have ONE
# source-of-truth bad manifest. The step passes only when the
# server-side dry-run exits non-zero AND its output matches one
# of the spellings accepted by the grep below
# (`allowPrivilegeEscalation` with either case for the A/P/E
# letters, `privilege-escalation`, or `privilegeEscalation`).
# Matching on the field name rather than engine-specific text
# keeps one assertion valid for PSA, Kyverno and Gatekeeper.
# NOTE(review): Kubernetes documents PSA `enforce` as evaluated
# on Pod objects, with workload resources such as DaemonSets
# covered only by `warn`/`audit`. Confirm the `psa` row really
# gets a non-zero exit for this DaemonSet fixture — TODO verify.
run: |
set -eo pipefail
fixture=install/kubernetes/tracecore/policies/conftest/testdata/bad-allowprivilegeescalation.yaml
test -f "$fixture" || { echo "::error::mutation fixture not found: $fixture"; exit 1; }
# install_psa() creates tracecore-system; install_kyverno /
# install_gatekeeper do not. Ensure it exists in all three
# engines so the dry-run sees the same namespace shape.
# PSA label is namespace-scoped and only present when
# POLICY_ENGINE=psa — the other two engines gate via
# ClusterPolicy / Constraint match selectors on workload
# kinds, which match regardless of namespace.
kubectl create namespace tracecore-system --dry-run=client -o yaml | kubectl apply -f -
rc=0
kubectl apply --dry-run=server \
-n tracecore-system \
-f "$fixture" \
> /tmp/mutation.out 2>&1 || rc=$?
echo "::group::dry-run output"
cat /tmp/mutation.out
echo "::endgroup::"
if [ "$rc" -eq 0 ]; then
echo "::error::engine '${{ matrix.policy_engine }}' admitted bad-allowprivilegeescalation.yaml — policy bundle is no-op"
exit 1
fi
if ! grep -qE '[Aa]llow[Pp]rivilege[Ee]scalation|privilege-escalation|privilegeEscalation' /tmp/mutation.out; then
echo "::error::engine '${{ matrix.policy_engine }}' rejected but denial message did not name allowPrivilegeEscalation"
exit 1
fi
echo "ok: engine '${{ matrix.policy_engine }}' rejected the mutated manifest with the expected field-name in the denial"
- name: Collect engine logs on failure
# Best-effort diagnostics: Kyverno and Gatekeeper are both
# queried regardless of which engine this row installed, and
# every command ends in `|| true` so a missing namespace or CRD
# cannot fail this step.
if: failure()
run: |
echo "::group::events (all-namespaces)"
kubectl get events -A --sort-by=.lastTimestamp | tail -100 || true
echo "::endgroup::"
echo "::group::kyverno logs"
kubectl -n kyverno logs -l app.kubernetes.io/component=admission-controller --tail=200 || true
echo "::endgroup::"
echo "::group::gatekeeper logs"
kubectl -n gatekeeper-system logs -l control-plane=controller-manager --tail=200 || true
echo "::endgroup::"
echo "::group::constraints"
kubectl get constraints -A || true
echo "::endgroup::"
- name: Tear down kind cluster
# Runs on success, failure and cancellation (`if: always()`);
# `|| true` keeps an already-deleted cluster from failing the job.
if: always()
run: kind delete cluster --name "tracecore-policy-mutation-${{ matrix.policy_engine }}" || true
47 changes: 47 additions & 0 deletions install/kubernetes/tracecore/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -586,3 +586,50 @@ chart's deviations from a literal reading of `restricted`:
Each deviation is bounded by the conftest policy: the policy only
permits SYS_PTRACE, never relaxes hostPID/hostIPC/hostNetwork, and
fails the CI gate on any privileged container.

### Live-cluster policy validation

In addition to the chart-side conftest gate, the
[`.github/workflows/policy-matrix.yml`](../../../.github/workflows/policy-matrix.yml)
workflow gates every chart edit against three enterprise-shape
admission engines running on a real `kind` cluster:

| Engine | Bundle | What it asserts |
| --- | --- | --- |
| Pod Security Admission (`restricted`) | namespace label `pod-security.kubernetes.io/enforce=restricted` | KEP-2579 in-tree restricted profile |
| Kyverno | `kyverno/policies` `pod-security/baseline` + `pod-security/restricted` (Enforce mode) | Upstream curated PSS bundle |
| OPA Gatekeeper | `open-policy-agent/gatekeeper-library` PSP constraint templates | Privileged, host-namespace, capabilities, allow-privilege-escalation, read-only-rootfs, host-filesystem, runAsNonRoot |

Each matrix row runs `helm install --dry-run=server` so the API
server's admission chain — not a local linter — decides whether
the chart admits cleanly. The matrix runs across two values
profiles:

- `default` — chart `values.yaml` straight off the shelf.
- `production` — `values-production.yaml` layered on top
(v1.0-rc1 cut-criteria-10 preset; NetworkPolicy, PDB,
ServiceMonitor, hardened gracePeriod, pinned image policy).

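A separate `policy-matrix-mutation` job applies a known-bad
DaemonSet fixture (`bad-allowprivilegeescalation.yaml` from the
conftest testdata, which sets `allowPrivilegeEscalation: true`)
with `kubectl apply --dry-run=server` and asserts every engine
rejects it. The job bypasses `helm` on purpose: the chart's
`values.schema.json` pins `allowPrivilegeEscalation` to `false`,
so a broken values overlay would be rejected by the chart schema
before it ever reached the API server's admission chain. This
is the falsifier that proves the policy bundles are actually
enforcing, not silently no-op.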

Engine and bundle versions are pinned in
[`scripts/policy-matrix-smoke.sh`](../../../scripts/policy-matrix-smoke.sh);
bumping them is an explicit, reviewed code change.

**Reproducing a failure locally.** When the CI gate trips, repro
against a pre-provisioned `kind` cluster:

```bash
# Pick one engine (no need to install all three locally).
export POLICY_ENGINE=kyverno # or psa | gatekeeper
# Optional: layer the production preset on top of chart defaults.
export VALUES_FILE=install/kubernetes/tracecore/values-production.yaml
bash scripts/policy-matrix-smoke.sh
```

The script prints the engine-level denial verbatim before exiting
non-zero; the field path in the denial names the chart values key
that needs the fix.
38 changes: 34 additions & 4 deletions scripts/policy-matrix-smoke.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,23 @@ set -euo pipefail
: "${CHART_PATH:=install/kubernetes/tracecore}"
: "${RELEASE_NAME:=tracecore}"
: "${TARGET_NAMESPACE:=tracecore-system}"
# Optional values overlay. Empty = chart defaults; set to a path
# (e.g. `install/kubernetes/tracecore/values-production.yaml`) to
# layer the production preset on top of chart defaults before the
# dry-run hits the admission chain. This is the falsifier for the
# A+ requirement in the original #138 task brief: the chart MUST
# admit clean under the production-hardened preset, not just the
# default values.
: "${VALUES_FILE:=}"
# Skip the final helm-dry-run smoke step. Used by the
# `policy-matrix-mutation` workflow job, which provisions the engine
# via this script and then runs its own `kubectl apply --dry-run=server`
# against a known-bad fixture. The smoke step is not useful there
# because the chart's `values.schema.json` pins
# `allowPrivilegeEscalation: const false`, so a helm-shaped mutation
# would be rejected by the chart schema before reaching the API
# server's policy chain.
: "${SKIP_SMOKE:=}"

# Pinned upstream policy-bundle versions. Bumping these is an explicit
# code change reviewed in PR — we do NOT chase "latest" silently.
Expand Down Expand Up @@ -261,7 +278,14 @@ EOF

# ---------------------------------------------------------------- Smoke
smoke_install() {
log "helm install --dry-run=server (engine=${POLICY_ENGINE})"
local values_arg=()
local profile_label="default"
if [ -n "${VALUES_FILE}" ]; then
[ -f "${VALUES_FILE}" ] || fail "VALUES_FILE not found: ${VALUES_FILE}"
values_arg=(-f "${VALUES_FILE}")
profile_label="$(basename "${VALUES_FILE}")"
fi
log "helm install --dry-run=server (engine=${POLICY_ENGINE} profile=${profile_label})"
# --dry-run=server hits the API server's admission chain — that's the
# whole point: it exercises Kyverno / Gatekeeper / PSA in the same
# path a real install would, without actually scheduling pods.
Expand All @@ -270,12 +294,13 @@ smoke_install() {
--create-namespace \
--dry-run=server \
--debug \
"${values_arg[@]}" \
> /tmp/policy-matrix-dryrun.out 2>&1 || {
log "helm dry-run FAILED for engine=${POLICY_ENGINE}:"
log "helm dry-run FAILED for engine=${POLICY_ENGINE} profile=${profile_label}:"
sed 's/^/ /' /tmp/policy-matrix-dryrun.out
fail "tracecore chart violates ${POLICY_ENGINE} policy gates"
fail "tracecore chart violates ${POLICY_ENGINE} policy gates (profile=${profile_label})"
}
log "OK: chart admitted under ${POLICY_ENGINE}"
log "OK: chart admitted under ${POLICY_ENGINE} (profile=${profile_label})"
}

case "${POLICY_ENGINE}" in
Expand All @@ -285,4 +310,9 @@ case "${POLICY_ENGINE}" in
*) fail "unknown POLICY_ENGINE=${POLICY_ENGINE} (want psa|kyverno|gatekeeper)" ;;
esac

if [ -n "${SKIP_SMOKE}" ]; then
log "SKIP_SMOKE set; engine installed, skipping helm dry-run smoke"
exit 0
fi

smoke_install
Loading