TraceCoreAI · trilamsr · Jun 2, 2026 · Jun 2, 2026
diff --git a/.github/workflows/policy-matrix.yml b/.github/workflows/policy-matrix.yml
@@ -39,21 +39,47 @@ concurrency:
 
 jobs:
   policy-matrix:
-    name: ${{ matrix.engine }}
+    name: ${{ matrix.engine }} / ${{ matrix.values_profile }}
     runs-on: ubuntu-latest
     timeout-minutes: 20
     strategy:
-      # We want all three rows to run even if one fails — operators
+      # We want every row to run even if one fails — operators
       # want to know "which gate did we fail?" not "we failed".
       fail-fast: false
       matrix:
+        # Two-dimensional matrix:
+        #   policy engine × values profile
+        # The `default` profile uses the chart's `values.yaml` straight
+        # off the shelf — the smallest install an operator could do.
+        # The `production` profile layers
+        # `install/kubernetes/tracecore/values-production.yaml` on top
+        # — the v1.0-rc1 cut-criteria-10 preset that adds
+        # NetworkPolicy, PDB, ServiceMonitor, hardened gracePeriod,
+        # and pinned image policy. The A+ requirement from the #138
+        # task brief is explicit: validate production values against
+        # real policy engines, not just defaults.
         include:
           - engine: psa-restricted
             policy_engine: psa
+            values_profile: default
           - engine: kyverno-baseline-restricted
             policy_engine: kyverno
+            values_profile: default
           - engine: gatekeeper-restricted
             policy_engine: gatekeeper
+            values_profile: default
+          - engine: psa-restricted
+            policy_engine: psa
+            values_profile: production
+            values_file: install/kubernetes/tracecore/values-production.yaml
+          - engine: kyverno-baseline-restricted
+            policy_engine: kyverno
+            values_profile: production
+            values_file: install/kubernetes/tracecore/values-production.yaml
+          - engine: gatekeeper-restricted
+            policy_engine: gatekeeper
+            values_profile: production
+            values_file: install/kubernetes/tracecore/values-production.yaml
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Install helm
@@ -65,14 +91,15 @@ jobs:
         with:
           version: v0.25.0
           node_image: kindest/node:v1.32.0
-          cluster_name: tracecore-policy-${{ matrix.policy_engine }}
+          cluster_name: tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }}
       - name: Sanity — kubectl reaches the cluster
         run: |
           kubectl cluster-info
           kubectl version
       - name: Smoke — install policy engine + helm dry-run tracecore chart
         env:
           POLICY_ENGINE: ${{ matrix.policy_engine }}
+          VALUES_FILE: ${{ matrix.values_file }}
         run: bash scripts/policy-matrix-smoke.sh
       - name: Collect engine logs on failure
         if: failure()
@@ -94,4 +121,108 @@ jobs:
         # cancellation; explicit teardown matches the install-bench
         # workflow's pattern (V4 reviewer note).
         if: always()
-        run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}" || true
+        run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}-${{ matrix.values_profile }}" || true
+
+  policy-matrix-mutation:
+    # Falsifier for the whole policy-matrix gate. A green
+    # `policy-matrix` row tells us nothing on its own: if the policy
+    # bundle is no-op (forgot to switch Kyverno to Enforce, forgot to
+    # apply the Gatekeeper constraints, forgot the PSA label), the
+    # chart admits clean and CI lies. This job takes a known-bad
+    # DaemonSet (`bad-allowprivilegeescalation.yaml` from the
+    # chart's conftest testdata — `allowPrivilegeEscalation: true`),
+    # applies it with `kubectl apply --dry-run=server` against a
+    # namespace governed by the engine under test, and asserts the
+    # API server admission chain rejects it. If any engine admits
+    # the mutated manifest, the bundle is broken — fail loudly.
+    #
+    # We bypass `helm` entirely here: the chart's
+    # `values.schema.json` pins `allowPrivilegeEscalation: const false`,
+    # so helm itself would reject the mutated values before the API
+    # server saw the manifest. The point of the mutation gate is to
+    # exercise the API server's policy engine, not the chart schema
+    # (the conftest gate in `chart.yml` already covers that).
+    name: mutation / ${{ matrix.policy_engine }}
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - policy_engine: psa
+          - policy_engine: kyverno
+          - policy_engine: gatekeeper
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - name: Install helm
+        uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2  # v5.0.0
+        with:
+          version: v3.16.4
+      - name: Create kind cluster
+        uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc  # v1.14.0
+        with:
+          version: v0.25.0
+          node_image: kindest/node:v1.32.0
+          cluster_name: tracecore-policy-mutation-${{ matrix.policy_engine }}
+      - name: Provision policy engine + restricted namespace
+        env:
+          POLICY_ENGINE: ${{ matrix.policy_engine }}
+          # SKIP_SMOKE skips the helm dry-run smoke at the end of the
+          # script. This job only needs the engine installed; the
+          # actual mutation check runs in the next step.
+          SKIP_SMOKE: "1"
+        run: bash scripts/policy-matrix-smoke.sh
+      - name: Assert engine REJECTS the known-bad DaemonSet
+        # Reuses the conftest testdata fixture so we have ONE
+        # source-of-truth bad manifest. Each engine names the field
+        # differently in its denial (PSA's field name, Kyverno's
+        # policy rule, Gatekeeper's constraint template name), so the
+        # grep below accepts several spellings of "privilege
+        # escalation" rather than engine-specific message text.
+        run: |
+          set -eo pipefail
+          fixture=install/kubernetes/tracecore/policies/conftest/testdata/bad-allowprivilegeescalation.yaml
+          test -f "$fixture" || { echo "::error::mutation fixture not found: $fixture"; exit 1; }
+          # install_psa() creates tracecore-system; install_kyverno and
+          # install_gatekeeper do not. Ensure it exists for all three so
+          # the dry-run sees the same namespace shape. The PSA label is
+          # namespace-scoped and only set when POLICY_ENGINE=psa; the other
+          # two gate via ClusterPolicy / Constraint selectors on workload
+          # kinds in any namespace. NOTE(review): PSA `enforce` is documented
+          # to reject Pods, not workload objects — confirm psa denies this.
+          kubectl create namespace tracecore-system --dry-run=client -o yaml | kubectl apply -f -
+          rc=0
+          kubectl apply --dry-run=server \
+            -n tracecore-system \
+            -f "$fixture" \
+            > /tmp/mutation.out 2>&1 || rc=$?
+          echo "::group::dry-run output"
+          cat /tmp/mutation.out
+          echo "::endgroup::"
+          if [ "$rc" -eq 0 ]; then
+            echo "::error::engine '${{ matrix.policy_engine }}' admitted bad-allowprivilegeescalation.yaml — policy bundle is no-op"
+            exit 1
+          fi
+          if ! grep -qE '[Aa]llow[Pp]rivilege[Ee]scalation|privilege-escalation|privilegeEscalation' /tmp/mutation.out; then
+            echo "::error::engine '${{ matrix.policy_engine }}' rejected but denial message did not name allowPrivilegeEscalation"
+            exit 1
+          fi
+          echo "ok: engine '${{ matrix.policy_engine }}' rejected the mutated manifest with the expected field-name in the denial"
+      - name: Collect engine logs on failure
+        if: failure()
+        run: |
+          echo "::group::events (all-namespaces)"
+          kubectl get events -A --sort-by=.lastTimestamp | tail -100 || true
+          echo "::endgroup::"
+          echo "::group::kyverno logs"
+          kubectl -n kyverno logs -l app.kubernetes.io/component=admission-controller --tail=200 || true
+          echo "::endgroup::"
+          echo "::group::gatekeeper logs"
+          kubectl -n gatekeeper-system logs -l control-plane=controller-manager --tail=200 || true
+          echo "::endgroup::"
+          echo "::group::constraints"
+          kubectl get constraints -A || true
+          echo "::endgroup::"
+      - name: Tear down kind cluster
+        if: always()
+        run: kind delete cluster --name "tracecore-policy-mutation-${{ matrix.policy_engine }}" || true
diff --git a/install/kubernetes/tracecore/README.md b/install/kubernetes/tracecore/README.md
@@ -586,3 +586,50 @@ chart's deviations from a literal reading of `restricted`:
 Each deviation is bounded by the conftest policy: the policy only
 permits SYS_PTRACE, never relaxes hostPID/hostIPC/hostNetwork, and
 fails the CI gate on any privileged container.
+
+### Live-cluster policy validation
+
+In addition to the chart-side conftest gate, the
+[`.github/workflows/policy-matrix.yml`](../../../.github/workflows/policy-matrix.yml)
+workflow gates every chart edit against three enterprise-shape
+admission engines running on a real `kind` cluster:
+
+| Engine | Bundle | What it asserts |
+| --- | --- | --- |
+| Pod Security Admission (`restricted`) | namespace label `pod-security.kubernetes.io/enforce=restricted` | KEP-2579 in-tree restricted profile |
+| Kyverno | `kyverno/policies` `pod-security/baseline` + `pod-security/restricted` (Enforce mode) | Upstream curated PSS bundle |
+| OPA Gatekeeper | `open-policy-agent/gatekeeper-library` PSP constraint templates | Privileged, host-namespace, capabilities, allow-privilege-escalation, read-only-rootfs, host-filesystem, runAsNonRoot |
+
+Each matrix row runs `helm install --dry-run=server` so the API
+server's admission chain — not a local linter — decides whether
+the chart admits cleanly. The matrix runs across two values
+profiles:
+
+- `default` — chart `values.yaml` straight off the shelf.
+- `production` — `values-production.yaml` layered on top
+  (v1.0-rc1 cut-criteria-10 preset; NetworkPolicy, PDB,
+  ServiceMonitor, hardened gracePeriod).
+
+A separate `policy-matrix-mutation` job bypasses helm and submits a
+known-bad DaemonSet fixture (`allowPrivilegeEscalation: true`) with
+`kubectl apply --dry-run=server`, asserting every engine rejects it —
+the falsifier proving the policy bundles enforce, not silently no-op.
+
+Engine and bundle versions are pinned in
+[`scripts/policy-matrix-smoke.sh`](../../../scripts/policy-matrix-smoke.sh);
+bumping them is an explicit, reviewed code change.
+
+**Reproducing a failure locally.** When the CI gate trips, repro
+against a pre-provisioned `kind` cluster:
+
+```bash
+# Pick one engine (no need to install all three locally).
+export POLICY_ENGINE=kyverno    # or psa | gatekeeper
+# Optional: layer the production preset on top of chart defaults.
+export VALUES_FILE=install/kubernetes/tracecore/values-production.yaml
+bash scripts/policy-matrix-smoke.sh
+```
+
+The script prints the engine-level denial verbatim before exiting
+non-zero; the field path in the denial names the chart values key
+that needs the fix.
diff --git a/scripts/policy-matrix-smoke.sh b/scripts/policy-matrix-smoke.sh
@@ -21,6 +21,23 @@ set -euo pipefail
 : "${CHART_PATH:=install/kubernetes/tracecore}"
 : "${RELEASE_NAME:=tracecore}"
 : "${TARGET_NAMESPACE:=tracecore-system}"
+# Optional values overlay. Empty = chart defaults; set to a path
+# (e.g. `install/kubernetes/tracecore/values-production.yaml`) to
+# layer the production preset on top of chart defaults before the
+# dry-run hits the admission chain. This is the falsifier for the
+# A+ requirement in the original #138 task brief: the chart MUST
+# admit clean under the production-hardened preset, not just the
+# default values.
+: "${VALUES_FILE:=}"
+# Skip the final helm-dry-run smoke step. Used by the
+# `policy-matrix-mutation` workflow job, which provisions the engine
+# via this script and then runs its own `kubectl apply --dry-run=server`
+# against a known-bad fixture. The smoke step is not useful there
+# because the chart's `values.schema.json` pins
+# `allowPrivilegeEscalation: const false`, so a helm-shaped mutation
+# would be rejected by the chart schema before reaching the API
+# server's policy chain.
+: "${SKIP_SMOKE:=}"
 
 # Pinned upstream policy-bundle versions. Bumping these is an explicit
 # code change reviewed in PR — we do NOT chase "latest" silently.
@@ -261,7 +278,14 @@ EOF
 
 # ---------------------------------------------------------------- Smoke
 smoke_install() {
-  log "helm install --dry-run=server (engine=${POLICY_ENGINE})"
+  # Empty for the default profile; guarded at expansion for bash < 4.4 + `set -u`.
+  local values_arg=() profile_label="default"
+  if [ -n "${VALUES_FILE}" ]; then
+    [ -f "${VALUES_FILE}" ] || fail "VALUES_FILE not found: ${VALUES_FILE}"
+    values_arg=(-f "${VALUES_FILE}")
+    profile_label="$(basename "${VALUES_FILE}")"
+  fi
+  log "helm install --dry-run=server (engine=${POLICY_ENGINE} profile=${profile_label})"
   # --dry-run=server hits the API server's admission chain — that's the
   # whole point: it exercises Kyverno / Gatekeeper / PSA in the same
   # path a real install would, without actually scheduling pods.
@@ -270,12 +294,13 @@ smoke_install() {
     --create-namespace \
     --dry-run=server \
     --debug \
+    ${values_arg[@]+"${values_arg[@]}"} \
     > /tmp/policy-matrix-dryrun.out 2>&1 || {
-      log "helm dry-run FAILED for engine=${POLICY_ENGINE}:"
+      log "helm dry-run FAILED for engine=${POLICY_ENGINE} profile=${profile_label}:"
       sed 's/^/  /' /tmp/policy-matrix-dryrun.out
-      fail "tracecore chart violates ${POLICY_ENGINE} policy gates"
+      fail "tracecore chart violates ${POLICY_ENGINE} policy gates (profile=${profile_label})"
     }
-  log "OK: chart admitted under ${POLICY_ENGINE}"
+  log "OK: chart admitted under ${POLICY_ENGINE} (profile=${profile_label})"
 }
 
 case "${POLICY_ENGINE}" in
@@ -285,4 +310,9 @@ case "${POLICY_ENGINE}" in
   *)          fail "unknown POLICY_ENGINE=${POLICY_ENGINE} (want psa|kyverno|gatekeeper)" ;;
 esac
 
+if [ -n "${SKIP_SMOKE}" ]; then
+  log "SKIP_SMOKE set; engine installed, skipping helm dry-run smoke"
+  exit 0
+fi
+
 smoke_install