Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions .github/workflows/policy-matrix.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
name: policy-matrix

# Roadmap A12 / closes #138 — kind-cluster gate that asserts the
# tracecore Helm chart passes enterprise-shape policy bundles:
#
#   - PSA (Pod Security Admission, restricted profile, namespace-labeled)
#   - Kyverno (kyverno/policies pod-security baseline + restricted)
#   - Gatekeeper (open-policy-agent/gatekeeper-library PSP templates)
#
# The assertion is `helm install --dry-run=server`: the kind API
# server runs the engine's admission webhook(s) and rejects the
# install on policy violation. We adopt curated upstream bundles
# verbatim — no hand-rolled Rego or Kyverno YAML — per the
# `adopt-over-build` repo policy.
#
# This workflow sits OUTSIDE the `make ci` 60s budget (kind takes
# 2-3 min per row to come up). It runs only when chart shape or
# this workflow itself changes.

on:
  pull_request:
    paths:
      - 'install/kubernetes/tracecore/**'
      - 'scripts/policy-matrix-smoke.sh'
      - '.github/workflows/policy-matrix.yml'
  push:
    branches: [main]
    paths:
      - 'install/kubernetes/tracecore/**'
      - 'scripts/policy-matrix-smoke.sh'
      - '.github/workflows/policy-matrix.yml'

# Least privilege: the job only needs to read the repository.
permissions:
  contents: read

# One run per ref; a newer push supersedes an in-flight run.
concurrency:
  group: policy-matrix-${{ github.ref }}
  cancel-in-progress: true

jobs:
  policy-matrix:
    name: ${{ matrix.engine }}
    runs-on: ubuntu-latest
    timeout-minutes: 20
    strategy:
      # We want all three rows to run even if one fails — operators
      # want to know "which gate did we fail?" not "we failed".
      fail-fast: false
      matrix:
        # `engine` is the display name of the row; `policy_engine` is the
        # value handed to the smoke script as POLICY_ENGINE.
        include:
          - engine: psa-restricted
            policy_engine: psa
          - engine: kyverno-baseline-restricted
            policy_engine: kyverno
          - engine: gatekeeper-restricted
            policy_engine: gatekeeper
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Install helm
        uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0
        with:
          version: v3.16.4
      - name: Create kind cluster
        uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0
        with:
          version: v0.25.0
          node_image: kindest/node:v1.32.0
          cluster_name: tracecore-policy-${{ matrix.policy_engine }}
      - name: Sanity — kubectl reaches the cluster
        run: |
          kubectl cluster-info
          kubectl version
      - name: Smoke — install policy engine + helm dry-run tracecore chart
        env:
          POLICY_ENGINE: ${{ matrix.policy_engine }}
        run: bash scripts/policy-matrix-smoke.sh
      - name: Collect engine logs on failure
        if: failure()
        # Every command is best-effort (`|| true`): only one engine is
        # installed per row, so the other engines' queries are expected
        # to fail.
        run: |
          echo "::group::events (all-namespaces)"
          kubectl get events -A --sort-by=.lastTimestamp | tail -100 || true
          echo "::endgroup::"
          echo "::group::kyverno logs"
          kubectl -n kyverno logs -l app.kubernetes.io/component=admission-controller --tail=200 || true
          echo "::endgroup::"
          echo "::group::gatekeeper logs"
          kubectl -n gatekeeper-system logs -l control-plane=controller-manager --tail=200 || true
          echo "::endgroup::"
          echo "::group::constraints"
          kubectl get constraints -A || true
          echo "::endgroup::"
      - name: Tear down kind cluster
        # helm/kind-action's post-step has known cleanup gaps on
        # cancellation; explicit teardown matches the install-bench
        # workflow's pattern (V4 reviewer note).
        if: always()
        run: kind delete cluster --name "tracecore-policy-${{ matrix.policy_engine }}" || true
288 changes: 288 additions & 0 deletions scripts/policy-matrix-smoke.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
#!/usr/bin/env bash
# policy-matrix-smoke.sh — install the tracecore chart against a
# pre-provisioned kind cluster running ONE of:
#
# POLICY_ENGINE=psa Pod Security Admission (restricted)
# POLICY_ENGINE=kyverno Kyverno + kyverno/policies baseline+restricted
# POLICY_ENGINE=gatekeeper OPA Gatekeeper + gatekeeper-library PSP set
#
# The assertion model is `helm install --dry-run=server`: the API server
# returns admission errors when a policy engine rejects the rendered
# manifests. A non-zero exit means tracecore fails enterprise-shape
# gates and needs a chart-side fix (we don't relax the policy bundles).
#
# Bundle versions are pinned in env vars below so reproductions are
# byte-identical to CI.
#
# Roadmap A12 / closes #138.
# Strict mode: abort on an unhandled non-zero status (-e), on expansion of
# an unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Required input: the script stops right here, printing the message below,
# when POLICY_ENGINE is unset or empty. Whether the value names a supported
# engine is checked later by the engine dispatch at the end of the script.
: "${POLICY_ENGINE:?POLICY_ENGINE must be one of: psa | kyverno | gatekeeper}"
# Chart location, Helm release name and target namespace. Each keeps the
# caller's value when one is exported and non-empty, otherwise the default.
CHART_PATH="${CHART_PATH:-install/kubernetes/tracecore}"
RELEASE_NAME="${RELEASE_NAME:-tracecore}"
TARGET_NAMESPACE="${TARGET_NAMESPACE:-tracecore-system}"

# Upstream policy-bundle pins. A bump is an explicit, PR-reviewed code
# change — "latest" is never followed implicitly.
KYVERNO_HELM_VERSION="${KYVERNO_HELM_VERSION:-3.4.6}" # kyverno/kyverno chart (app v1.14.x)
# Neither kyverno/policies nor gatekeeper-library publishes tagged
# releases, so both are pinned by commit SHA to keep CI byte-reproducible.
# Bump the SHAs in a reviewed PR (refreshed 2026-05-31).
KYVERNO_POLICIES_REF="${KYVERNO_POLICIES_REF:-76be98a25d49ae01278a94ecde8f50f9e08577ef}"
GATEKEEPER_VERSION="${GATEKEEPER_VERSION:-v3.18.2}" # open-policy-agent/gatekeeper release
GATEKEEPER_LIBRARY_REF="${GATEKEEPER_LIBRARY_REF:-53684fab133fd52d77aa42f632bc2ecd52f0447c}"

# Write one progress line to stdout, tagged with the script name.
# Arguments: $* - message words, joined into a single line.
log() {
  local message="$*"
  printf '[policy-matrix] %s\n' "${message}"
}
# Print an `::error::`-prefixed line to stdout and terminate the script
# with exit status 1.
# Arguments: $* - failure reason, joined into a single line.
fail() {
  local reason="$*"
  printf '::error::%s\n' "${reason}"
  exit 1
}

# Abort through `fail` unless the named program can be resolved by the shell.
# Arguments: $1 - command name to look up.
require() {
  if ! command -v "$1" >/dev/null 2>&1; then
    fail "missing required binary: $1"
  fi
}

# Both CLIs are needed by every engine path; check them up front, in the
# same order as before, so a missing tool is reported before any cluster
# call is made.
for required_tool in kubectl helm; do
  require "${required_tool}"
done

# ---------------------------------------------------------------- PSA
# Prepare the PSA row: make sure TARGET_NAMESPACE exists and label it with
# the `restricted` Pod Security profile.
# Globals: TARGET_NAMESPACE (read)
install_psa() {
  # Namespace labels are the upstream-blessed PSA mechanism (KEP-2579,
  # GA 1.25+). `enforce` is the gate; warn + audit are for observability.
  # NOTE(review): upstream documents `enforce` as acting on Pod objects —
  # confirm the chart's workload kinds are actually rejected, not just warned.
  local -a psa_labels=(
    "pod-security.kubernetes.io/enforce=restricted"
    "pod-security.kubernetes.io/enforce-version=latest"
    "pod-security.kubernetes.io/warn=restricted"
    "pod-security.kubernetes.io/audit=restricted"
  )

  log "PSA-restricted: labeling namespace ${TARGET_NAMESPACE} with restricted enforce"

  # `create --dry-run=client -o yaml` only renders the Namespace manifest;
  # `apply` is what sends it to the cluster.
  kubectl create namespace "${TARGET_NAMESPACE}" --dry-run=client -o yaml |
    kubectl apply -f -

  kubectl label --overwrite namespace "${TARGET_NAMESPACE}" "${psa_labels[@]}"
}

# ---------------------------------------------------------------- Kyverno
# Prepare the Kyverno row: install the pinned Kyverno chart, apply the
# upstream pod-security baseline + restricted bundles, and switch every
# ClusterPolicy to Enforce.
# Globals: KYVERNO_HELM_VERSION, KYVERNO_POLICIES_REF (read)
# Exits (via fail) when the ClusterPolicy list cannot be read or is empty.
install_kyverno() {
  log "Kyverno: helm install chart ${KYVERNO_HELM_VERSION}"
  helm repo add kyverno https://kyverno.github.io/kyverno/ >/dev/null
  helm repo update kyverno >/dev/null
  helm install kyverno kyverno/kyverno \
    --namespace kyverno --create-namespace \
    --version "${KYVERNO_HELM_VERSION}" \
    --wait --timeout 5m
  # Wait for the validating webhooks to register; without this a fast
  # `helm install --dry-run=server` races the webhook config and
  # admits manifests the engine would otherwise reject.
  kubectl -n kyverno wait --for=condition=Ready pod \
    -l app.kubernetes.io/component=admission-controller --timeout=3m

  log "Kyverno: applying baseline + restricted policy bundles (ref=${KYVERNO_POLICIES_REF})"
  # kustomization.yaml in each bundle dir aggregates every policy file.
  # kubectl kustomize handles the remote ref so we don't shell out to kustomize.
  local bundle
  for bundle in baseline restricted; do
    kubectl apply -k \
      "https://github.com/kyverno/policies/pod-security/${bundle}?ref=${KYVERNO_POLICIES_REF}"
  done

  # Default policy install action is `audit`. Flip everything to enforce
  # so violations actually block admission (the whole point of this gate).
  log "Kyverno: switching all ClusterPolicies to validationFailureAction=Enforce"
  # The list is captured first and checked explicitly: a command
  # substitution inside a `for` word list is invisible to `set -e`, so a
  # failed or empty listing would otherwise patch nothing and leave the
  # gate silently in audit mode.
  local policy_list
  policy_list=$(kubectl get clusterpolicies -o name) \
    || fail "Kyverno: could not list ClusterPolicies"
  [[ -n "${policy_list}" ]] \
    || fail "Kyverno: no ClusterPolicies found after applying the bundles"

  # NOTE(review): newer Kyverno releases also accept a rule-level
  # `validate.failureAction`; confirm the pinned bundles do not set it, or
  # this spec-level patch may not be the effective setting.
  local policy
  while IFS= read -r policy; do
    [[ -n "${policy}" ]] || continue
    kubectl patch "${policy}" --type merge \
      -p '{"spec":{"validationFailureAction":"Enforce"}}' >/dev/null
  done <<<"${policy_list}"

  # Print name + action per policy so the CI log shows what is enforced.
  kubectl get clusterpolicies \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.validationFailureAction}{"\n"}{end}'
}

# ---------------------------------------------------------------- Gatekeeper
install_gatekeeper() {
log "Gatekeeper: applying upstream release manifest (${GATEKEEPER_VERSION})"
kubectl apply -f \
"https://github.com/open-policy-agent/gatekeeper/${GATEKEEPER_VERSION}/deploy/gatekeeper.yaml"
kubectl -n gatekeeper-system wait --for=condition=Available deploy \
--all --timeout=5m

log "Gatekeeper: installing pod-security-policy constraint templates (ref=${GATEKEEPER_LIBRARY_REF})"
# The minimum-viable set that exercises tracecore's pod shape:
# privileged, host-namespaces, capabilities, allow-privilege-escalation,
# read-only-root-filesystem, host-filesystem, users (runAsNonRoot).
# NOTE: paths use `template.yaml` per gatekeeper-library v3 layout.
local base="https://github.com/open-policy-agent/gatekeeper-library/${GATEKEEPER_LIBRARY_REF}/library/pod-security-policy"
local templates=(
"privileged-containers/template.yaml"
"host-namespaces/template.yaml"
"capabilities/template.yaml"
"allow-privilege-escalation/template.yaml"
"read-only-root-filesystem/template.yaml"
"host-filesystem/template.yaml"
"users/template.yaml"
)
for t in "${templates[@]}"; do
kubectl apply -f "${base}/${t}"
done

# Constraint templates compile to CRDs asynchronously; wait for the
# last one to register before applying constraints.
log "Gatekeeper: waiting for constraint CRDs"
for k in K8sPSPPrivilegedContainer K8sPSPHostNamespace K8sPSPCapabilities \
K8sPSPAllowPrivilegeEscalationContainer K8sPSPReadOnlyRootFilesystem \
K8sPSPHostFilesystem K8sPSPAllowedUsers; do
for _ in $(seq 1 30); do
if kubectl get crd "$(echo "$k" | tr '[:upper:]' '[:lower:]').constraints.gatekeeper.sh" >/dev/null 2>&1; then
break
fi
sleep 2
done
done

log "Gatekeeper: applying minimum enforce constraints"
cat <<'EOF' | kubectl apply -f -
apiVersion: constraints.gatekeeper.sh/v1beta1
kind: K8sPSPPrivilegedContainer
metadata:
name: tracecore-psp-privileged
spec:
enforcementAction: deny
match:
kinds:
- apiGroups: ["apps"]
kinds: ["DaemonSet", "Deployment", "StatefulSet"]
- apiGroups: [""]
kinds: ["Pod"]
---
apiVersion: constraints.gatekeeper.sh/v1beta1
kind: K8sPSPHostNamespace
metadata:
name: tracecore-psp-host-namespace
spec:
enforcementAction: deny
match:
kinds:
- apiGroups: ["apps"]
kinds: ["DaemonSet", "Deployment", "StatefulSet"]
- apiGroups: [""]
kinds: ["Pod"]
---
apiVersion: constraints.gatekeeper.sh/v1beta1
kind: K8sPSPCapabilities
metadata:
name: tracecore-psp-capabilities
spec:
enforcementAction: deny
match:
kinds:
- apiGroups: ["apps"]
kinds: ["DaemonSet", "Deployment", "StatefulSet"]
- apiGroups: [""]
kinds: ["Pod"]
parameters:
requiredDropCapabilities: ["ALL"]
allowedCapabilities: ["SYS_PTRACE"]
---
apiVersion: constraints.gatekeeper.sh/v1beta1
kind: K8sPSPAllowPrivilegeEscalationContainer
metadata:
name: tracecore-psp-allow-privilege-escalation
spec:
enforcementAction: deny
match:
kinds:
- apiGroups: ["apps"]
kinds: ["DaemonSet", "Deployment", "StatefulSet"]
- apiGroups: [""]
kinds: ["Pod"]
---
apiVersion: constraints.gatekeeper.sh/v1beta1
kind: K8sPSPReadOnlyRootFilesystem
metadata:
name: tracecore-psp-read-only-root
spec:
enforcementAction: deny
match:
kinds:
- apiGroups: ["apps"]
kinds: ["DaemonSet", "Deployment", "StatefulSet"]
- apiGroups: [""]
kinds: ["Pod"]
---
apiVersion: constraints.gatekeeper.sh/v1beta1
kind: K8sPSPHostFilesystem
metadata:
name: tracecore-psp-host-filesystem
spec:
enforcementAction: deny
match:
kinds:
- apiGroups: ["apps"]
kinds: ["DaemonSet", "Deployment", "StatefulSet"]
- apiGroups: [""]
kinds: ["Pod"]
parameters:
allowedHostPaths: []
---
apiVersion: constraints.gatekeeper.sh/v1beta1
kind: K8sPSPAllowedUsers
metadata:
name: tracecore-psp-allowed-users
spec:
enforcementAction: deny
match:
kinds:
- apiGroups: ["apps"]
kinds: ["DaemonSet", "Deployment", "StatefulSet"]
- apiGroups: [""]
kinds: ["Pod"]
parameters:
runAsUser:
rule: MustRunAsNonRoot
runAsGroup:
rule: MayRunAs
ranges:
- {min: 1, max: 65535}
fsGroup:
rule: MayRunAs
ranges:
- {min: 1, max: 65535}
supplementalGroups:
rule: MayRunAs
ranges:
- {min: 1, max: 65535}
EOF

# Webhook readiness — same race as Kyverno.
kubectl -n gatekeeper-system wait --for=condition=Ready pod \
-l control-plane=controller-manager --timeout=2m
# Constraints need ~30s to compile+sync after apply. Poll the audit
# status field rather than `sleep`.
for _ in $(seq 1 30); do
if kubectl get k8spspprivilegedcontainer tracecore-psp-privileged \
-o jsonpath='{.status.byPod[*].enforced}' 2>/dev/null | grep -q true; then
break
fi
sleep 2
done
}

# ---------------------------------------------------------------- Smoke
# Run the tracecore chart through `helm install --dry-run=server` and turn
# a non-zero helm exit into a failed gate.
# Globals: POLICY_ENGINE, RELEASE_NAME, CHART_PATH, TARGET_NAMESPACE (read)
# Outputs: progress lines on stdout; on failure, helm's captured output.
# Exits (via fail) when helm exits non-zero.
smoke_install() {
  log "helm install --dry-run=server (engine=${POLICY_ENGINE})"

  # Private, unpredictable capture file instead of a fixed name in /tmp, so
  # parallel or repeated local runs cannot clobber each other.
  local dryrun_log
  dryrun_log=$(mktemp "${TMPDIR:-/tmp}/policy-matrix-dryrun.XXXXXX") \
    || fail "could not create a temp file for the helm dry-run output"

  # --dry-run=server hits the API server's admission chain — that's the
  # whole point: it exercises Kyverno / Gatekeeper / PSA in the same
  # path a real install would, without actually scheduling pods.
  # NOTE(review): confirm this holds for the pinned Helm 3.x. Helm's docs
  # describe `--dry-run=server` as allowing cluster connections; if the
  # rendered objects are never submitted, admission is not exercised and
  # the manifest should go through `kubectl apply --dry-run=server` instead.
  if ! helm install "${RELEASE_NAME}" "${CHART_PATH}" \
    --namespace "${TARGET_NAMESPACE}" \
    --create-namespace \
    --dry-run=server \
    --debug \
    >"${dryrun_log}" 2>&1; then
    log "helm dry-run FAILED for engine=${POLICY_ENGINE}:"
    sed 's/^/ /' "${dryrun_log}"
    # `fail` exits, so remove the capture file before calling it.
    rm -f -- "${dryrun_log}"
    fail "tracecore chart violates ${POLICY_ENGINE} policy gates"
  fi

  rm -f -- "${dryrun_log}"
  log "OK: chart admitted under ${POLICY_ENGINE}"
}

# Install the selected engine, then run the chart dry-run against it.
# Only the three exact engine names are accepted; anything else aborts.
if [[ "${POLICY_ENGINE}" == "psa" ]]; then
  install_psa
elif [[ "${POLICY_ENGINE}" == "kyverno" ]]; then
  install_kyverno
elif [[ "${POLICY_ENGINE}" == "gatekeeper" ]]; then
  install_gatekeeper
else
  fail "unknown POLICY_ENGINE=${POLICY_ENGINE} (want psa|kyverno|gatekeeper)"
fi

smoke_install