Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/chart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ jobs:
with:
cluster-name: tracecore-m5b
- name: helm install + measure install-to-Ready
id: install
run: |
set -eo pipefail
start=$(date +%s)
Expand All @@ -474,9 +475,35 @@ jobs:
end=$(date +%s)
dur=$((end - start))
echo "install_to_ready_seconds=$dur" >> "$GITHUB_OUTPUT"
# Persist the per-run sample so the rolling-median aggregator
# (M3 carry-forward, docs/MILESTONES.md L209) can download it
# via `gh run download` from the next CI run. Sibling pattern:
# PR #446's bench-cv-rolling artifact pipeline.
mkdir -p helm-install-artifacts
printf '%s\n' "$dur" > helm-install-artifacts/install_to_ready_seconds.txt
{
echo "sha=${GITHUB_SHA}"
echo "run_id=${GITHUB_RUN_ID}"
echo "timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "runner=ubuntu-latest"
} > helm-install-artifacts/metadata.txt
echo "::notice::install-to-Ready: ${dur}s (rubric: ≤300s)"
test "$dur" -le 300 \
|| { echo "::error::install-to-Ready ${dur}s exceeds 300s rubric"; exit 1; }
- name: Upload helm-install duration artifact (M3 #209 carry-forward)
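# Feeds `scripts/helm-install-rolling.sh` so the 10-run median
# gate can graduate ⧗ → ☑ once 10 successful main-branch runs
# have accumulated artifacts. `if: always()` so a single-run
# 300s breach (which exits the previous step non-zero) still
# uploads its sample — the rolling-median view is more useful
# with the regression-run data point included than without it.
# NOTE: the aggregator currently lists runs with `--status=success`
# only, so a sample uploaded by a failed run is kept for inspection
# but is not folded into the median until that filter is relaxed.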
if: always() && steps.install.outputs.install_to_ready_seconds != ''
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: helm-install-duration-${{ github.run_id }}
path: helm-install-artifacts/
if-no-files-found: warn
retention-days: 90
- name: "helm status — STATUS: deployed"
run: |
status=$(helm status tracecore --namespace tracecore-system | grep '^STATUS:' | awk '{print $2}')
Expand Down
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
.PHONY: help build clean hooks

# Test suites
.PHONY: test test-extras test-extras-sustained test-extras-fuzz test-extras-fuzz-kmsg test-extras-fuzz-journald test-extras-fuzz-nccl-fr test-extras-race bench bench-check bench-allocs-check bench-baseline bench-detectors bench-detectors-check bench-detectors-baseline bench-cv-report
.PHONY: test test-extras test-extras-sustained test-extras-fuzz test-extras-fuzz-kmsg test-extras-fuzz-journald test-extras-fuzz-nccl-fr test-extras-race bench bench-check bench-allocs-check bench-baseline bench-detectors bench-detectors-check bench-detectors-baseline bench-cv-report helm-install-rolling-report

# Format + tidy
.PHONY: fmt fmt-fix vet lint lint-fix tidy tidy-check mod-verify bump-otel
Expand Down Expand Up @@ -111,6 +111,12 @@ bench-cv-report: ## Print per-detector allocs/op CV across the last N bench.yml
@# /tmp/tracecore-bench-artifacts/ — safe to wipe between sessions.
scripts/bench-cv-rolling.sh

# Thin wrapper: runs scripts/helm-install-rolling.sh and lets its exit
# status decide the target's result. `N` is read from the environment by
# the script. Note that the script exits 0 with an informational message
# when `gh` is missing or no runs can be listed, so a green target does
# not by itself prove that the median gate was evaluated.
helm-install-rolling-report: ## Median helm install + DaemonSet Ready across the last N chart.yml runs on main (M3 #209 carry-forward). Exits non-zero if median >300s. Requires `gh` auth; offline when unauthed.
@# N override: `N=20 make helm-install-rolling-report`. Cache lives
@# under /tmp/tracecore-helm-install-artifacts/. Sibling pattern to
@# `make bench-cv-report` (PR #446).
scripts/helm-install-rolling.sh


fmt: ## Check formatting; fails if any file is not gofumpt-clean.
@# gofumpt has no native exclude flag; filter ./_build/ (OCB-generated,
Expand Down
2 changes: 1 addition & 1 deletion docs/MILESTONES.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ Critical path to v0.1.0; the only lane in which a single milestone (M21) gates e
- ☑ Rendered pod spec passes the Kubernetes `restricted` Pod Security Standard except for explicit `SYS_PTRACE` and the host-path mounts required by receivers; deviation list is enumerated in the chart README with a one-line justification per item. (per https://kubernetes.io/docs/concepts/security/pod-security-standards/)
- ☑ DaemonSet template sets `securityContext.runAsNonRoot: true`, a non-zero `runAsUser`, `seccompProfile.type: RuntimeDefault`, `allowPrivilegeEscalation: false`; CI asserts each field via `yq`/grep gate. (per NORTHSTARS O2)
- ☑ `Chart.yaml` declares `apiVersion: v2`, a SemVer `version`, and an `appVersion` matching the tracecore binary tag; CI gate fails on drift. (per PRINCIPLES §15)
- ⧗ `helm install` plus DaemonSet `Ready` on a single-node kind cluster completes in ≤5 min median across 10 CI runs. *(Single-run ≤300s gate in `chart.yml`; 10-run median aggregation is the carry-forward.)* (per NORTHSTARS O2 hero-KPI)
- ⧗ `helm install` plus DaemonSet `Ready` on a single-node kind cluster completes in ≤5 min median across 10 CI runs. *(Single-run ≤300s gate in `chart.yml`; 10-run median aggregation now live via `scripts/helm-install-rolling.sh` + per-run `helm-install-duration-<run_id>` artifact upload in `chart.yml` — flips ⧗ → ☑ once 10 successful main-branch runs have accumulated artifacts. Sibling pattern: PR #446's `bench-cv-rolling`. Operator entry point: `make helm-install-rolling-report`.)* (per NORTHSTARS O2 hero-KPI)

### M20. Reference-cluster install benchmark (staged)

Expand Down
25 changes: 25 additions & 0 deletions install/kubernetes/tracecore/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,31 @@ kubectl label namespace tracecore-system \
pod-security.kubernetes.io/warn=restricted
```

**`make helm-install-rolling-report` reports median above 300s.** The
M3 carry-forward rubric (`docs/MILESTONES.md` L209) requires the
`helm install` + DaemonSet `Ready` wall-clock to land at a median ≤5
min across 10 successful CI runs. `chart.yml`'s `install` job uploads
each run's `helm-install-duration-<run_id>` artifact; the script
`scripts/helm-install-rolling.sh` (operator entry point: `make
helm-install-rolling-report`) downloads the last 10 via `gh run
download` and computes the median.

When the median trips the 300s gate:

1. Run `make helm-install-rolling-report` locally to see per-run
samples. A borderline median (~290-310s) usually indicates flake
noise; a sustained overshoot indicates a real regression.
2. If a single run jumped to 400-500s, `gh run view <id> --log` and
look for image-pull or probe-misconfig stalls in the kind-up step.
3. If every run jumped, suspect a chart template edit. `git bisect`
between the last-green run sha and the first-red run sha against
`install/kubernetes/tracecore/`.

The single-run ≤300s gate is the hard fail inside the workflow; the
rolling-median view is the carry-forward layer that flips ⧗ → ☑ once
10 successful main-branch runs have artifacts. Sibling pattern: PR
#446's `bench-cv-rolling` for per-detector allocs/op CV.

## Pod Security Standard compliance

The chart targets the Kubernetes [`restricted`](https://kubernetes.io/docs/concepts/security/pod-security-standards/)
Expand Down
231 changes: 231 additions & 0 deletions scripts/helm-install-rolling.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
#!/usr/bin/env bash
# helm-install-rolling.sh — rolling median of `helm install` plus
# DaemonSet Ready wall-clock across the last N successful chart.yml
# runs on main. Closes the M3 carry-forward (docs/MILESTONES.md L209):
# "≤5 min median across 10 CI runs". The single-run ≤300s gate already
# lives in `.github/workflows/chart.yml`; this is the 10-run
# aggregation layer.
#
# Sibling pattern: scripts/bench-cv-rolling.sh (PR #446) does the same
# shape — download last N artifacts via `gh run download`, parse a
# single numeric per artifact, aggregate. Differences:
# * scope: install-to-Ready duration per run (one sample), not 10
# bench samples per detector per run
# * statistic: median (matches MILESTONES.md wording "≤5 min median")
# rather than CV
# * gate: ≤300s (matches single-run threshold so the aggregation can
# graduate from advisory to hard-fail without redefining the rubric)
#
# How it works:
# 1. List the last N successful runs of `.github/workflows/chart.yml`
# via `gh run list`.
# 2. Download each run's `helm-install-duration-<run_id>` artifact via
# `gh run download`. Cached locally in $TC_HELM_INSTALL_CACHE_DIR
# (default /tmp/tracecore-helm-install-artifacts).
# 3. Read install_to_ready_seconds.txt (single integer) per artifact.
# 4. Print every sample + median across N runs; exit 0 if median ≤
# 300, exit 1 if median > 300.
#
# Edge cases (parity with bench-cv-rolling):
# - Missing artifacts (older runs predating this PR) skipped with a
# one-line note; the script still produces a report from whatever
# runs do have artifacts.
# - n_runs < 10: prints a "need ≥10 runs" note but does NOT fail
# merely for being under-sampled (the carry-forward flips ⧗ → ☑
# "once 10 runs accumulate"). The exit code is still decided by the
# median of whatever samples are available.
# - Garbage content in an artifact (non-integer): skip that run,
# continue aggregating; do not crash. Bench-cv-rolling handles
# the equivalent via the awk allocs/op-line-only grep.
# - Offline / no `gh`: prints a "no rolling data available" message
# and exits 0 (not a failure — the offline operator just gets the
# fallback view). Sibling bench-cv-rolling.sh falls back to
# baselines.json; this script has no equivalent single-sample
# source, so the fallback is informational.
#
# Failure-mode debug recipe (when CI flips this script red):
# 1. Pull last 10 runs locally: `make helm-install-rolling-report`.
# 2. If median is borderline (~290-310s), inspect per-run samples
# printed in the report — flake noise vs sustained regression.
# 3. If a single run jumped to 400-500s, download its kind-up logs
# via `gh run view <id> --log` and look for image-pull / probe-
# misconfig stalls.
# 4. If every run jumped, suspect a chart template edit — `git
# bisect` between the last-green run sha and the first-red run
# sha against `install/kubernetes/tracecore/`.
#
# Usage:
# scripts/helm-install-rolling.sh # last 10 runs
# N=20 scripts/helm-install-rolling.sh # last 20 runs
# scripts/helm-install-rolling.sh --dir /path/to/dir # offline, parse
# # local dir of
# # install_to_
# # ready_seconds
# # .txt files
#
# Portability: bash 3.2 (macOS stock) — no associative arrays, no
# mapfile, no readarray.
set -euo pipefail

# Tunables, all overridable from the environment:
#   N                          number of most-recent runs to aggregate
#   WORKFLOW                   workflow file whose runs are listed
#   TC_HELM_INSTALL_CACHE_DIR  directory where artifacts are cached
N="${N:-10}"
WORKFLOW="${WORKFLOW:-chart.yml}"
CACHE_DIR="${TC_HELM_INSTALL_CACHE_DIR:-/tmp/tracecore-helm-install-artifacts}"
mode="ci"
local_dir=""

# print_usage [FILE] — print the leading comment header of FILE (default:
# this script). Skips the shebang and stops at the first line that is not
# a comment, so the help text cannot drift into code when the header
# grows or shrinks (a fixed `sed -n 'A,Bp'` range would).
print_usage() {
  awk 'NR == 1 { next } !/^#/ { exit } { print }' "${1:-$0}"
}

# parse_args ARGS... — interpret the command line.
#   --dir PATH   switch to offline mode; sets mode=local, local_dir=PATH
#   --help, -h   print the header comment and exit 0
# Any other argument, or --dir without a value, is a usage error (exit 2).
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dir)
        # Check arity explicitly: under `set -u` a bare "$2" would abort
        # with an unhelpful "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "helm-install-rolling: --dir requires a path argument" >&2
          exit 2
        fi
        mode="local"
        local_dir="$2"
        shift 2
        ;;
      --help|-h)
        print_usage
        exit 0
        ;;
      *)
        echo "helm-install-rolling: unknown flag $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# N is handed to `gh run list --limit`. A non-numeric or zero value would
# make that call fail, and the failure is deliberately swallowed further
# down — so reject it here instead of reporting "no runs" with exit 0.
if ! [[ "$N" =~ ^[1-9][0-9]*$ ]]; then
  echo "helm-install-rolling: N must be a positive integer (got '$N')" >&2
  exit 2
fi

# Ensure the artifact cache exists and start the per-invocation run
# counter at zero.
mkdir -p "$CACHE_DIR"
runs_seen=0

case "$mode" in
  local)
    if ! [[ -d "$local_dir" ]]; then
      echo "helm-install-rolling: --dir path '$local_dir' does not exist" >&2
      exit 2
    fi
    # Offline mode: every *.txt found under --dir counts as one "run".
    # Each file is staged into the cache as local-<k>/ under the
    # canonical sample file name, in sorted path order.
    sample_idx=0
    while IFS= read -r src_file; do
      sample_idx=$((sample_idx + 1))
      staged_dir="$CACHE_DIR/local-$sample_idx"
      mkdir -p "$staged_dir"
      cp "$src_file" "$staged_dir/install_to_ready_seconds.txt"
      runs_seen=$((runs_seen + 1))
    done < <(find "$local_dir" -type f -name '*.txt' | sort)
    ;;
  *)
    # Online mode needs the GitHub CLI. Its absence is not treated as a
    # failure: print why there is no data and leave with status 0.
    if ! command -v gh >/dev/null 2>&1; then
      echo "helm-install-rolling: gh CLI not in PATH; no rolling data available" >&2
      echo " (Sibling bench-cv-rolling.sh falls back to baselines.json;" >&2
      echo " no equivalent single-sample source exists for install duration.)" >&2
      exit 0
    fi
    ;;
esac

# fetch_ci_samples — list the last $N successful main-branch runs of
# $WORKFLOW and make sure each run's `helm-install-duration-<run_id>`
# artifact is present under $CACHE_DIR/run-<run_id>/.
#
# Globals read:    WORKFLOW, N, CACHE_DIR
# Globals written: run_ids   (whitespace-separated run database IDs)
#                  runs_seen (incremented once per run with a sample file)
# Exits 0 with a notice when no runs are listed or none has an artifact.
#
# The run IDs are extracted with gh's built-in `--jq` filter rather than
# an external `jq` binary: with the external tool a missing `jq` was
# indistinguishable from "no runs found" and the script exited 0.
fetch_ci_samples() {
  local run_id run_dir

  # stderr is suppressed and failure tolerated on purpose: an offline or
  # unauthenticated `gh` degrades to the "no runs" notice below.
  run_ids=$(gh run list \
    --workflow="$WORKFLOW" \
    --status=success \
    --branch=main \
    --limit="$N" \
    --json=databaseId \
    --jq='.[].databaseId' 2>/dev/null || true)

  if [[ -z "$run_ids" ]]; then
    echo "helm-install-rolling: no successful main-branch runs found for $WORKFLOW" >&2
    echo " (artifact pipeline likely not landed on main yet — check #444-style follow-up)" >&2
    exit 0
  fi

  # Intentional word-splitting: run_ids is a list of numeric IDs.
  for run_id in $run_ids; do
    run_dir="$CACHE_DIR/run-$run_id"

    # Already cached by an earlier invocation — no download needed.
    if [[ -f "$run_dir/install_to_ready_seconds.txt" ]]; then
      runs_seen=$((runs_seen + 1))
      continue
    fi

    mkdir -p "$run_dir"
    if gh run download "$run_id" \
      --name="helm-install-duration-$run_id" \
      --dir="$run_dir" 2>/dev/null; then
      if [[ -f "$run_dir/install_to_ready_seconds.txt" ]]; then
        runs_seen=$((runs_seen + 1))
      else
        echo " skip run $run_id (artifact present but empty)" >&2
      fi
    else
      echo " skip run $run_id (no helm-install artifact — pre-#445 or expired)" >&2
    fi
  done

  if [[ "$runs_seen" -eq 0 ]]; then
    echo "helm-install-rolling: 0 runs had artifacts (gate not yet primed)" >&2
    exit 0
  fi
}

if [[ "$mode" == "ci" ]]; then
  fetch_ci_samples
fi

# Gather the parseable samples for THIS invocation into a scratch file
# (one integer per line, unsorted — sorting happens at median time).
#
# Only the directories selected by the current invocation are read:
# run-<id>/ for each ID in $run_ids (CI mode) or local-1..local-$runs_seen
# (--dir mode). Globbing the whole cache would pull in runs that have
# rolled out of the last-N window, leftovers from an earlier larger N,
# and stale local-* directories, skewing the "last N runs" median.
samples=$(mktemp)
trap 'rm -f "$samples"' EXIT

valid_runs=0

# record_sample FILE — if FILE exists and its first line is a
# non-negative integer (surrounding whitespace tolerated), append the
# value to $samples and bump valid_runs. Non-integer content is reported
# on stderr and skipped; a missing file is ignored.
record_sample() {
  local sample_file="$1"
  local val

  [[ -f "$sample_file" ]] || return 0

  val=$(head -n 1 "$sample_file" | tr -d '[:space:]')
  if [[ "$val" =~ ^[0-9]+$ ]]; then
    echo "$val" >> "$samples"
    valid_runs=$((valid_runs + 1))
  else
    echo " skip $sample_file (non-integer content: '$val')" >&2
  fi
}

# collect_samples — feed every sample file belonging to this invocation
# to record_sample.
# Globals read: mode, runs_seen, run_ids, CACHE_DIR
collect_samples() {
  local k run_id

  if [[ "$mode" == "local" ]]; then
    k=1
    while [[ "$k" -le "$runs_seen" ]]; do
      record_sample "$CACHE_DIR/local-$k/install_to_ready_seconds.txt"
      k=$((k + 1))
    done
  else
    # Intentional word-splitting: run_ids is a list of numeric IDs.
    for run_id in ${run_ids:-}; do
      record_sample "$CACHE_DIR/run-$run_id/install_to_ready_seconds.txt"
    done
  fi
}

collect_samples

# Runs were seen but none yielded a usable integer: that is a data
# problem, not a pass, so leave with the usage/data error status.
if (( valid_runs == 0 )); then
  echo "helm-install-rolling: collected $runs_seen runs but 0 parsed (bad artifacts?)" >&2
  exit 2
fi

# Median of the numerically sorted samples. Odd count: the middle value.
# Even count: the mean of the two middle values, which can land on a
# half. Whole results are printed without decimals (145, not
# 145.000000); fractional ones keep one decimal place (145.5).
sorted=$(sort -n "$samples")
median=$(awk '
  { v[NR] = $1 }
  END {
    if (NR == 0) { exit 1 }
    half = int(NR / 2)
    if (NR % 2 == 1) {
      mid = v[half + 1]
    } else {
      mid = (v[half] + v[half + 1]) / 2
    }
    if (mid == int(mid)) {
      printf "%d\n", mid
    } else {
      printf "%.1f\n", mid
    }
  }
' <<<"$sorted")

# Human-readable report: run count, median, and every sample in sorted
# order on a single space-separated line.
samples_line=$(tr '\n' ' ' <<<"$sorted" | sed 's/ $//')
cat <<EOF
==> helm install + DaemonSet Ready: rolling median (rubric: median ≤ 300s, M3 #209)

n_runs=$valid_runs
median_seconds=$median
samples_sorted=$samples_line

EOF

# Advisory only: fewer than ten samples does not change the exit status.
if (( valid_runs < 10 )); then
  printf '%s\n' \
    "NOTE: need ≥10 runs to flip M3 #209 carry-forward ⧗ → ☑;" \
    " currently $valid_runs run(s) in window."
fi

# Gate: fail only when the median is strictly above 300s. The comparison
# is delegated to awk because the median may carry a decimal (e.g. 300.5),
# which shell integer tests cannot compare.
over_budget=$(awk -v m="$median" 'BEGIN { verdict = (m > 300) ? "yes" : "no"; print verdict }')
if [[ "$over_budget" == "yes" ]]; then
  echo "::error::install-to-Ready rolling median ${median}s exceeds 300s rubric (M3 #209)"
  exit 1
fi

echo "ok: rolling median ${median}s within 300s rubric"
Loading
Loading