From 50662585461496ae97a88d59b6bdd78dd2e79a61 Mon Sep 17 00:00:00 2001
From: Tri Lam
Date: Tue, 19 May 2026 01:22:56 -0700
Subject: [PATCH] [ci] cpusteal_test: relax hang-sentinel bounds (flake-pattern audit)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Caught by the flake-pattern audit (FOLLOWUPS entry post-PR #76).
`require.Less(elapsed, 500ms)` for a 100ms request and
`require.Less(elapsed, 250ms)` for a context-cancel were both calibrated
to fast-runner expectations — same shape as the SLI and SetDegraded
flakes already fixed this session. Under GH Actions runner contention,
scheduler delays on a busy-loop or context-cancellation latency can
exceed those bounds without any regression in the receiver under test.

Relaxed both upper bounds to 2s as hang sentinels rather than perf
bounds. The lower-bound assertion on `TestRun_HonorsDuration`
(`elapsed >= 95ms`) still pins the real contract (busy-loop ran for the
requested time); the upper bound just catches "never returned". Same fix
shape applied to `TestRun_HonorsContextCancellation`.

Local: 3 isolated runs under -race, all 4 cpusteal tests PASS.
`make lint` clean, `make vet` clean.

Anchor for the audit: `AGENTS.md` lesson "Match perf-budget assertions
by the invariant only" (PR #81); FOLLOWUPS § "CI flake hygiene".

Signed-off-by: Tri Lam
---
 tools/failure-inject/cpusteal/cpusteal_test.go | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tools/failure-inject/cpusteal/cpusteal_test.go b/tools/failure-inject/cpusteal/cpusteal_test.go
index d3557e34..95b47742 100644
--- a/tools/failure-inject/cpusteal/cpusteal_test.go
+++ b/tools/failure-inject/cpusteal/cpusteal_test.go
@@ -13,9 +13,9 @@ import (
 )
 
 // TestRun_HonorsDuration pins the duration contract: a 100ms request
-// must complete in [95ms, 500ms] window — lower bound is the busy-loop
-// minimum, upper bound is the scheduling-slack ceiling beyond which
-// the harness is misbehaving.
+// must busy-loop at least that long and must terminate — the upper
+// bound is a hang sentinel, not a perf bound, so it is generous
+// enough to absorb GH Actions runner contention.
 func TestRun_HonorsDuration(t *testing.T) {
 	t.Parallel()
 	start := time.Now()
@@ -25,12 +25,14 @@ func TestRun_HonorsDuration(t *testing.T) {
 	}))
 	elapsed := time.Since(start)
 	require.GreaterOrEqual(t, elapsed, 95*time.Millisecond, "must busy-loop for at least the requested duration")
-	require.Less(t, elapsed, 500*time.Millisecond, "must not run substantially past the requested duration")
+	require.Less(t, elapsed, 2*time.Second, "must terminate (hang sentinel; not a perf bound)")
 }
 
 // TestRun_HonorsContextCancellation pins the early-exit contract: a
-// cancelled context returns context.Canceled within ~50ms regardless
-// of the configured Duration.
+// cancelled context returns context.Canceled and Run terminates. The
+// upper bound is a hang sentinel, not a perf bound — context-cancel
+// responsiveness under runner contention can vary by an order of
+// magnitude. Tighten only after sustained CI evidence.
 func TestRun_HonorsContextCancellation(t *testing.T) {
 	t.Parallel()
 	ctx, cancel := context.WithCancel(context.Background())
@@ -45,7 +47,7 @@ func TestRun_HonorsContextCancellation(t *testing.T) {
 	})
 	elapsed := time.Since(start)
 	require.ErrorIs(t, err, context.Canceled)
-	require.Less(t, elapsed, 250*time.Millisecond, "must abort quickly on context cancel")
+	require.Less(t, elapsed, 2*time.Second, "must terminate on cancel (hang sentinel; not a perf bound)")
 }
 
 // TestRun_RejectsZeroDuration pins the input-validation contract.