From 5b37089c40da41cb990b0ad47954e688e3e9830a Mon Sep 17 00:00:00 2001 From: Akash Sinha Date: Fri, 5 Jun 2026 15:55:00 +0530 Subject: [PATCH 1/3] fix(ci): re-run only failed specs on Windows test retries (PER-9011) Windows @percy/core CI took ~241 min vs ~19 min on Linux. The cause was not a single slow run: ~1 of 1078 node specs flakes per run on Windows (browser/local-server timing), and windows.yml retried the entire ~60-min suite up to 4x, with continue-on-error masking the failures as green. Make retries cheap instead of fighting every flaky spec: - scripts/test.js records failed spec names to PERCY_NODE_FAILURES_FILE and, when PERCY_ONLY_FAILED_SPECS=1, uses a jasmine specFilter to re-run only those specs. A filtered run reports "incomplete" (rest skipped), so success there means the targeted specs passed; full runs keep jasmine's strict status. No behavior change when the env vars are unset (Linux/local). - windows.yml: retry0 runs the full suite; retries 1-4 re-run only the specs that just failed (seconds, not ~60 min). - Add a ::warning:: when a retry recovers a flake so "green via retry" stays visible. Verified locally against the real @percy/core suite: a 28-spec failure set re-ran in 12s vs 1304s for the full 1078-spec suite. Expected on Windows: ~241 min -> ~62 min, full coverage kept. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/windows.yml | 22 +++++++++++++++- .gitignore | 3 +++ scripts/test.js | 47 ++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 7028a2867..554e61337 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -58,6 +58,10 @@ jobs: - '@percy/monitoring' - '@percy/cli-doctor' runs-on: windows-latest + # Collect failed node-test spec names here so retries can re-run only the + # specs that flaked instead of the whole ~60-min suite. See PER-9011. + env: + PERCY_NODE_FAILURES_FILE: ${{ github.workspace }}/.percy-node-failures.json steps: - uses: actions/checkout@v5 - uses: actions/setup-node@v3 @@ -81,26 +85,42 @@ jobs: name: dist path: packages - run: yarn + # First attempt runs the full suite and records any failed specs. - name: Run tests continue-on-error: true id: retry0 run: yarn workspace ${{ matrix.package }} test --colors - - name: Run tests Retry (1/3) + # Retries re-run ONLY the specs that failed in the previous attempt + # (PERCY_ONLY_FAILED_SPECS), so a flaky spec costs seconds, not ~60 min. + - name: Run tests Retry (1/4) continue-on-error: true id: retry1 if: steps.retry0.outcome=='failure' + env: + PERCY_ONLY_FAILED_SPECS: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (2/4) continue-on-error: true id: retry2 if: steps.retry1.outcome=='failure' + env: + PERCY_ONLY_FAILED_SPECS: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (3/4) continue-on-error: true id: retry3 if: steps.retry2.outcome=='failure' + env: + PERCY_ONLY_FAILED_SPECS: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (4/4) id: retry4 if: steps.retry3.outcome=='failure' + env: + PERCY_ONLY_FAILED_SPECS: '1' run: yarn workspace ${{ matrix.package }} test --colors + # Keep "green via retry" honest: surface a warning whenever the first + # attempt failed and a retry recovered it, so flakiness stays visible. + - name: Flag flaky tests + if: steps.retry0.outcome=='failure' + run: echo "::warning title=Flaky Windows tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)." diff --git a/.gitignore b/.gitignore index bb9542884..db52aa81b 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,6 @@ bstack-ai-harness.yml CLAUDE.md .claude/ # bstack-ai-harness:end + +# spec-level retry bookkeeping (CI only, see PER-9011) +.percy-node-failures.json diff --git a/scripts/test.js b/scripts/test.js index b733898b8..fe8c4a683 100644 --- a/scripts/test.js +++ b/scripts/test.js @@ -125,8 +125,53 @@ async function main({ } })); + // Spec-level retry support for flaky environments (notably Windows CI, + // where ~1 spec out of 1000+ flakes per run on browser/server timing). + // When PERCY_NODE_FAILURES_FILE is set we record every failed spec name; + // a follow-up run with PERCY_ONLY_FAILED_SPECS=1 re-runs only those specs, + // turning a ~60-min full-suite retry into a few seconds. See PER-9011. + let failuresFile = process.env.PERCY_NODE_FAILURES_FILE; + let onlyFailed = process.env.PERCY_ONLY_FAILED_SPECS === '1'; + let priorFailures = []; + + if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) { + try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* run all on unreadable file */ } + } + + if (onlyFailed && priorFailures.length) { + let retrySet = new Set(priorFailures); + jasmine.env.configure({ specFilter: spec => retrySet.has(spec.getFullName()) }); + console.log(colors.yellow(`Re-running ${priorFailures.length} previously-failed spec(s)...\n`)); + } + + let failures = []; + jasmine.addReporter({ + specDone: result => result.status === 'failed' && failures.push(result.fullName) + }); + console.log(colors.magenta('Running node tests...\n')); - await jasmine.execute(); + + // manage the exit code ourselves so failures can be persisted first + jasmine.exitOnCompletion = false; + let result; + + try { + result = await jasmine.execute(); + } finally { + if (failuresFile) { + try { fs.writeFileSync(failuresFile, JSON.stringify(failures)); } catch { /* best-effort */ } + } + } + + // When re-running only previously-failed specs, jasmine reports the run as + // "incomplete" (the rest are intentionally skipped) — so success here means + // the targeted specs passed, i.e. no new failures. A normal full run keeps + // jasmine's strict status (incomplete/failed both count as failure). + let passed = onlyFailed && priorFailures.length + ? failures.length === 0 + : (result ? result.overallStatus === 'passed' : failures.length === 0); + + process.exit(passed ? 0 : 1); } else if (testBrowsers) { // $ karma start --config /karma.config.js let { default: Karma } = await import('karma'); From ffeee0096f1ddc096778bdce095f33f5c06c0d53 Mon Sep 17 00:00:00 2001 From: Akash Sinha Date: Fri, 5 Jun 2026 17:16:49 +0530 Subject: [PATCH 2/3] fix(ci): rename retry env vars off the PERCY_ prefix (PER-9011) cli-doctor's env-audit check flags any process.env key starting with PERCY_, so injecting PERCY_NODE_FAILURES_FILE at the job level broke its "no Percy vars are set" specs on Windows CI. Rename to CLI_TEST_FAILURES_FILE and CLI_TEST_ONLY_FAILED (non-PERCY_) so the retry plumbing stays invisible to Percy-env detection. Verified locally: cli-doctor 508/508 pass with the renamed vars (both specs fail when a PERCY_-prefixed var is injected, confirming the cause); spec-level retry behavior unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/windows.yml | 13 +++++++------ .gitignore | 2 +- scripts/test.js | 10 ++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 554e61337..8aa1ba4c2 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -60,8 +60,9 @@ jobs: runs-on: windows-latest # Collect failed node-test spec names here so retries can re-run only the # specs that flaked instead of the whole ~60-min suite. See PER-9011. + # Name is deliberately non-PERCY_ so it doesn't trip env-audit tests. env: - PERCY_NODE_FAILURES_FILE: ${{ github.workspace }}/.percy-node-failures.json + CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json steps: - uses: actions/checkout@v5 - uses: actions/setup-node@v3 @@ -91,33 +92,33 @@ jobs: id: retry0 run: yarn workspace ${{ matrix.package }} test --colors # Retries re-run ONLY the specs that failed in the previous attempt - # (PERCY_ONLY_FAILED_SPECS), so a flaky spec costs seconds, not ~60 min. + # (CLI_TEST_ONLY_FAILED), so a flaky spec costs seconds, not ~60 min. - name: Run tests Retry (1/4) continue-on-error: true id: retry1 if: steps.retry0.outcome=='failure' env: - PERCY_ONLY_FAILED_SPECS: '1' + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (2/4) continue-on-error: true id: retry2 if: steps.retry1.outcome=='failure' env: - PERCY_ONLY_FAILED_SPECS: '1' + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (3/4) continue-on-error: true id: retry3 if: steps.retry2.outcome=='failure' env: - PERCY_ONLY_FAILED_SPECS: '1' + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (4/4) id: retry4 if: steps.retry3.outcome=='failure' env: - PERCY_ONLY_FAILED_SPECS: '1' + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors # Keep "green via retry" honest: surface a warning whenever the first # attempt failed and a retry recovered it, so flakiness stays visible. diff --git a/.gitignore b/.gitignore index db52aa81b..a050b3ede 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,4 @@ CLAUDE.md # bstack-ai-harness:end # spec-level retry bookkeeping (CI only, see PER-9011) -.percy-node-failures.json +.cli-test-failures.json diff --git a/scripts/test.js b/scripts/test.js index fe8c4a683..dc635153d 100644 --- a/scripts/test.js +++ b/scripts/test.js @@ -127,11 +127,13 @@ async function main({ // Spec-level retry support for flaky environments (notably Windows CI, // where ~1 spec out of 1000+ flakes per run on browser/server timing). - // When PERCY_NODE_FAILURES_FILE is set we record every failed spec name; - // a follow-up run with PERCY_ONLY_FAILED_SPECS=1 re-runs only those specs, + // When CLI_TEST_FAILURES_FILE is set we record every failed spec name; + // a follow-up run with CLI_TEST_ONLY_FAILED=1 re-runs only those specs, // turning a ~60-min full-suite retry into a few seconds. See PER-9011. - let failuresFile = process.env.PERCY_NODE_FAILURES_FILE; - let onlyFailed = process.env.PERCY_ONLY_FAILED_SPECS === '1'; + // NB: these env vars are intentionally NOT prefixed with PERCY_ so they do + // not leak into env-audit / Percy-env detection tests (cli-doctor). + let failuresFile = process.env.CLI_TEST_FAILURES_FILE; + let onlyFailed = process.env.CLI_TEST_ONLY_FAILED === '1'; let priorFailures = []; if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) { From 9c81399d1c8414d67c320c33b4081fefde0d6f2b Mon Sep 17 00:00:00 2001 From: Akash Sinha Date: Mon, 8 Jun 2026 12:46:26 +0530 Subject: [PATCH 3/3] feat(ci): extend spec-level retry to the Linux test workflow (PER-9011) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review feedback: the spec-level retry was only wired into windows.yml, but the runner support in scripts/test.js is platform-agnostic. Wire it into test.yml (Linux) too — Linux flakes on the same timing-sensitive specs and has no retry today. Linux runs test:coverage with a 100% coverage gate (.nycrc), which needs care: - retry0 runs `test:coverage` (full suite + coverage gate, records failed specs) - retries 1-4 run plain `test` on ONLY the failed specs, WITHOUT coverage (a subset can't hit the 100% threshold) - if retry0 fails with no recorded spec failures (a real coverage drop or a crash, not a flaky spec), the runner now preserves the failure instead of masking it Verified locally: empty-failures retry -> exit 1 (preserved); real-failure retry -> runs only that spec, exit 0; the coverage path writes the failures file that retries consume. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/test.yml | 45 ++++++++++++++++++++++++++++++++++++++ scripts/test.js | 14 +++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 951ca2ea1..3045d61cc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -59,6 +59,11 @@ jobs: - '@percy/monitoring' - '@percy/cli-doctor' runs-on: ${{ matrix.os }} + # Collect failed node-test spec names so retries re-run only the specs that + # flaked instead of the whole suite. Non-PERCY_ name so it doesn't trip + # cli-doctor's env-audit tests. See PER-9011. + env: + CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json steps: - uses: actions/checkout@v5 - uses: actions/setup-node@v3 @@ -87,8 +92,48 @@ jobs: sudo apt-get update sudo apt-get install -y --fix-missing libgbm-dev if: ${{ matrix.os == 'ubuntu-latest' }} + # First attempt runs the full suite WITH coverage (enforces the 100% + # gate) and records any failed specs. - name: Run tests + continue-on-error: true + id: retry0 run: yarn workspace ${{ matrix.package }} test:coverage --colors + # Retries re-run ONLY the specs that failed in the previous attempt, and + # WITHOUT coverage (a subset can't hit the 100% threshold). If retry0 + # failed with no recorded spec failures (e.g. a real coverage drop), the + # runner preserves that failure instead of masking it. See PER-9011. + - name: Run tests Retry (1/4) + continue-on-error: true + id: retry1 + if: steps.retry0.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + - name: Run tests Retry (2/4) + continue-on-error: true + id: retry2 + if: steps.retry1.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + - name: Run tests Retry (3/4) + continue-on-error: true + id: retry3 + if: steps.retry2.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + - name: Run tests Retry (4/4) + id: retry4 + if: steps.retry3.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + # Keep "green via retry" honest: surface a warning whenever the first + # attempt failed and a retry recovered it, so flakiness stays visible. + - name: Flag flaky tests + if: steps.retry0.outcome=='failure' + run: echo "::warning title=Flaky tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)." regression: name: Regression diff --git a/scripts/test.js b/scripts/test.js index dc635153d..64cde2f8a 100644 --- a/scripts/test.js +++ b/scripts/test.js @@ -137,10 +137,18 @@ async function main({ let priorFailures = []; if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) { - try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* run all on unreadable file */ } + try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* treated as no recorded failures below */ } } - if (onlyFailed && priorFailures.length) { + // A retry was requested but the previous run recorded no failed specs. That + // means the failure was not a flaky spec (e.g. a coverage-threshold failure + // or a crash) -- re-running a subset would mask it, so preserve the failure. + if (onlyFailed && !priorFailures.length) { + console.log(colors.yellow('No recorded spec failures to retry — preserving the previous failure.')); + process.exit(1); + } + + if (onlyFailed) { let retrySet = new Set(priorFailures); jasmine.env.configure({ specFilter: spec => retrySet.has(spec.getFullName()) }); console.log(colors.yellow(`Re-running ${priorFailures.length} previously-failed spec(s)...\n`)); @@ -169,7 +177,7 @@ async function main({ // "incomplete" (the rest are intentionally skipped) — so success here means // the targeted specs passed, i.e. no new failures. A normal full run keeps // jasmine's strict status (incomplete/failed both count as failure). - let passed = onlyFailed && priorFailures.length + let passed = onlyFailed ? failures.length === 0 : (result ? result.overallStatus === 'passed' : failures.length === 0);