From 5b37089c40da41cb990b0ad47954e688e3e9830a Mon Sep 17 00:00:00 2001
From: Akash Sinha <akash.si@browserstack.com>
Date: Fri, 5 Jun 2026 15:55:00 +0530
Subject: [PATCH 1/3] fix(ci): re-run only failed specs on Windows test retries
 (PER-9011)

Windows @percy/core CI took ~241 min vs ~19 min on Linux. The cause was not
a single slow run: ~1 of 1078 node specs flakes per run on Windows
(browser/local-server timing), and windows.yml retried the entire ~60-min
suite up to 4x, with continue-on-error masking the failures as green.

Make retries cheap instead of fighting every flaky spec:
- scripts/test.js records failed spec names to PERCY_NODE_FAILURES_FILE and,
  when PERCY_ONLY_FAILED_SPECS=1, uses a jasmine specFilter to re-run only
  those specs. A filtered run reports "incomplete" (rest skipped), so success
  there means the targeted specs passed; full runs keep jasmine's strict
  status. No behavior change when the env vars are unset (Linux/local).
- windows.yml: retry0 runs the full suite; retries 1-4 re-run only the specs
  that just failed (seconds, not ~60 min).
- Add a ::warning:: when a retry recovers a flake so "green via retry" stays
  visible.

Verified locally against the real @percy/core suite: a 28-spec failure set
re-ran in 12s vs 1304s for the full 1078-spec suite. Expected on Windows:
~241 min -> ~62 min, full coverage kept.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/windows.yml | 22 +++++++++++++++-
 .gitignore                    |  3 +++
 scripts/test.js               | 47 ++++++++++++++++++++++++++++++++++-
 3 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 7028a2867..554e61337 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -58,6 +58,10 @@ jobs:
           - '@percy/monitoring'
           - '@percy/cli-doctor'
     runs-on: windows-latest
+    # Collect failed node-test spec names here so retries can re-run only the
+    # specs that flaked instead of the whole ~60-min suite. See PER-9011.
+    env:
+      PERCY_NODE_FAILURES_FILE: ${{ github.workspace }}/.percy-node-failures.json
     steps:
       - uses: actions/checkout@v5
       - uses: actions/setup-node@v3
@@ -81,26 +85,42 @@ jobs:
           name: dist
           path: packages
       - run: yarn
+      # First attempt runs the full suite and records any failed specs.
       - name: Run tests
         continue-on-error: true
         id: retry0
         run: yarn workspace ${{ matrix.package }} test --colors
-      - name: Run tests Retry (1/3)
+      # Retries re-run ONLY the specs that failed in the previous attempt
+      # (PERCY_ONLY_FAILED_SPECS), so a flaky spec costs seconds, not ~60 min.
+      - name: Run tests Retry (1/4)
         continue-on-error: true
         id: retry1
         if: steps.retry0.outcome=='failure'
+        env:
+          PERCY_ONLY_FAILED_SPECS: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (2/4)
         continue-on-error: true
         id: retry2
         if: steps.retry1.outcome=='failure'
+        env:
+          PERCY_ONLY_FAILED_SPECS: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (3/4)
         continue-on-error: true
         id: retry3
         if: steps.retry2.outcome=='failure'
+        env:
+          PERCY_ONLY_FAILED_SPECS: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (4/4)
         id: retry4
         if: steps.retry3.outcome=='failure'
+        env:
+          PERCY_ONLY_FAILED_SPECS: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
+      # Keep "green via retry" honest: surface a warning whenever the first
+      # attempt failed and a retry recovered it, so flakiness stays visible.
+      - name: Flag flaky tests
+        if: steps.retry0.outcome=='failure'
+        run: echo "::warning title=Flaky Windows tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)."
diff --git a/.gitignore b/.gitignore
index bb9542884..db52aa81b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ bstack-ai-harness.yml
 CLAUDE.md
 .claude/
 # bstack-ai-harness:end
+
+# spec-level retry bookkeeping (CI only, see PER-9011)
+.percy-node-failures.json
diff --git a/scripts/test.js b/scripts/test.js
index b733898b8..fe8c4a683 100644
--- a/scripts/test.js
+++ b/scripts/test.js
@@ -125,8 +125,53 @@ async function main({
       }
     }));
 
+    // Spec-level retry support for flaky environments (notably Windows CI,
+    // where ~1 spec out of 1000+ flakes per run on browser/server timing).
+    // When PERCY_NODE_FAILURES_FILE is set we record every failed spec name;
+    // a follow-up run with PERCY_ONLY_FAILED_SPECS=1 re-runs only those specs,
+    // turning a ~60-min full-suite retry into a few seconds. See PER-9011.
+    let failuresFile = process.env.PERCY_NODE_FAILURES_FILE;
+    let onlyFailed = process.env.PERCY_ONLY_FAILED_SPECS === '1';
+    let priorFailures = [];
+
+    if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) {
+      try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* run all on unreadable file */ }
+    }
+
+    if (onlyFailed && priorFailures.length) {
+      let retrySet = new Set(priorFailures);
+      jasmine.env.configure({ specFilter: spec => retrySet.has(spec.getFullName()) });
+      console.log(colors.yellow(`Re-running ${priorFailures.length} previously-failed spec(s)...\n`));
+    }
+
+    let failures = [];
+    jasmine.addReporter({
+      specDone: result => result.status === 'failed' && failures.push(result.fullName)
+    });
+
     console.log(colors.magenta('Running node tests...\n'));
-    await jasmine.execute();
+
+    // manage the exit code ourselves so failures can be persisted first
+    jasmine.exitOnCompletion = false;
+    let result;
+
+    try {
+      result = await jasmine.execute();
+    } finally {
+      if (failuresFile) {
+        try { fs.writeFileSync(failuresFile, JSON.stringify(failures)); } catch { /* best-effort */ }
+      }
+    }
+
+    // When re-running only previously-failed specs, jasmine reports the run as
+    // "incomplete" (the rest are intentionally skipped) — so success here means
+    // the targeted specs passed, i.e. no new failures. A normal full run keeps
+    // jasmine's strict status (incomplete/failed both count as failure).
+    let passed = onlyFailed && priorFailures.length
+      ? failures.length === 0
+      : (result ? result.overallStatus === 'passed' : failures.length === 0);
+
+    process.exit(passed ? 0 : 1);
   } else if (testBrowsers) {
     // $ karma start --config <root>/karma.config.js
     let { default: Karma } = await import('karma');

From ffeee0096f1ddc096778bdce095f33f5c06c0d53 Mon Sep 17 00:00:00 2001
From: Akash Sinha <akash.si@browserstack.com>
Date: Fri, 5 Jun 2026 17:16:49 +0530
Subject: [PATCH 2/3] fix(ci): rename retry env vars off the PERCY_ prefix
 (PER-9011)

cli-doctor's env-audit check flags any process.env key starting with PERCY_,
so injecting PERCY_NODE_FAILURES_FILE at the job level broke its
"no Percy vars are set" specs on Windows CI. Rename to CLI_TEST_FAILURES_FILE
and CLI_TEST_ONLY_FAILED (non-PERCY_) so the retry plumbing stays invisible to
Percy-env detection.

Verified locally: cli-doctor 508/508 pass with the renamed vars (both specs
fail when a PERCY_-prefixed var is injected, confirming the cause); spec-level
retry behavior unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/windows.yml | 13 +++++++------
 .gitignore                    |  2 +-
 scripts/test.js               | 10 ++++++----
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 554e61337..8aa1ba4c2 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -60,8 +60,9 @@ jobs:
     runs-on: windows-latest
     # Collect failed node-test spec names here so retries can re-run only the
     # specs that flaked instead of the whole ~60-min suite. See PER-9011.
+    # Name is deliberately non-PERCY_ so it doesn't trip env-audit tests.
     env:
-      PERCY_NODE_FAILURES_FILE: ${{ github.workspace }}/.percy-node-failures.json
+      CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json
     steps:
       - uses: actions/checkout@v5
       - uses: actions/setup-node@v3
@@ -91,33 +92,33 @@ jobs:
         id: retry0
         run: yarn workspace ${{ matrix.package }} test --colors
       # Retries re-run ONLY the specs that failed in the previous attempt
-      # (PERCY_ONLY_FAILED_SPECS), so a flaky spec costs seconds, not ~60 min.
+      # (CLI_TEST_ONLY_FAILED), so a flaky spec costs seconds, not ~60 min.
       - name: Run tests Retry (1/4)
         continue-on-error: true
         id: retry1
         if: steps.retry0.outcome=='failure'
         env:
-          PERCY_ONLY_FAILED_SPECS: '1'
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (2/4)
         continue-on-error: true
         id: retry2
         if: steps.retry1.outcome=='failure'
         env:
-          PERCY_ONLY_FAILED_SPECS: '1'
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (3/4)
         continue-on-error: true
         id: retry3
         if: steps.retry2.outcome=='failure'
         env:
-          PERCY_ONLY_FAILED_SPECS: '1'
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (4/4)
         id: retry4
         if: steps.retry3.outcome=='failure'
         env:
-          PERCY_ONLY_FAILED_SPECS: '1'
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       # Keep "green via retry" honest: surface a warning whenever the first
       # attempt failed and a retry recovered it, so flakiness stays visible.
diff --git a/.gitignore b/.gitignore
index db52aa81b..a050b3ede 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,4 +20,4 @@ CLAUDE.md
 # bstack-ai-harness:end
 
 # spec-level retry bookkeeping (CI only, see PER-9011)
-.percy-node-failures.json
+.cli-test-failures.json
diff --git a/scripts/test.js b/scripts/test.js
index fe8c4a683..dc635153d 100644
--- a/scripts/test.js
+++ b/scripts/test.js
@@ -127,11 +127,13 @@ async function main({
 
     // Spec-level retry support for flaky environments (notably Windows CI,
     // where ~1 spec out of 1000+ flakes per run on browser/server timing).
-    // When PERCY_NODE_FAILURES_FILE is set we record every failed spec name;
-    // a follow-up run with PERCY_ONLY_FAILED_SPECS=1 re-runs only those specs,
+    // When CLI_TEST_FAILURES_FILE is set we record every failed spec name;
+    // a follow-up run with CLI_TEST_ONLY_FAILED=1 re-runs only those specs,
     // turning a ~60-min full-suite retry into a few seconds. See PER-9011.
-    let failuresFile = process.env.PERCY_NODE_FAILURES_FILE;
-    let onlyFailed = process.env.PERCY_ONLY_FAILED_SPECS === '1';
+    // NB: these env vars are intentionally NOT prefixed with PERCY_ so they do
+    // not leak into env-audit / Percy-env detection tests (cli-doctor).
+    let failuresFile = process.env.CLI_TEST_FAILURES_FILE;
+    let onlyFailed = process.env.CLI_TEST_ONLY_FAILED === '1';
     let priorFailures = [];
 
     if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) {

From 9c81399d1c8414d67c320c33b4081fefde0d6f2b Mon Sep 17 00:00:00 2001
From: Akash Sinha <akash.si@browserstack.com>
Date: Mon, 8 Jun 2026 12:46:26 +0530
Subject: [PATCH 3/3] feat(ci): extend spec-level retry to the Linux test
 workflow (PER-9011)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per review feedback: the spec-level retry was only wired into windows.yml, but
the runner support in scripts/test.js is platform-agnostic. Wire it into
test.yml (Linux) too — Linux flakes on the same timing-sensitive specs and has
no retry today.

Linux runs test:coverage with a 100% coverage gate (.nycrc), which needs care:
- retry0 runs `test:coverage` (full suite + coverage gate, records failed specs)
- retries 1-4 run plain `test` on ONLY the failed specs, WITHOUT coverage
  (a subset can't hit the 100% threshold)
- if retry0 fails with no recorded spec failures (a real coverage drop or a
  crash, not a flaky spec), the runner now preserves the failure instead of
  masking it

Verified locally: empty-failures retry -> exit 1 (preserved); real-failure
retry -> runs only that spec, exit 0; the coverage path writes the failures
file that retries consume.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test.yml | 45 ++++++++++++++++++++++++++++++++++++++
 scripts/test.js            | 14 +++++++++---
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 951ca2ea1..3045d61cc 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -59,6 +59,11 @@ jobs:
           - '@percy/monitoring'
           - '@percy/cli-doctor'
     runs-on: ${{ matrix.os }}
+    # Collect failed node-test spec names so retries re-run only the specs that
+    # flaked instead of the whole suite. Non-PERCY_ name so it doesn't trip
+    # cli-doctor's env-audit tests. See PER-9011.
+    env:
+      CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json
     steps:
       - uses: actions/checkout@v5
       - uses: actions/setup-node@v3
@@ -87,8 +92,48 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y --fix-missing libgbm-dev
         if: ${{ matrix.os == 'ubuntu-latest' }}
+      # First attempt runs the full suite WITH coverage (enforces the 100%
+      # gate) and records any failed specs.
       - name: Run tests
+        continue-on-error: true
+        id: retry0
         run: yarn workspace ${{ matrix.package }} test:coverage --colors
+      # Retries re-run ONLY the specs that failed in the previous attempt, and
+      # WITHOUT coverage (a subset can't hit the 100% threshold). If retry0
+      # failed with no recorded spec failures (e.g. a real coverage drop), the
+      # runner preserves that failure instead of masking it. See PER-9011.
+      - name: Run tests Retry (1/4)
+        continue-on-error: true
+        id: retry1
+        if: steps.retry0.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      - name: Run tests Retry (2/4)
+        continue-on-error: true
+        id: retry2
+        if: steps.retry1.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      - name: Run tests Retry (3/4)
+        continue-on-error: true
+        id: retry3
+        if: steps.retry2.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      - name: Run tests Retry (4/4)
+        id: retry4
+        if: steps.retry3.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      # Keep "green via retry" honest: surface a warning whenever the first
+      # attempt failed and a retry recovered it, so flakiness stays visible.
+      - name: Flag flaky tests
+        if: steps.retry0.outcome=='failure'
+        run: echo "::warning title=Flaky tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)."
 
   regression:
     name: Regression
diff --git a/scripts/test.js b/scripts/test.js
index dc635153d..64cde2f8a 100644
--- a/scripts/test.js
+++ b/scripts/test.js
@@ -137,10 +137,18 @@ async function main({
     let priorFailures = [];
 
     if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) {
-      try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* run all on unreadable file */ }
+      try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* treated as no recorded failures below */ }
     }
 
-    if (onlyFailed && priorFailures.length) {
+    // A retry was requested but the previous run recorded no failed specs. That
+    // means the failure was not a flaky spec (e.g. a coverage-threshold failure
+    // or a crash) -- re-running a subset would mask it, so preserve the failure.
+    if (onlyFailed && !priorFailures.length) {
+      console.log(colors.yellow('No recorded spec failures to retry — preserving the previous failure.'));
+      process.exit(1);
+    }
+
+    if (onlyFailed) {
       let retrySet = new Set(priorFailures);
       jasmine.env.configure({ specFilter: spec => retrySet.has(spec.getFullName()) });
       console.log(colors.yellow(`Re-running ${priorFailures.length} previously-failed spec(s)...\n`));
@@ -169,7 +177,7 @@ async function main({
     // "incomplete" (the rest are intentionally skipped) — so success here means
     // the targeted specs passed, i.e. no new failures. A normal full run keeps
     // jasmine's strict status (incomplete/failed both count as failure).
-    let passed = onlyFailed && priorFailures.length
+    let passed = onlyFailed
       ? failures.length === 0
       : (result ? result.overallStatus === 'passed' : failures.length === 0);