From a56fc3771a63d75b764a1d5504369fc4f7bff27f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 12 Jun 2026 14:38:33 +0000
Subject: [PATCH 1/2] Revert PR #237's GetConsoleOutputCP fix; cmd pipes use
 OEM, not console CP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eli's Windows test showed PR #237's review fix actively regressed
the bug it was meant to address.

  cmd console at CP437 (his `chcp` confirms)
  pasclaw.exe launched from Windows Terminal -> our console CP=65001
  GetConsoleOutputCP returns 65001 in DecodeShellOutputBytes
  MultiByteToWideChar(65001, [0x82 from dir's CP437 output], ...)
  -> 0x82 is invalid UTF-8 lead byte
  -> replaces with U+FFFD
  -> UTF8Encode yields 0xEF 0xBF 0xBD per replaced byte
  -> model sees `r�sum�.txt` instead of `résumé.txt`

Exactly the original mojibake, just routed through a different
codepage mismatch.

The Codex P2 review reasoning ("operator ran chcp 65001 first") is
wrong about how cmd.exe handles piped output. `chcp` switches the
CONSOLE's codepage, but cmd.exe's redirected stdout uses the OEM
codepage regardless of the console's chcp setting. Our spawned
cmd.exe children also don't inherit a parent shell's chcp at all
(they're new processes attached to the same console). So
GetConsoleOutputCP returns the wrong CP for decoding piped cmd
output -- it returns OUR console's CP, not the OEM CP that cmd
actually used for its piped output.

GetOEMCP returns the system default (CP437 on US English) which is
what cmd.exe's pipe output ACTUALLY uses. The "I want UTF-8 output"
case (where the operator genuinely wants cmd in CP65001 mode) can
still be handled by putting `chcp 65001 &&` inside the shell_exec
command, since that affects the spawned cmd.exe's own internal
codepage handling.

FPC make smoke + full make test (36 targets) green. The fix needs
manual Windows verification: rebuild, run `dir /b` of a directory
with résumé.txt, model should see résumé.txt clean.

https://claude.ai/code/session_01TBcLtmpj7dqA5tyFbGnQon
---
 src/pkg/platform/PasClaw.Platform.pas | 39 +++++++++++++++------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/pkg/platform/PasClaw.Platform.pas b/src/pkg/platform/PasClaw.Platform.pas
index 952eda8..565aa3a 100644
--- a/src/pkg/platform/PasClaw.Platform.pas
+++ b/src/pkg/platform/PasClaw.Platform.pas
@@ -203,23 +203,28 @@ function DecodeShellOutputBytes(const Bytes: TBytes;
     CP := Codepage
   else
   begin
-    { Prefer the ACTIVE console output codepage over the system OEM
-      default. cmd.exe writes its stdout in whichever codepage is
-      currently set on the console (the OEM default initially, but
-      operators can switch it -- `chcp 65001` puts the console in
-      UTF-8, and PowerShell sessions inherit the host process's
-      OutputEncoding). Pinning to GetOEMCP would silently re-mojibake
-      output in those environments by decoding UTF-8 bytes as if they
-      were CP437. GetConsoleOutputCP returns the active output CP for
-      the console attached to this process; it returns 0 when the
-      process isn't attached to a console (gateway / serve daemons
-      launched from a service manager, headless CI), in which case
-      we fall back to GetOEMCP -- a long-running headless daemon
-      doesn't have a "currently active" console CP to consult, but
-      its spawned cmd.exe children still default to the OEM CP.
-      Codex P2 on PR #237. }
-    CP := GetConsoleOutputCP;
-    if CP = 0 then CP := GetOEMCP;
+    { Use the OEM codepage, NOT GetConsoleOutputCP. Reverting PR #237's
+      Codex P2 fix because real-world testing (Eli on a Windows Terminal
+      / Windows 7 box) showed it regresses the common case. The
+      reviewer's reasoning was: "what if the operator ran chcp 65001
+      first?" But `chcp` in cmd.exe is well-documented to NOT reliably
+      switch piped output encoding -- cmd.exe's redirected stdout uses
+      the OEM codepage regardless of the console's chcp setting (and
+      our spawned cmd.exe children don't inherit a parent shell's
+      chcp at all). Meanwhile pasclaw.exe launched from Windows
+      Terminal (CP_UTF8 by default on Win10+) makes GetConsoleOutputCP
+      return 65001 -- so we then ran MultiByteToWideChar(65001, ...)
+      over real CP437 bytes (the 0x82 byte for é). 65001 is strict
+      UTF-8: 0x82 is an invalid lead byte, gets replaced with U+FFFD,
+      and the model sees `r�sum�.txt` instead of `résumé.txt`.
+      Exactly the original bug, just routed through a different
+      mojibake. GetOEMCP returns the system default (CP437 on US
+      English Windows), which is what cmd.exe's piped output actually
+      uses. The chcp 65001 case can still be handled by the operator
+      running it inside their shell_exec command -- `cmd /c "chcp
+      65001 && dir /b > out.utf8 && type out.utf8"` -- but the
+      default decode path must match cmd.exe's pipe default. }
+    CP := GetOEMCP;
   end;
   { Pass 1: discover the wide-char buffer size we need. }
   WideLen := MultiByteToWideChar(CP, 0, PAnsiChar(@Bytes[0]), Len, nil, 0);

From 0a25432030026e38972ce459d80332d2548d213e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 12 Jun 2026 14:53:38 +0000
Subject: [PATCH 2/2] PR #239 review fix: auto-detect UTF-8 vs OEM for shell
 output decode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex P2, legit. The straight revert to GetOEMCP fixed cmd.exe
pipe output (Eli's original bug) but regressed the pwsh case:
PowerShell 6+ defaults to UTF-8 stdout, so a script like
Write-Output 'résumé' from execute_code's pwsh branch produces
UTF-8 bytes that GetOEMCP would now decode as CP437 -> mojibake.

Auto-detection heuristic: try strict UTF-8 first
(MB_ERR_INVALID_CHARS), fall back to OEM only when validation
fails.

  Pure ASCII (most output)            -> valid UTF-8 (no-op)
  pwsh UTF-8 output (résumé etc.)     -> valid UTF-8, passes through
  cmd.exe CP437 non-ASCII (0x82 etc.) -> invalid UTF-8 lead byte,
                                         falls back to GetOEMCP=437

Robustness:
  - 0x82 from CP437 has binary 10000010, a UTF-8 continuation
    marker, NOT a valid lead byte -> reliably caught by
    MB_ERR_INVALID_CHARS
  - Multi-byte CP437 sequences for filenames similarly cluster
    bytes that don't form valid UTF-8 sequences
  - Edge case (OEM bytes that coincidentally form a valid UTF-8
    sequence) is vanishingly unlikely for filename / dir output

New test TestAutoDetectPrefersUTF8ForValidSequences pins the pwsh
case: 8 UTF-8 bytes for "résumé" pass through verbatim when
Codepage=0. The earlier CP437 tests (TestCP437SingleNonAsciiByte
etc.) continue exercising the OEM-fallback path with explicit
Codepage=437.

FPC make smoke + full make test (36 targets) green.

https://claude.ai/code/session_01TBcLtmpj7dqA5tyFbGnQon
---
 src/pkg/platform/PasClaw.Platform.pas   | 59 ++++++++++++++++---------
 src/tests/shell_output_decode_tests.pas | 29 ++++++++++++
 2 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/src/pkg/platform/PasClaw.Platform.pas b/src/pkg/platform/PasClaw.Platform.pas
index 565aa3a..16d3575 100644
--- a/src/pkg/platform/PasClaw.Platform.pas
+++ b/src/pkg/platform/PasClaw.Platform.pas
@@ -183,6 +183,11 @@ implementation
 function DecodeShellOutputBytes(const Bytes: TBytes;
                                 ByteCount: Integer;
                                 Codepage: UInt32): string;
+{$IFDEF MSWINDOWS}
+const
+  CP_UTF8_LOCAL              = 65001;
+  MB_ERR_INVALID_CHARS_LOCAL = $00000008;
+{$ENDIF}
 var
   Len: Integer;
   {$IFDEF MSWINDOWS}
@@ -203,28 +208,38 @@ function DecodeShellOutputBytes(const Bytes: TBytes;
     CP := Codepage
   else
   begin
-    { Use the OEM codepage, NOT GetConsoleOutputCP. Reverting PR #237's
-      Codex P2 fix because real-world testing (Eli on a Windows Terminal
-      / Windows 7 box) showed it regresses the common case. The
-      reviewer's reasoning was: "what if the operator ran chcp 65001
-      first?" But `chcp` in cmd.exe is well-documented to NOT reliably
-      switch piped output encoding -- cmd.exe's redirected stdout uses
-      the OEM codepage regardless of the console's chcp setting (and
-      our spawned cmd.exe children don't inherit a parent shell's
-      chcp at all). Meanwhile pasclaw.exe launched from Windows
-      Terminal (CP_UTF8 by default on Win10+) makes GetConsoleOutputCP
-      return 65001 -- so we then ran MultiByteToWideChar(65001, ...)
-      over real CP437 bytes (the 0x82 byte for é). 65001 is strict
-      UTF-8: 0x82 is an invalid lead byte, gets replaced with U+FFFD,
-      and the model sees `r�sum�.txt` instead of `résumé.txt`.
-      Exactly the original bug, just routed through a different
-      mojibake. GetOEMCP returns the system default (CP437 on US
-      English Windows), which is what cmd.exe's piped output actually
-      uses. The chcp 65001 case can still be handled by the operator
-      running it inside their shell_exec command -- `cmd /c "chcp
-      65001 && dir /b > out.utf8 && type out.utf8"` -- but the
-      default decode path must match cmd.exe's pipe default. }
-    CP := GetOEMCP;
+    { Auto-detect between UTF-8 and OEM. PR #237's first attempt
+      pinned GetConsoleOutputCP (wrong: returns OUR console's CP,
+      not the spawned cmd.exe's piped-output CP). The revert to
+      unconditional GetOEMCP was right for cmd.exe (which pipes
+      OEM regardless of chcp) but wrong for pwsh -- PowerShell 6+
+      defaults to UTF-8 stdout, so a `Write-Output 'résumé'` from
+      execute_code's pwsh branch produces UTF-8 bytes that GetOEMCP
+      would mojibake as CP437. Codex P2 on PR #239.
+
+      Heuristic: try strict UTF-8 (MB_ERR_INVALID_CHARS) first.
+      Valid UTF-8 -> use it (pwsh / chcp 65001 / Linux on Wine).
+      Invalid sequence anywhere -> fall back to OEM (cmd.exe's
+      piped output).
+
+      This is robust because:
+        - Pure ASCII (most output) is valid UTF-8 -> taken either way.
+        - cmd's CP437 non-ASCII bytes (0x80-0xFF) are typically
+          invalid UTF-8 lead bytes -- e.g. 0x82 (é in CP437) has
+          binary 10000010 which is a UTF-8 continuation marker, not
+          a lead byte, so it fails MB_ERR_INVALID_CHARS.
+        - pwsh UTF-8 output (multi-byte sequences for é/résumé/etc.)
+          parses cleanly.
+
+      Edge case: OEM bytes that happen to coincide with a valid
+      UTF-8 sequence (e.g. exactly a 2-byte CP437 pair that maps
+      to a real Unicode codepoint via UTF-8 decoding). This is
+      vanishingly unlikely for filename / dir output. }
+    if MultiByteToWideChar(CP_UTF8_LOCAL, MB_ERR_INVALID_CHARS_LOCAL,
+                            PAnsiChar(@Bytes[0]), Len, nil, 0) > 0 then
+      CP := CP_UTF8_LOCAL
+    else
+      CP := GetOEMCP;
   end;
   { Pass 1: discover the wide-char buffer size we need. }
   WideLen := MultiByteToWideChar(CP, 0, PAnsiChar(@Bytes[0]), Len, nil, 0);
diff --git a/src/tests/shell_output_decode_tests.pas b/src/tests/shell_output_decode_tests.pas
index d0cb6c6..c975981 100644
--- a/src/tests/shell_output_decode_tests.pas
+++ b/src/tests/shell_output_decode_tests.pas
@@ -241,6 +241,33 @@ procedure TestUTF8InputThroughExplicitCP65001;
               'UTF-8 input round-trips through codepage 65001');
 end;
 
+procedure TestAutoDetectPrefersUTF8ForValidSequences;
+(* Codex P2 on PR #239: PowerShell 6+ (pwsh) defaults to UTF-8 stdout,
+   so when execute_code or shell_exec captures pwsh output the bytes
+   are valid UTF-8 sequences -- decoding via GetOEMCP would mojibake
+   them. With Codepage = 0 the helper should detect "this is valid
+   UTF-8" and pass through verbatim, NOT route through CP437. POSIX
+   side: Codepage=0 already goes through TEncoding.UTF8.GetString
+   so the same input is handled the same way on Linux CI. *)
+var
+  B: TBytes;
+  Got: string;
+begin
+  { "résumé" as UTF-8: r(0x72), é(0xC3 0xA9), s(0x73), u(0x75),
+    m(0x6D), é(0xC3 0xA9) -- 8 bytes total. }
+  SetLength(B, 8);
+  B[0] := $72;
+  B[1] := $C3; B[2] := $A9;
+  B[3] := $73;
+  B[4] := $75;
+  B[5] := $6D;
+  B[6] := $C3; B[7] := $A9;
+  Got := DecodeShellOutputBytes(B);   { Codepage = 0 -> auto-detect }
+  AssertEqStr(Got, 'résumé',
+              'auto-detect: valid UTF-8 input passes through verbatim ' +
+              '(would be mojibake "rA©sumA©" or similar if CP437 was forced)');
+end;
+
 begin
   TestEmptyInputEmptyOutput;
   WriteLn('  ok: empty input -> empty output');
@@ -262,5 +289,7 @@ procedure TestUTF8InputThroughExplicitCP65001;
   WriteLn('  ok: CP437 0xC4 -> ─ (3-byte UTF-8)');
   TestUTF8InputThroughExplicitCP65001;
   WriteLn('  ok: codepage 65001 = pass-through UTF-8');
+  TestAutoDetectPrefersUTF8ForValidSequences;
+  WriteLn('  ok: auto-detect picks UTF-8 for valid UTF-8 input (pwsh case)');
   WriteLn('PASS');
 end.