diff --git a/PolyPilot.Tests/ChatExperienceSafetyTests.cs b/PolyPilot.Tests/ChatExperienceSafetyTests.cs
index b103f8489e..3a6d70c4a4 100644
--- a/PolyPilot.Tests/ChatExperienceSafetyTests.cs
+++ b/PolyPilot.Tests/ChatExperienceSafetyTests.cs
@@ -1219,6 +1219,99 @@ public void LazyResumeFallback_AttachesEventHandler()
// F. Race Condition & Edge Case Tests
// =========================================================================
+ ///
+ /// Simulates the sibling reconnect state transfer from CopilotService.cs:
+ /// old state with IsProcessing=true and flushed content → new state must
+ /// carry forward FlushedResponse, preserve IsProcessing, and have a fresh TCS.
+ /// Exercises the same field assignments the reconnect code performs.
+ /// Verifies PR #600 / issue #599.
+ ///
+ [Fact]
+ public async Task SiblingReconnect_PreservesProcessingState_OnNewState()
+ {
+ var svc = CreateService();
+ await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo });
+ var session = await svc.CreateSessionAsync("reconnect-preserve");
+
+ // --- Set up OLD state as if it was actively processing with flushed content ---
+ session.IsProcessing = true;
+ session.IsResumed = true;
+ session.ProcessingPhase = 3;
+ session.ProcessingStartedAt = DateTime.UtcNow.AddSeconds(-30);
+
+ var oldState = GetSessionState(svc, "reconnect-preserve");
+ SetField(oldState, "HasUsedToolsThisTurn", true);
+ SetResponseCompletion(oldState, new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously));
+ var oldFlushed = GetFlushedResponse(oldState);
+ oldFlushed.Append("partial tool output from before reconnect");
+
+ // --- Capture state (mirrors CopilotService.cs line 3699-3700) ---
+ var siblingWasProcessing = session.IsProcessing;
+ var siblingProcessingPhase = session.ProcessingPhase;
+ Assert.True(siblingWasProcessing, "siblingWasProcessing must be true for this test");
+
+ // --- Simulate old TCS cancellation (mirrors line 3762) ---
+ var oldTcs = GetResponseCompletion(oldState)!;
+ oldTcs.TrySetCanceled();
+ Assert.True(oldTcs.Task.IsCanceled, "Old TCS must be canceled on reconnect");
+
+ // --- Simulate FlushedResponse carry-forward (mirrors line 3775-3776) ---
+ // In real code: siblingState.FlushedResponse.Append(capturedOtherState.FlushedResponse)
+ var carriedContent = oldFlushed.ToString();
+ Assert.Equal("partial tool output from before reconnect", carriedContent);
+
+ // --- Simulate state restoration on new state (mirrors lines 3793-3800) ---
+ // In real code these are set on siblingState which shares Info with old state
+ if (siblingWasProcessing)
+ {
+ session.IsProcessing = true;
+ session.IsResumed = true;
+ session.ProcessingPhase = siblingProcessingPhase > 0 ? siblingProcessingPhase : 3;
+ }
+
+ // --- Verify the full state contract ---
+ Assert.True(session.IsProcessing, "IsProcessing must be preserved through reconnect");
+ Assert.True(session.IsResumed, "IsResumed must be set for watchdog tier");
+ Assert.Equal(3, session.ProcessingPhase);
+ Assert.NotNull(session.ProcessingStartedAt);
+ Assert.True(GetField(oldState, "HasUsedToolsThisTurn"),
+ "HasUsedToolsThisTurn must stay true for 600s watchdog tier");
+ Assert.True(carriedContent.Length > 0,
+ "FlushedResponse must be non-empty for carry-forward");
+ }
+
+ ///
+ /// The resurrection guard must prevent re-setting IsProcessing=true after
+ /// CompleteResponse has already cleared it. Without this guard, a session
+ /// that completed during reconnect would show stuck "Thinking..." for 600s.
+ /// Verifies the fix from PR #600 round 3 (commit de8c4d37).
+ ///
+ [Fact]
+ public async Task SiblingReconnect_ResurrectionGuard_PreventsStuckSession()
+ {
+ var svc = CreateService();
+ await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo });
+ var session = await svc.CreateSessionAsync("resurrection-guard");
+
+ var state = GetSessionState(svc, "resurrection-guard");
+
+ // Simulate: session was processing, but CompleteResponse ran during
+ // the reconnect window and cleared IsProcessing to false.
+ session.IsProcessing = false;
+ session.IsResumed = false;
+ session.ProcessingPhase = 0;
+ SetField(state, "HasUsedToolsThisTurn", false);
+
+ // The InvokeOnUI lambda checks: if (!siblingState.Info.IsProcessing) return;
+ // This prevents resurrection. Verify the guard condition holds:
+ var siblingWasProcessing = true; // was true when captured BEFORE CompleteResponse ran
+ var shouldRestore = siblingWasProcessing && session.IsProcessing;
+ Assert.False(shouldRestore,
+ "Resurrection guard must prevent restoring IsProcessing when CompleteResponse already ran");
+ Assert.False(session.IsProcessing,
+ "IsProcessing must stay false — CompleteResponse completed the turn");
+ }
+
///
/// Sequential sends don't leave ghost processing state.
/// Each send must complete fully before the next can proceed.
diff --git a/PolyPilot.Tests/ConnectionRecoveryTests.cs b/PolyPilot.Tests/ConnectionRecoveryTests.cs
index fe31e5e99e..aaa4bc7393 100644
--- a/PolyPilot.Tests/ConnectionRecoveryTests.cs
+++ b/PolyPilot.Tests/ConnectionRecoveryTests.cs
@@ -280,10 +280,6 @@ public void SendPromptAsync_SiblingReconnect_SkipsProviderAndVirtualSessions()
Assert.True(skipIndex > loopIndex,
"Sibling reconnect loop must skip provider/virtual sessions before attempting ResumeSessionAsync");
- var forceCompleteIndex = source.IndexOf("ForceCompleteProcessingAsync(kvp.Key", loopIndex, StringComparison.Ordinal);
- Assert.True(forceCompleteIndex > skipIndex,
- "Provider-session skip must appear before ForceCompleteProcessingAsync in the sibling reconnect path");
-
var resumeIndex = source.IndexOf("ResumeSessionAsync(", loopIndex, StringComparison.Ordinal);
Assert.True(resumeIndex > skipIndex,
"Provider-session skip must appear before ResumeSessionAsync in the sibling reconnect path");
diff --git a/PolyPilot.Tests/DiffParserTests.cs b/PolyPilot.Tests/DiffParserTests.cs
index 0a9d08301d..a9546f8d98 100644
--- a/PolyPilot.Tests/DiffParserTests.cs
+++ b/PolyPilot.Tests/DiffParserTests.cs
@@ -1655,7 +1655,7 @@ public void PathTraversal_SiblingDirectory_IsBlocked()
// Verify that a sibling directory like "projectEvil" doesn't pass
// StartsWith("C:\project") without trailing separator
var workDir = Path.Combine(Path.GetTempPath(), "testproject");
- var siblingPath = "..\\testprojectEvil\\secret.txt";
+ var siblingPath = Path.Combine("..", "testprojectEvil", "secret.txt");
var filePath = Path.GetFullPath(Path.Combine(workDir, siblingPath));
var normalizedWorkDir = Path.GetFullPath(workDir).TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar)
@@ -1668,7 +1668,7 @@ public void PathTraversal_SiblingDirectory_IsBlocked()
public void PathTraversal_ValidSubpath_IsAllowed()
{
var workDir = Path.Combine(Path.GetTempPath(), "testproject");
- var validPath = "src\\Models\\User.cs";
+ var validPath = Path.Combine("src", "Models", "User.cs");
var filePath = Path.GetFullPath(Path.Combine(workDir, validPath));
var normalizedWorkDir = Path.GetFullPath(workDir).TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar)
@@ -1681,7 +1681,7 @@ public void PathTraversal_ValidSubpath_IsAllowed()
public void PathTraversal_DotDotEscape_IsBlocked()
{
var workDir = Path.Combine(Path.GetTempPath(), "testproject");
- var escapePath = "..\\..\\etc\\passwd";
+ var escapePath = Path.Combine("..", "..", "etc", "passwd");
var filePath = Path.GetFullPath(Path.Combine(workDir, escapePath));
var normalizedWorkDir = Path.GetFullPath(workDir).TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar)
diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs
index 7bc190597d..9186a8d39d 100644
--- a/PolyPilot.Tests/MultiAgentRegressionTests.cs
+++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs
@@ -2057,13 +2057,14 @@ public async Task CancellationToken_PropagatedToWorkerTasks()
}
///
- /// INV-O14: The re-resume loop must NOT skip IsProcessing siblings. Their
- /// CopilotSession is tied to the old client (which was disposed), so the event
- /// stream is permanently dead. The loop must force-complete them so the orchestrator
- /// retries immediately rather than waiting 2–5 min for the watchdog.
+ /// The sibling re-resume loop must preserve IsProcessing through reconnect instead of
+ /// force-completing. Force-completing discards in-flight responses — the CLI continues
+ /// working but PolyPilot EVT-REARM-SKIPs all subsequent events. Instead, the loop
+ /// captures siblingWasProcessing, re-resumes on the new client, then restores
+ /// IsProcessing on the UI thread with guards against resurrection of completed turns.
///
[Fact]
- public void ReconnectLoop_IsProcessingSiblings_ForceCompletedNotSkipped()
+ public void ReconnectLoop_IsProcessingSiblings_PreservedNotForceCompleted()
{
var source = File.ReadAllText(Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
@@ -2076,12 +2077,23 @@ public void ReconnectLoop_IsProcessingSiblings_ForceCompletedNotSkipped()
Assert.True(blockEnd > taskRunIdx, "Catch block must follow the re-resume loop");
var loopBlock = source.Substring(taskRunIdx, blockEnd - taskRunIdx);
- // INV-O14: must NOT use bare 'continue' on IsProcessing — this was the bug
+ // Must NOT use bare 'continue' on IsProcessing — that was the original bug
Assert.DoesNotContain("if (otherState.Info.IsProcessing) continue;", loopBlock);
- // INV-O14: must call ForceCompleteProcessingAsync for IsProcessing siblings
- Assert.Contains("ForceCompleteProcessingAsync", loopBlock);
- Assert.Contains("client-recreated-dead-event-stream", loopBlock);
+ // Must NOT force-complete processing siblings — that kills in-flight responses (PR #599)
+ Assert.DoesNotContain("ForceCompleteProcessingAsync", loopBlock);
+
+ // Must capture siblingWasProcessing to preserve state through re-resume
+ Assert.Contains("siblingWasProcessing", loopBlock);
+
+ // Must start watchdog after re-resume for processing siblings
+ Assert.Contains("StartProcessingWatchdog(siblingState", loopBlock);
+
+ // The InvokeOnUI callback must have all 3 guards to prevent resurrection of
+ // completed turns (took 3 review rounds to get right):
+ Assert.Contains("if (siblingState.IsOrphaned) return;", loopBlock);
+ Assert.Contains("ProcessingGeneration) != reconnectGen) return;", loopBlock);
+ Assert.Contains("if (!siblingState.Info.IsProcessing) return;", loopBlock);
}
#endregion
diff --git a/PolyPilot.Tests/SessionStabilityTests.cs b/PolyPilot.Tests/SessionStabilityTests.cs
index 24b8c16097..d00ff48356 100644
--- a/PolyPilot.Tests/SessionStabilityTests.cs
+++ b/PolyPilot.Tests/SessionStabilityTests.cs
@@ -237,9 +237,9 @@ public void SiblingReResume_RegistersHandler_BeforePublishing()
// Handler must appear BEFORE TryUpdate (register before publishing)
var handlerIdx = sendMethod.IndexOf("HandleSessionEvent(siblingState", StringComparison.Ordinal);
- var tryUpdateIdx = sendMethod.IndexOf("TryUpdate", StringComparison.Ordinal);
+ var tryUpdateIdx = sendMethod.IndexOf("_sessions.TryUpdate(capturedKey, siblingState", StringComparison.Ordinal);
Assert.True(handlerIdx >= 0, "HandleSessionEvent(siblingState must be present in reconnect path");
- Assert.True(tryUpdateIdx >= 0, "TryUpdate must be present in reconnect path");
+ Assert.True(tryUpdateIdx >= 0, "_sessions.TryUpdate must be present in reconnect path");
Assert.True(handlerIdx < tryUpdateIdx,
"Handler registration must happen BEFORE TryUpdate (no window where events arrive with no handler)");
}
diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs
index 734807e4ae..ec5a8beb2c 100644
--- a/PolyPilot/Services/CopilotService.cs
+++ b/PolyPilot/Services/CopilotService.cs
@@ -3690,17 +3690,14 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
Debug($"[RECONNECT] Skipping non-SDK sibling '{kvp.Key}' during client recreation");
continue;
}
- // INV-O14: IsProcessing siblings have dead event streams —
- // their CopilotSession was tied to the old client which was
- // just disposed. Force-abort so the orchestrator retries
- // immediately instead of waiting 2–5 min for the watchdog.
- if (otherState.Info.IsProcessing)
- {
- Debug($"[RECONNECT] Sibling '{kvp.Key}' is IsProcessing with dead event stream — force-completing before re-resume");
- try { await ForceCompleteProcessingAsync(kvp.Key, otherState, "client-recreated-dead-event-stream"); }
- catch (Exception forceEx) { Debug($"[RECONNECT] Failed to force-complete sibling '{kvp.Key}': {forceEx.Message}"); }
- // Fall through to re-resume the session on the new client
- }
+ // Siblings that were actively processing have dead event streams
+ // (their CopilotSession was tied to the old disposed client).
+ // Instead of force-completing (which discards in-flight responses),
+ // preserve IsProcessing so the re-resumed session picks up
+ // events on the new connection. The watchdog handles genuinely
+ // dead sessions after re-resume.
+ var siblingWasProcessing = otherState.Info.IsProcessing;
+ var siblingProcessingPhase = otherState.Info.ProcessingPhase;
var otherMeta = sessionSnapshots.FirstOrDefault(m => m.SessionName == kvp.Key);
if (otherMeta?.GroupId != null &&
groupSnapshots.Any(g => g.Id == otherMeta.GroupId && g.IsCodespace))
@@ -3745,7 +3742,9 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
// Re-check after await — a concurrent SendPromptAsync
// may have started processing while we were resuming.
// Orphan the just-resumed session rather than cancel a live turn.
- if (capturedOtherState.Info.IsProcessing)
+ // BUT: if siblingWasProcessing is true, the session was already
+ // processing BEFORE the reconnect — that's expected, not concurrent.
+ if (capturedOtherState.Info.IsProcessing && !siblingWasProcessing)
{
Debug($"[RECONNECT] Sibling '{capturedKey}' started processing during re-resume — skipping");
try { await resumed.DisposeAsync(); } catch { }
@@ -3757,7 +3756,12 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
// instead of mutating otherState in place.
capturedOtherState.IsOrphaned = true;
Interlocked.Exchange(ref capturedOtherState.ProcessingGeneration, long.MaxValue);
- // Cancel old TCS so any awaiter (orchestrator worker) doesn't hang
+ // Cancel old TCS so orchestrator workers hit OperationCanceledException.
+ // The catch filter in SendPromptAndWaitAsync detects that the session
+ // was replaced (not truly cancelled), re-acquires the new state from
+ // _sessions, and seamlessly re-awaits the new TCS. TrySetResult("")
+ // was tried but triggers the revival path (empty response → orphan
+ // the healthy reconnected state → create a third session from scratch).
capturedOtherState.ResponseCompletion?.TrySetCanceled();
var siblingState = new SessionState
{
@@ -3765,6 +3769,11 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
Info = capturedOtherState.Info,
IsMultiAgentSession = capturedOtherState.IsMultiAgentSession,
};
+ // Carry forward flushed content from the old state so partial
+ // mid-turn responses (tool output, assistant text) aren't lost.
+ // Mirrors the primary reconnect path's FlushedResponse preservation.
+ if (capturedOtherState.FlushedResponse.Length > 0)
+ siblingState.FlushedResponse.Append(capturedOtherState.FlushedResponse);
// Mirror primary reconnect: reset tool tracking for new connection
siblingState.HasUsedToolsThisTurn = false;
ClearDeferredIdleTracking(siblingState);
@@ -3773,6 +3782,23 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
Interlocked.Exchange(ref siblingState.ToolHealthStaleChecks, 0);
Interlocked.Exchange(ref siblingState.EventCountThisTurn, 0);
Interlocked.Exchange(ref siblingState.TurnEndReceivedAtTicks, 0);
+ // If the sibling was actively processing before the reconnect,
+ // restore processing state BEFORE handler registration so events
+ // arriving on the new connection see IsProcessing=true immediately
+ // and don't trigger premature CompleteResponse or EVT-REARM-SKIP.
+ // These writes are safe despite Info being shared with the old state:
+ // they're idempotent (writing true to already-true fields since
+ // siblingWasProcessing was captured as true), and the old state is
+ // orphaned so its handlers bail out on the IsOrphaned guard.
+ if (siblingWasProcessing)
+ {
+ siblingState.Info.IsProcessing = true;
+ siblingState.Info.IsResumed = true;
+ siblingState.HasUsedToolsThisTurn = true; // 600s watchdog tier
+ siblingState.Info.ProcessingPhase = siblingProcessingPhase > 0 ? siblingProcessingPhase : 3;
+ siblingState.Info.ProcessingStartedAt ??= DateTime.UtcNow;
+ siblingState.ResponseCompletion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
+ }
// Register handler BEFORE publishing to dictionary —
// no window where events arrive with no handler.
resumed.On(evt => HandleSessionEvent(siblingState, evt));
@@ -3788,6 +3814,22 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
}
DisposePrematureIdleSignal(capturedOtherState);
Debug($"[RECONNECT] Re-resumed sibling session '{capturedKey}' after client recreation");
+ // Start watchdog on UI thread — IsProcessing and companion fields
+ // were already set before handler registration (above), so this
+ // just needs to start the timer and notify the UI.
+ if (siblingWasProcessing)
+ {
+ var reconnectGen = Interlocked.Read(ref siblingState.ProcessingGeneration);
+ InvokeOnUI(() =>
+ {
+ if (siblingState.IsOrphaned) return;
+ if (Interlocked.Read(ref siblingState.ProcessingGeneration) != reconnectGen) return;
+ if (!siblingState.Info.IsProcessing) return; // CompleteResponse already ran — don't resurrect
+ StartProcessingWatchdog(siblingState, capturedKey);
+ NotifyStateChangedCoalesced();
+ Debug($"[RECONNECT] Sibling '{capturedKey}' was processing — started watchdog on new connection");
+ });
+ }
}
catch (Exception reEx)
{
@@ -3801,6 +3843,20 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
Interlocked.Exchange(ref capturedOtherState.ProcessingGeneration, long.MaxValue);
// Unblock any orchestrator worker awaiting this session's TCS
capturedOtherState.ResponseCompletion?.TrySetCanceled();
+ // If the sibling was processing, clear IsProcessing so the UI
+ // doesn't show a stuck "Thinking..." on a dead connection.
+ if (siblingWasProcessing)
+ {
+ InvokeOnUI(() =>
+ {
+ if (capturedOtherState.Info.IsProcessing)
+ {
+ Debug($"[RECONNECT] Clearing stuck IsProcessing on failed re-resume of '{capturedKey}'");
+ ClearProcessingState(capturedOtherState);
+ NotifyStateChangedCoalesced();
+ }
+ });
+ }
}
}
finally