diff --git a/PolyPilot.Tests/ChatExperienceSafetyTests.cs b/PolyPilot.Tests/ChatExperienceSafetyTests.cs index b103f8489e..3a6d70c4a4 100644 --- a/PolyPilot.Tests/ChatExperienceSafetyTests.cs +++ b/PolyPilot.Tests/ChatExperienceSafetyTests.cs @@ -1219,6 +1219,99 @@ public void LazyResumeFallback_AttachesEventHandler() // F. Race Condition & Edge Case Tests // ========================================================================= + /// + /// Simulates the sibling reconnect state transfer from CopilotService.cs: + /// old state with IsProcessing=true and flushed content → new state must + /// carry forward FlushedResponse, preserve IsProcessing, and have a fresh TCS. + /// Exercises the same field assignments the reconnect code performs. + /// Verifies PR #600 / issue #599. + /// + [Fact] + public async Task SiblingReconnect_PreservesProcessingState_OnNewState() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("reconnect-preserve"); + + // --- Set up OLD state as if it was actively processing with flushed content --- + session.IsProcessing = true; + session.IsResumed = true; + session.ProcessingPhase = 3; + session.ProcessingStartedAt = DateTime.UtcNow.AddSeconds(-30); + + var oldState = GetSessionState(svc, "reconnect-preserve"); + SetField(oldState, "HasUsedToolsThisTurn", true); + SetResponseCompletion(oldState, new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously)); + var oldFlushed = GetFlushedResponse(oldState); + oldFlushed.Append("partial tool output from before reconnect"); + + // --- Capture state (mirrors CopilotService.cs line 3699-3700) --- + var siblingWasProcessing = session.IsProcessing; + var siblingProcessingPhase = session.ProcessingPhase; + Assert.True(siblingWasProcessing, "siblingWasProcessing must be true for this test"); + + // --- Simulate old TCS cancellation (mirrors line 3762) --- + var oldTcs = GetResponseCompletion(oldState)!; + oldTcs.TrySetCanceled(); + Assert.True(oldTcs.Task.IsCanceled, "Old TCS must be canceled on reconnect"); + + // --- Simulate FlushedResponse carry-forward (mirrors line 3775-3776) --- + // In real code: siblingState.FlushedResponse.Append(capturedOtherState.FlushedResponse) + var carriedContent = oldFlushed.ToString(); + Assert.Equal("partial tool output from before reconnect", carriedContent); + + // --- Simulate state restoration on new state (mirrors lines 3793-3800) --- + // In real code these are set on siblingState which shares Info with old state + if (siblingWasProcessing) + { + session.IsProcessing = true; + session.IsResumed = true; + session.ProcessingPhase = siblingProcessingPhase > 0 ? siblingProcessingPhase : 3; + } + + // --- Verify the full state contract --- + Assert.True(session.IsProcessing, "IsProcessing must be preserved through reconnect"); + Assert.True(session.IsResumed, "IsResumed must be set for watchdog tier"); + Assert.Equal(3, session.ProcessingPhase); + Assert.NotNull(session.ProcessingStartedAt); + Assert.True(GetField(oldState, "HasUsedToolsThisTurn"), + "HasUsedToolsThisTurn must stay true for 600s watchdog tier"); + Assert.True(carriedContent.Length > 0, + "FlushedResponse must be non-empty for carry-forward"); + } + + /// + /// The resurrection guard must prevent re-setting IsProcessing=true after + /// CompleteResponse has already cleared it. Without this guard, a session + /// that completed during reconnect would show stuck "Thinking..." for 600s. + /// Verifies the fix from PR #600 round 3 (commit de8c4d37). + /// + [Fact] + public async Task SiblingReconnect_ResurrectionGuard_PreventsStuckSession() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("resurrection-guard"); + + var state = GetSessionState(svc, "resurrection-guard"); + + // Simulate: session was processing, but CompleteResponse ran during + // the reconnect window and cleared IsProcessing to false. + session.IsProcessing = false; + session.IsResumed = false; + session.ProcessingPhase = 0; + SetField(state, "HasUsedToolsThisTurn", false); + + // The InvokeOnUI lambda checks: if (!siblingState.Info.IsProcessing) return; + // This prevents resurrection. Verify the guard condition holds: + var siblingWasProcessing = true; // was true when captured BEFORE CompleteResponse ran + var shouldRestore = siblingWasProcessing && session.IsProcessing; + Assert.False(shouldRestore, + "Resurrection guard must prevent restoring IsProcessing when CompleteResponse already ran"); + Assert.False(session.IsProcessing, + "IsProcessing must stay false — CompleteResponse completed the turn"); + } + /// /// Sequential sends don't leave ghost processing state. /// Each send must complete fully before the next can proceed. diff --git a/PolyPilot.Tests/ConnectionRecoveryTests.cs b/PolyPilot.Tests/ConnectionRecoveryTests.cs index fe31e5e99e..aaa4bc7393 100644 --- a/PolyPilot.Tests/ConnectionRecoveryTests.cs +++ b/PolyPilot.Tests/ConnectionRecoveryTests.cs @@ -280,10 +280,6 @@ public void SendPromptAsync_SiblingReconnect_SkipsProviderAndVirtualSessions() Assert.True(skipIndex > loopIndex, "Sibling reconnect loop must skip provider/virtual sessions before attempting ResumeSessionAsync"); - var forceCompleteIndex = source.IndexOf("ForceCompleteProcessingAsync(kvp.Key", loopIndex, StringComparison.Ordinal); - Assert.True(forceCompleteIndex > skipIndex, - "Provider-session skip must appear before ForceCompleteProcessingAsync in the sibling reconnect path"); - var resumeIndex = source.IndexOf("ResumeSessionAsync(", loopIndex, StringComparison.Ordinal); Assert.True(resumeIndex > skipIndex, "Provider-session skip must appear before ResumeSessionAsync in the sibling reconnect path"); diff --git a/PolyPilot.Tests/DiffParserTests.cs b/PolyPilot.Tests/DiffParserTests.cs index 0a9d08301d..a9546f8d98 100644 --- a/PolyPilot.Tests/DiffParserTests.cs +++ b/PolyPilot.Tests/DiffParserTests.cs @@ -1655,7 +1655,7 @@ public void PathTraversal_SiblingDirectory_IsBlocked() // Verify that a sibling directory like "projectEvil" doesn't pass // StartsWith("C:\project") without trailing separator var workDir = Path.Combine(Path.GetTempPath(), "testproject"); - var siblingPath = "..\\testprojectEvil\\secret.txt"; + var siblingPath = Path.Combine("..", "testprojectEvil", "secret.txt"); var filePath = Path.GetFullPath(Path.Combine(workDir, siblingPath)); var normalizedWorkDir = Path.GetFullPath(workDir).TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar) @@ -1668,7 +1668,7 @@ public void PathTraversal_SiblingDirectory_IsBlocked() public void PathTraversal_ValidSubpath_IsAllowed() { var workDir = Path.Combine(Path.GetTempPath(), "testproject"); - var validPath = "src\\Models\\User.cs"; + var validPath = Path.Combine("src", "Models", "User.cs"); var filePath = Path.GetFullPath(Path.Combine(workDir, validPath)); var normalizedWorkDir = Path.GetFullPath(workDir).TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar) @@ -1681,7 +1681,7 @@ public void PathTraversal_ValidSubpath_IsAllowed() public void PathTraversal_DotDotEscape_IsBlocked() { var workDir = Path.Combine(Path.GetTempPath(), "testproject"); - var escapePath = "..\\..\\etc\\passwd"; + var escapePath = Path.Combine("..", "..", "etc", "passwd"); var filePath = Path.GetFullPath(Path.Combine(workDir, escapePath)); var normalizedWorkDir = Path.GetFullPath(workDir).TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar) diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs index 7bc190597d..9186a8d39d 100644 --- a/PolyPilot.Tests/MultiAgentRegressionTests.cs +++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs @@ -2057,13 +2057,14 @@ public async Task CancellationToken_PropagatedToWorkerTasks() } /// - /// INV-O14: The re-resume loop must NOT skip IsProcessing siblings. Their - /// CopilotSession is tied to the old client (which was disposed), so the event - /// stream is permanently dead. The loop must force-complete them so the orchestrator - /// retries immediately rather than waiting 2–5 min for the watchdog. + /// The sibling re-resume loop must preserve IsProcessing through reconnect instead of + /// force-completing. Force-completing discards in-flight responses — the CLI continues + /// working but PolyPilot EVT-REARM-SKIPs all subsequent events. Instead, the loop + /// captures siblingWasProcessing, re-resumes on the new client, then restores + /// IsProcessing on the UI thread with guards against resurrection of completed turns. /// [Fact] - public void ReconnectLoop_IsProcessingSiblings_ForceCompletedNotSkipped() + public void ReconnectLoop_IsProcessingSiblings_PreservedNotForceCompleted() { var source = File.ReadAllText(Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); @@ -2076,12 +2077,23 @@ public void ReconnectLoop_IsProcessingSiblings_ForceCompletedNotSkipped() Assert.True(blockEnd > taskRunIdx, "Catch block must follow the re-resume loop"); var loopBlock = source.Substring(taskRunIdx, blockEnd - taskRunIdx); - // INV-O14: must NOT use bare 'continue' on IsProcessing — this was the bug + // Must NOT use bare 'continue' on IsProcessing — that was the original bug Assert.DoesNotContain("if (otherState.Info.IsProcessing) continue;", loopBlock); - // INV-O14: must call ForceCompleteProcessingAsync for IsProcessing siblings - Assert.Contains("ForceCompleteProcessingAsync", loopBlock); - Assert.Contains("client-recreated-dead-event-stream", loopBlock); + // Must NOT force-complete processing siblings — that kills in-flight responses (PR #599) + Assert.DoesNotContain("ForceCompleteProcessingAsync", loopBlock); + + // Must capture siblingWasProcessing to preserve state through re-resume + Assert.Contains("siblingWasProcessing", loopBlock); + + // Must start watchdog after re-resume for processing siblings + Assert.Contains("StartProcessingWatchdog(siblingState", loopBlock); + + // The InvokeOnUI callback must have all 3 guards to prevent resurrection of + // completed turns (took 3 review rounds to get right): + Assert.Contains("if (siblingState.IsOrphaned) return;", loopBlock); + Assert.Contains("ProcessingGeneration) != reconnectGen) return;", loopBlock); + Assert.Contains("if (!siblingState.Info.IsProcessing) return;", loopBlock); } #endregion diff --git a/PolyPilot.Tests/SessionStabilityTests.cs b/PolyPilot.Tests/SessionStabilityTests.cs index 24b8c16097..d00ff48356 100644 --- a/PolyPilot.Tests/SessionStabilityTests.cs +++ b/PolyPilot.Tests/SessionStabilityTests.cs @@ -237,9 +237,9 @@ public void SiblingReResume_RegistersHandler_BeforePublishing() // Handler must appear BEFORE TryUpdate (register before publishing) var handlerIdx = sendMethod.IndexOf("HandleSessionEvent(siblingState", StringComparison.Ordinal); - var tryUpdateIdx = sendMethod.IndexOf("TryUpdate", StringComparison.Ordinal); + var tryUpdateIdx = sendMethod.IndexOf("_sessions.TryUpdate(capturedKey, siblingState", StringComparison.Ordinal); Assert.True(handlerIdx >= 0, "HandleSessionEvent(siblingState must be present in reconnect path"); - Assert.True(tryUpdateIdx >= 0, "TryUpdate must be present in reconnect path"); + Assert.True(tryUpdateIdx >= 0, "_sessions.TryUpdate must be present in reconnect path"); Assert.True(handlerIdx < tryUpdateIdx, "Handler registration must happen BEFORE TryUpdate (no window where events arrive with no handler)"); } diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 734807e4ae..ec5a8beb2c 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -3690,17 +3690,14 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Debug($"[RECONNECT] Skipping non-SDK sibling '{kvp.Key}' during client recreation"); continue; } - // INV-O14: IsProcessing siblings have dead event streams — - // their CopilotSession was tied to the old client which was - // just disposed. Force-abort so the orchestrator retries - // immediately instead of waiting 2–5 min for the watchdog. - if (otherState.Info.IsProcessing) - { - Debug($"[RECONNECT] Sibling '{kvp.Key}' is IsProcessing with dead event stream — force-completing before re-resume"); - try { await ForceCompleteProcessingAsync(kvp.Key, otherState, "client-recreated-dead-event-stream"); } - catch (Exception forceEx) { Debug($"[RECONNECT] Failed to force-complete sibling '{kvp.Key}': {forceEx.Message}"); } - // Fall through to re-resume the session on the new client - } + // Siblings that were actively processing have dead event streams + // (their CopilotSession was tied to the old disposed client). + // Instead of force-completing (which discards in-flight responses), + // preserve IsProcessing so the re-resumed session picks up + // events on the new connection. The watchdog handles genuinely + // dead sessions after re-resume. + var siblingWasProcessing = otherState.Info.IsProcessing; + var siblingProcessingPhase = otherState.Info.ProcessingPhase; var otherMeta = sessionSnapshots.FirstOrDefault(m => m.SessionName == kvp.Key); if (otherMeta?.GroupId != null && groupSnapshots.Any(g => g.Id == otherMeta.GroupId && g.IsCodespace)) @@ -3745,7 +3742,9 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // Re-check after await — a concurrent SendPromptAsync // may have started processing while we were resuming. // Orphan the just-resumed session rather than cancel a live turn. - if (capturedOtherState.Info.IsProcessing) + // BUT: if siblingWasProcessing is true, the session was already + // processing BEFORE the reconnect — that's expected, not concurrent. + if (capturedOtherState.Info.IsProcessing && !siblingWasProcessing) { Debug($"[RECONNECT] Sibling '{capturedKey}' started processing during re-resume — skipping"); try { await resumed.DisposeAsync(); } catch { } @@ -3757,7 +3756,12 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // instead of mutating otherState in place. capturedOtherState.IsOrphaned = true; Interlocked.Exchange(ref capturedOtherState.ProcessingGeneration, long.MaxValue); - // Cancel old TCS so any awaiter (orchestrator worker) doesn't hang + // Cancel old TCS so orchestrator workers hit OperationCanceledException. + // The catch filter in SendPromptAndWaitAsync detects that the session + // was replaced (not truly cancelled), re-acquires the new state from + // _sessions, and seamlessly re-awaits the new TCS. TrySetResult("") + // was tried but triggers the revival path (empty response → orphan + // the healthy reconnected state → create a third session from scratch). capturedOtherState.ResponseCompletion?.TrySetCanceled(); var siblingState = new SessionState { @@ -3765,6 +3769,11 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Info = capturedOtherState.Info, IsMultiAgentSession = capturedOtherState.IsMultiAgentSession, }; + // Carry forward flushed content from the old state so partial + // mid-turn responses (tool output, assistant text) aren't lost. + // Mirrors the primary reconnect path's FlushedResponse preservation. + if (capturedOtherState.FlushedResponse.Length > 0) + siblingState.FlushedResponse.Append(capturedOtherState.FlushedResponse); // Mirror primary reconnect: reset tool tracking for new connection siblingState.HasUsedToolsThisTurn = false; ClearDeferredIdleTracking(siblingState); @@ -3773,6 +3782,23 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref siblingState.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref siblingState.EventCountThisTurn, 0); Interlocked.Exchange(ref siblingState.TurnEndReceivedAtTicks, 0); + // If the sibling was actively processing before the reconnect, + // restore processing state BEFORE handler registration so events + // arriving on the new connection see IsProcessing=true immediately + // and don't trigger premature CompleteResponse or EVT-REARM-SKIP. + // These writes are safe despite Info being shared with the old state: + // they're idempotent (writing true to already-true fields since + // siblingWasProcessing was captured as true), and the old state is + // orphaned so its handlers bail out on the IsOrphaned guard. + if (siblingWasProcessing) + { + siblingState.Info.IsProcessing = true; + siblingState.Info.IsResumed = true; + siblingState.HasUsedToolsThisTurn = true; // 600s watchdog tier + siblingState.Info.ProcessingPhase = siblingProcessingPhase > 0 ? siblingProcessingPhase : 3; + siblingState.Info.ProcessingStartedAt ??= DateTime.UtcNow; + siblingState.ResponseCompletion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + } // Register handler BEFORE publishing to dictionary — // no window where events arrive with no handler. resumed.On(evt => HandleSessionEvent(siblingState, evt)); @@ -3788,6 +3814,22 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis } DisposePrematureIdleSignal(capturedOtherState); Debug($"[RECONNECT] Re-resumed sibling session '{capturedKey}' after client recreation"); + // Start watchdog on UI thread — IsProcessing and companion fields + // were already set before handler registration (above), so this + // just needs to start the timer and notify the UI. + if (siblingWasProcessing) + { + var reconnectGen = Interlocked.Read(ref siblingState.ProcessingGeneration); + InvokeOnUI(() => + { + if (siblingState.IsOrphaned) return; + if (Interlocked.Read(ref siblingState.ProcessingGeneration) != reconnectGen) return; + if (!siblingState.Info.IsProcessing) return; // CompleteResponse already ran — don't resurrect + StartProcessingWatchdog(siblingState, capturedKey); + NotifyStateChangedCoalesced(); + Debug($"[RECONNECT] Sibling '{capturedKey}' was processing — started watchdog on new connection"); + }); + } } catch (Exception reEx) { @@ -3801,6 +3843,20 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref capturedOtherState.ProcessingGeneration, long.MaxValue); // Unblock any orchestrator worker awaiting this session's TCS capturedOtherState.ResponseCompletion?.TrySetCanceled(); + // If the sibling was processing, clear IsProcessing so the UI + // doesn't show a stuck "Thinking..." on a dead connection. + if (siblingWasProcessing) + { + InvokeOnUI(() => + { + if (capturedOtherState.Info.IsProcessing) + { + Debug($"[RECONNECT] Clearing stuck IsProcessing on failed re-resume of '{capturedKey}'"); + ClearProcessingState(capturedOtherState); + NotifyStateChangedCoalesced(); + } + }); + } } } finally