Skip to content

Commit 0d4fcbd

Browse files
authored
Fix QQSHI13 issue hardening pass
Address QQSHI13-reported reliability, lifecycle, settings, and App refactor hardening issues.
1 parent bf21e7b commit 0d4fcbd

38 files changed

Lines changed: 1528 additions & 383 deletions

src/OpenClaw.Connection/GatewayConnectionManager.cs

Lines changed: 112 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,15 @@ public sealed class GatewayConnectionManager : IGatewayConnectionManager
2323
private readonly Func<GatewayRecord, string, bool>? _shouldStartNodeConnection;
2424
private readonly Func<TimeSpan, Task> _reconnectDelay;
2525
private readonly SemaphoreSlim _transitionSemaphore = new(1, 1);
26+
private readonly object _disposeLock = new();
2627

2728
private long _generation;
2829
private CancellationTokenSource? _operationCts;
2930
private IGatewayClientLifecycle? _activeLifecycle;
3031
private string? _activeIdentityPath; // identity directory for the active connection
3132
private string? _activeGatewayRecordId; // gateway record ID for node credential resolution
3233
private bool _disposed;
34+
private Task? _disposeTask;
3335
private bool _gatewayNeedsV2Signature; // remembered across reconnects
3436
private string? _lastAutoApprovedRequestId; // prevent auto-approve loops
3537
private string? _autoApproveInFlight; // atomic guard against concurrent approval of same requestId
@@ -128,7 +130,7 @@ private async Task ConnectCoreAsync(string? gatewayId = null)
128130
oldCts?.Dispose();
129131

130132
// Dispose old client
131-
DisposeActiveClient();
133+
await DisposeActiveClientAsync();
132134

133135
// Update snapshot with gateway info
134136
_stateMachine.Current = _stateMachine.Current with
@@ -273,7 +275,7 @@ public async Task DisconnectAsync()
273275
await _transitionSemaphore.WaitAsync();
274276
try
275277
{
276-
DisconnectCore();
278+
await DisconnectCoreAsync();
277279
}
278280
finally
279281
{
@@ -282,10 +284,10 @@ public async Task DisconnectAsync()
282284
}
283285

284286
/// <summary>Core disconnect logic. Caller must hold <see cref="_transitionSemaphore"/>.</summary>
285-
private void DisconnectCore()
287+
private async Task DisconnectCoreAsync()
286288
{
287289
var prev = _stateMachine.Current.OverallState;
288-
DisposeActiveClient();
290+
await DisposeActiveClientAsync();
289291
_stateMachine.TryTransition(ConnectionTrigger.DisconnectRequested);
290292
_diagnostics.RecordStateChange(prev, _stateMachine.Current.OverallState);
291293
EmitStateChanged(prev);
@@ -297,7 +299,7 @@ public async Task ReconnectAsync()
297299
await _transitionSemaphore.WaitAsync();
298300
try
299301
{
300-
DisconnectCore();
302+
await DisconnectCoreAsync();
301303
await ConnectCoreAsync();
302304
}
303305
finally
@@ -312,7 +314,7 @@ public async Task SwitchGatewayAsync(string gatewayId)
312314
await _transitionSemaphore.WaitAsync();
313315
try
314316
{
315-
DisconnectCore();
317+
await DisconnectCoreAsync();
316318
// Stop tunnel when switching gateways — the new one may not need it.
317319
// Use a bounded timeout to avoid blocking all connection transitions.
318320
if (_tunnelManager?.IsActive == true)
@@ -767,7 +769,14 @@ await _nodeConnector.ConnectAsync(nodeConnectUrl, nodeCredential, _activeIdentit
767769
return true;
768770
}
769771

770-
private async void OnNodeStatusChanged(object? sender, ConnectionStatus status)
772+
private void OnNodeStatusChanged(object? sender, ConnectionStatus status) =>
773+
AsyncEventHandlerGuard.Run(
774+
() => OnNodeStatusChangedAsync(status),
775+
_logger,
776+
nameof(OnNodeStatusChanged),
777+
ex => _diagnostics.Record("node", "Node status handler failed", ex.Message));
778+
779+
private async Task OnNodeStatusChangedAsync(ConnectionStatus status)
771780
{
772781
_diagnostics.Record("node", $"Node status: {status}");
773782

@@ -815,7 +824,14 @@ private async void OnNodeStatusChanged(object? sender, ConnectionStatus status)
815824
}
816825
}
817826

818-
private async void OnNodePairingStatusChanged(object? sender, PairingStatusEventArgs e)
827+
private void OnNodePairingStatusChanged(object? sender, PairingStatusEventArgs e) =>
828+
AsyncEventHandlerGuard.Run(
829+
() => OnNodePairingStatusChangedAsync(e),
830+
_logger,
831+
nameof(OnNodePairingStatusChanged),
832+
ex => _diagnostics.Record("node", "Node pairing handler failed", ex.Message));
833+
834+
private async Task OnNodePairingStatusChangedAsync(PairingStatusEventArgs e)
819835
{
820836
_diagnostics.Record("node", $"Node pairing: {e.Status}");
821837

@@ -926,12 +942,13 @@ private void EmitStateChanged(OverallConnectionState previousOverall)
926942
StateChanged?.Invoke(this, snapshot);
927943
}
928944

929-
private void DisposeActiveClient()
945+
private async Task DisposeActiveClientAsync()
930946
{
931-
// Disconnect node first — run on threadpool to avoid sync context deadlocks
947+
// Disconnect node first, but do not block the caller thread; shutdown
948+
// and reconnect paths await this with a bounded timeout.
932949
if (_nodeConnector != null)
933950
{
934-
try { Task.Run(() => _nodeConnector.DisconnectAsync()).Wait(TimeSpan.FromSeconds(2)); }
951+
try { await WaitWithTimeoutAsync(_nodeConnector.DisconnectAsync(), TimeSpan.FromSeconds(2), "Node disconnect"); }
935952
catch (Exception ex) { _logger.Warn($"[ConnMgr] Node disconnect error: {ex.Message}"); }
936953
}
937954

@@ -951,42 +968,116 @@ private void DisposeActiveClient()
951968
}
952969
}
953970

971+
private async Task WaitWithTimeoutAsync(Task task, TimeSpan timeout, string operation)
972+
{
973+
var completed = await Task.WhenAny(task, Task.Delay(timeout)).ConfigureAwait(false);
974+
if (completed != task)
975+
{
976+
_logger.Warn($"[ConnMgr] {operation} timed out after {timeout.TotalSeconds:F1}s");
977+
return;
978+
}
979+
980+
await task.ConfigureAwait(false);
981+
}
982+
954983
private void ThrowIfDisposed()
955984
{
956985
ObjectDisposedException.ThrowIf(_disposed, this);
957986
}
958987

988+
public ValueTask DisposeAsync()
989+
{
990+
var task = EnsureDisposeTask();
991+
return new ValueTask(task);
992+
}
993+
959994
public void Dispose()
995+
{
996+
ObserveBackgroundFault(EnsureDisposeTask(), "[ConnMgr] Dispose error");
997+
}
998+
999+
private Task EnsureDisposeTask()
1000+
{
1001+
lock (_disposeLock)
1002+
{
1003+
return _disposeTask ??= DisposeCoreAsync();
1004+
}
1005+
}
1006+
1007+
private async Task DisposeCoreAsync()
9601008
{
9611009
if (_disposed) return;
9621010
_disposed = true;
1011+
_operationCts?.Cancel();
1012+
9631013
// Unsubscribe from node events before disposing the semaphore
964-
// to prevent async void handlers from crashing via ObjectDisposedException.
1014+
// to prevent guarded async handlers from racing the disposed semaphore.
9651015
if (_nodeConnector != null)
9661016
{
9671017
_nodeConnector.StatusChanged -= OnNodeStatusChanged;
9681018
_nodeConnector.PairingStatusChanged -= OnNodePairingStatusChanged;
9691019
}
9701020
// Acquire semaphore briefly to ensure no in-flight reconnect/switch is mid-transition.
971-
// Use a short timeout — if something is stuck, proceed with disposal anyway.
972-
try { _transitionSemaphore.Wait(TimeSpan.FromSeconds(2)); } catch { }
1021+
// Use a short timeout — if something is stuck, proceed with disposal anyway,
1022+
// but do not dispose the semaphore out from under the holder.
1023+
var semaphoreEntered = false;
1024+
try
1025+
{
1026+
semaphoreEntered = await _transitionSemaphore.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false);
1027+
if (!semaphoreEntered)
1028+
_logger.Warn("[ConnMgr] Dispose timed out waiting for transition semaphore");
1029+
}
1030+
catch (ObjectDisposedException)
1031+
{
1032+
return;
1033+
}
1034+
9731035
try
9741036
{
9751037
_stateMachine.TryTransition(ConnectionTrigger.Disposed);
976-
DisposeActiveClient();
977-
// Stop tunnel on app shutdown — run on threadpool with timeout to avoid stalling exit
1038+
await DisposeActiveClientAsync();
1039+
// Stop tunnel on app shutdown with timeout to avoid stalling exit.
9781040
if (_tunnelManager?.IsActive == true)
9791041
{
980-
try { Task.Run(() => _tunnelManager.StopAsync()).Wait(TimeSpan.FromSeconds(3)); }
981-
catch { /* shutting down — best effort */ }
1042+
try { await WaitWithTimeoutAsync(_tunnelManager.StopAsync(), TimeSpan.FromSeconds(3), "Tunnel stop"); }
1043+
catch (Exception ex) { _logger.Warn($"[ConnMgr] Tunnel stop error during dispose: {ex.Message}"); }
9821044
}
983-
_operationCts?.Cancel();
9841045
_operationCts?.Dispose();
1046+
_operationCts = null;
9851047
}
9861048
finally
9871049
{
988-
try { _transitionSemaphore.Release(); } catch { }
989-
_transitionSemaphore.Dispose();
1050+
if (semaphoreEntered)
1051+
{
1052+
try { _transitionSemaphore.Release(); } catch { }
1053+
_transitionSemaphore.Dispose();
1054+
}
1055+
1056+
GC.SuppressFinalize(this);
1057+
}
1058+
}
1059+
1060+
private void ObserveBackgroundFault(Task task, string message)
1061+
{
1062+
if (task.IsFaulted)
1063+
{
1064+
_logger.Warn($"{message}: {task.Exception.GetBaseException().Message}");
1065+
return;
1066+
}
1067+
1068+
if (task.IsCanceled)
1069+
{
1070+
_logger.Warn($"{message}: canceled");
1071+
return;
1072+
}
1073+
1074+
if (!task.IsCompleted)
1075+
{
1076+
_ = task.ContinueWith(
1077+
t => _logger.Warn($"{message}: {t.Exception!.GetBaseException().Message}"),
1078+
CancellationToken.None,
1079+
TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously,
1080+
TaskScheduler.Default);
9901081
}
9911082
}
9921083
}

src/OpenClaw.Connection/IGatewayConnectionManager.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ namespace OpenClaw.Connection;
77
/// Manages operator connection, node connection, credential resolution,
88
/// state transitions, and diagnostics.
99
/// </summary>
10-
public interface IGatewayConnectionManager : IDisposable
10+
public interface IGatewayConnectionManager : IDisposable, IAsyncDisposable
1111
{
1212
// ─── State ───
1313
GatewayConnectionSnapshot CurrentSnapshot { get; }

src/OpenClaw.SetupEngine.UI/OpenClaw.SetupEngine.UI.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
<ItemGroup>
3434
<ProjectReference Include="..\OpenClaw.SetupEngine\OpenClaw.SetupEngine.csproj" />
35+
<ProjectReference Include="..\OpenClaw.Shared\OpenClaw.Shared.csproj" />
3536
</ItemGroup>
3637

3738
<!-- Reuse Setup assets from the Tray project -->

src/OpenClaw.SetupEngine.UI/Pages/ProgressPage.xaml.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using Microsoft.UI.Xaml.Controls;
55
using Microsoft.UI.Xaml.Media;
66
using Microsoft.UI.Xaml.Navigation;
7+
using OpenClaw.Shared;
78
using Windows.UI;
89

910
namespace OpenClaw.SetupEngine.UI.Pages;
@@ -59,7 +60,13 @@ private void BuildStepRows()
5960
}
6061
}
6162

62-
private async void StartPipeline()
63+
private void StartPipeline() =>
64+
AsyncEventHandlerGuard.Run(
65+
StartPipelineAsync,
66+
NullLogger.Instance,
67+
nameof(StartPipeline));
68+
69+
private async Task StartPipelineAsync()
6370
{
6471
var config = _config!;
6572
if (_runCts != null)

src/OpenClaw.SetupEngine.UI/Pages/WelcomePage.xaml.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using Microsoft.UI.Xaml.Media;
66
using Microsoft.UI.Xaml.Navigation;
77
using OpenClaw.SetupEngine;
8+
using OpenClaw.Shared;
89
using System.Diagnostics;
910
using System.Numerics;
1011
using Windows.UI;
@@ -57,7 +58,13 @@ private void StartLobsterBreatheAnimation()
5758
visual.StartAnimation("Scale", pulse);
5859
}
5960

60-
private async void StartButton_Click(object sender, RoutedEventArgs e)
61+
private void StartButton_Click(object sender, RoutedEventArgs e) =>
62+
AsyncEventHandlerGuard.Run(
63+
StartButtonClickAsync,
64+
NullLogger.Instance,
65+
nameof(StartButton_Click));
66+
67+
private async Task StartButtonClickAsync()
6168
{
6269
var dataDir = Environment.GetEnvironmentVariable("OPENCLAW_TRAY_DATA_DIR")
6370
?? Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData), "OpenClawTray");

src/OpenClaw.SetupEngine.UI/Pages/WizardPage.xaml.cs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,13 @@ private static Brush ResourceBrush(string key)
341341
: new SolidColorBrush(Microsoft.UI.Colors.Gray);
342342
}
343343

344-
private async void Primary_Click(object sender, RoutedEventArgs e)
344+
private void Primary_Click(object sender, RoutedEventArgs e) =>
345+
AsyncEventHandlerGuard.Run(
346+
PrimaryClickAsync,
347+
NullLogger.Instance,
348+
nameof(Primary_Click));
349+
350+
private async Task PrimaryClickAsync()
345351
{
346352
if (_errorState)
347353
{
@@ -352,7 +358,13 @@ private async void Primary_Click(object sender, RoutedEventArgs e)
352358
await SendCurrentAnswerAsync(skip: false);
353359
}
354360

355-
private async void Secondary_Click(object sender, RoutedEventArgs e)
361+
private void Secondary_Click(object sender, RoutedEventArgs e) =>
362+
AsyncEventHandlerGuard.Run(
363+
SecondaryClickAsync,
364+
NullLogger.Instance,
365+
nameof(Secondary_Click));
366+
367+
private async Task SecondaryClickAsync()
356368
{
357369
if (_errorState)
358370
{
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
namespace OpenClaw.Shared;
2+
3+
/// <summary>
4+
/// Runs async event-handler work behind a fault boundary so fire-and-forget UI
5+
/// and transport events cannot crash the process through an async void escape.
6+
/// </summary>
7+
public static class AsyncEventHandlerGuard
8+
{
9+
public static void Run(
10+
Func<Task> work,
11+
IOpenClawLogger? logger = null,
12+
string? operationName = null,
13+
Action<Exception>? onError = null)
14+
{
15+
ArgumentNullException.ThrowIfNull(work);
16+
_ = RunCoreAsync(work, logger, operationName, onError);
17+
}
18+
19+
private static async Task RunCoreAsync(
20+
Func<Task> work,
21+
IOpenClawLogger? logger,
22+
string? operationName,
23+
Action<Exception>? onError)
24+
{
25+
try
26+
{
27+
await work();
28+
}
29+
catch (OperationCanceledException ex)
30+
{
31+
logger?.Debug($"{FormatOperation(operationName)} canceled: {ex.Message}");
32+
}
33+
catch (Exception ex)
34+
{
35+
logger?.Error($"{FormatOperation(operationName)} failed", ex);
36+
onError?.Invoke(ex);
37+
}
38+
}
39+
40+
private static string FormatOperation(string? operationName) =>
41+
string.IsNullOrWhiteSpace(operationName) ? "Async event handler" : operationName;
42+
}

0 commit comments

Comments
 (0)