@@ -23,13 +23,15 @@ public sealed class GatewayConnectionManager : IGatewayConnectionManager
2323 private readonly Func < GatewayRecord , string , bool > ? _shouldStartNodeConnection ;
2424 private readonly Func < TimeSpan , Task > _reconnectDelay ;
2525 private readonly SemaphoreSlim _transitionSemaphore = new ( 1 , 1 ) ;
26+ private readonly object _disposeLock = new ( ) ;
2627
2728 private long _generation ;
2829 private CancellationTokenSource ? _operationCts ;
2930 private IGatewayClientLifecycle ? _activeLifecycle ;
3031 private string ? _activeIdentityPath ; // identity directory for the active connection
3132 private string ? _activeGatewayRecordId ; // gateway record ID for node credential resolution
3233 private bool _disposed ;
34+ private Task ? _disposeTask ;
3335 private bool _gatewayNeedsV2Signature ; // remembered across reconnects
3436 private string ? _lastAutoApprovedRequestId ; // prevent auto-approve loops
3537 private string ? _autoApproveInFlight ; // atomic guard against concurrent approval of same requestId
@@ -128,7 +130,7 @@ private async Task ConnectCoreAsync(string? gatewayId = null)
128130 oldCts ? . Dispose ( ) ;
129131
130132 // Dispose old client
131- DisposeActiveClient ( ) ;
133+ await DisposeActiveClientAsync ( ) ;
132134
133135 // Update snapshot with gateway info
134136 _stateMachine . Current = _stateMachine . Current with
@@ -273,7 +275,7 @@ public async Task DisconnectAsync()
273275 await _transitionSemaphore . WaitAsync ( ) ;
274276 try
275277 {
276- DisconnectCore ( ) ;
278+ await DisconnectCoreAsync ( ) ;
277279 }
278280 finally
279281 {
@@ -282,10 +284,10 @@ public async Task DisconnectAsync()
282284 }
283285
284286 /// <summary>Core disconnect logic. Caller must hold <see cref="_transitionSemaphore"/>.</summary>
285- private void DisconnectCore ( )
287+ private async Task DisconnectCoreAsync ( )
286288 {
287289 var prev = _stateMachine . Current . OverallState ;
288- DisposeActiveClient ( ) ;
290+ await DisposeActiveClientAsync ( ) ;
289291 _stateMachine . TryTransition ( ConnectionTrigger . DisconnectRequested ) ;
290292 _diagnostics . RecordStateChange ( prev , _stateMachine . Current . OverallState ) ;
291293 EmitStateChanged ( prev ) ;
@@ -297,7 +299,7 @@ public async Task ReconnectAsync()
297299 await _transitionSemaphore . WaitAsync ( ) ;
298300 try
299301 {
300- DisconnectCore ( ) ;
302+ await DisconnectCoreAsync ( ) ;
301303 await ConnectCoreAsync ( ) ;
302304 }
303305 finally
@@ -312,7 +314,7 @@ public async Task SwitchGatewayAsync(string gatewayId)
312314 await _transitionSemaphore . WaitAsync ( ) ;
313315 try
314316 {
315- DisconnectCore ( ) ;
317+ await DisconnectCoreAsync ( ) ;
316318 // Stop tunnel when switching gateways — the new one may not need it.
317319 // Use a bounded timeout to avoid blocking all connection transitions.
318320 if ( _tunnelManager ? . IsActive == true )
@@ -767,7 +769,14 @@ await _nodeConnector.ConnectAsync(nodeConnectUrl, nodeCredential, _activeIdentit
767769 return true ;
768770 }
769771
770- private async void OnNodeStatusChanged ( object ? sender , ConnectionStatus status )
772+ private void OnNodeStatusChanged ( object ? sender , ConnectionStatus status ) =>
773+ AsyncEventHandlerGuard . Run (
774+ ( ) => OnNodeStatusChangedAsync ( status ) ,
775+ _logger ,
776+ nameof ( OnNodeStatusChanged ) ,
777+ ex => _diagnostics . Record ( "node" , "Node status handler failed" , ex . Message ) ) ;
778+
779+ private async Task OnNodeStatusChangedAsync ( ConnectionStatus status )
771780 {
772781 _diagnostics . Record ( "node" , $ "Node status: { status } ") ;
773782
@@ -815,7 +824,14 @@ private async void OnNodeStatusChanged(object? sender, ConnectionStatus status)
815824 }
816825 }
817826
818- private async void OnNodePairingStatusChanged ( object ? sender , PairingStatusEventArgs e )
827+ private void OnNodePairingStatusChanged ( object ? sender , PairingStatusEventArgs e ) =>
828+ AsyncEventHandlerGuard . Run (
829+ ( ) => OnNodePairingStatusChangedAsync ( e ) ,
830+ _logger ,
831+ nameof ( OnNodePairingStatusChanged ) ,
832+ ex => _diagnostics . Record ( "node" , "Node pairing handler failed" , ex . Message ) ) ;
833+
834+ private async Task OnNodePairingStatusChangedAsync ( PairingStatusEventArgs e )
819835 {
820836 _diagnostics . Record ( "node" , $ "Node pairing: { e . Status } ") ;
821837
@@ -926,12 +942,13 @@ private void EmitStateChanged(OverallConnectionState previousOverall)
926942 StateChanged ? . Invoke ( this , snapshot ) ;
927943 }
928944
929- private void DisposeActiveClient ( )
945+ private async Task DisposeActiveClientAsync ( )
930946 {
931- // Disconnect node first — run on threadpool to avoid sync context deadlocks
947+ // Disconnect node first, but do not block the caller thread; shutdown
948+ // and reconnect paths await this with a bounded timeout.
932949 if ( _nodeConnector != null )
933950 {
934- try { Task . Run ( ( ) => _nodeConnector . DisconnectAsync ( ) ) . Wait ( TimeSpan . FromSeconds ( 2 ) ) ; }
951+ try { await WaitWithTimeoutAsync ( _nodeConnector . DisconnectAsync ( ) , TimeSpan . FromSeconds ( 2 ) , "Node disconnect" ) ; }
935952 catch ( Exception ex ) { _logger . Warn ( $ "[ConnMgr] Node disconnect error: { ex . Message } ") ; }
936953 }
937954
@@ -951,42 +968,116 @@ private void DisposeActiveClient()
951968 }
952969 }
953970
971+ private async Task WaitWithTimeoutAsync ( Task task , TimeSpan timeout , string operation )
972+ {
973+ var completed = await Task . WhenAny ( task , Task . Delay ( timeout ) ) . ConfigureAwait ( false ) ;
974+ if ( completed != task )
975+ {
976+ _logger . Warn ( $ "[ConnMgr] { operation } timed out after { timeout . TotalSeconds : F1} s") ;
977+ return ;
978+ }
979+
980+ await task . ConfigureAwait ( false ) ;
981+ }
982+
954983 private void ThrowIfDisposed ( )
955984 {
956985 ObjectDisposedException . ThrowIf ( _disposed , this ) ;
957986 }
958987
988+ public ValueTask DisposeAsync ( )
989+ {
990+ var task = EnsureDisposeTask ( ) ;
991+ return new ValueTask ( task ) ;
992+ }
993+
959994 public void Dispose ( )
995+ {
996+ ObserveBackgroundFault ( EnsureDisposeTask ( ) , "[ConnMgr] Dispose error" ) ;
997+ }
998+
999+ private Task EnsureDisposeTask ( )
1000+ {
1001+ lock ( _disposeLock )
1002+ {
1003+ return _disposeTask ??= DisposeCoreAsync ( ) ;
1004+ }
1005+ }
1006+
1007+ private async Task DisposeCoreAsync ( )
9601008 {
9611009 if ( _disposed ) return ;
9621010 _disposed = true ;
1011+ _operationCts ? . Cancel ( ) ;
1012+
9631013 // Unsubscribe from node events before disposing the semaphore
964- // to prevent async void handlers from crashing via ObjectDisposedException .
1014+ // to prevent guarded async handlers from racing the disposed semaphore .
9651015 if ( _nodeConnector != null )
9661016 {
9671017 _nodeConnector . StatusChanged -= OnNodeStatusChanged ;
9681018 _nodeConnector . PairingStatusChanged -= OnNodePairingStatusChanged ;
9691019 }
9701020 // Acquire semaphore briefly to ensure no in-flight reconnect/switch is mid-transition.
971- // Use a short timeout — if something is stuck, proceed with disposal anyway.
972- try { _transitionSemaphore . Wait ( TimeSpan . FromSeconds ( 2 ) ) ; } catch { }
1021+ // Use a short timeout — if something is stuck, proceed with disposal anyway,
1022+ // but do not dispose the semaphore out from under the holder.
1023+ var semaphoreEntered = false ;
1024+ try
1025+ {
1026+ semaphoreEntered = await _transitionSemaphore . WaitAsync ( TimeSpan . FromSeconds ( 2 ) ) . ConfigureAwait ( false ) ;
1027+ if ( ! semaphoreEntered )
1028+ _logger . Warn ( "[ConnMgr] Dispose timed out waiting for transition semaphore" ) ;
1029+ }
1030+ catch ( ObjectDisposedException )
1031+ {
1032+ return ;
1033+ }
1034+
9731035 try
9741036 {
9751037 _stateMachine . TryTransition ( ConnectionTrigger . Disposed ) ;
976- DisposeActiveClient ( ) ;
977- // Stop tunnel on app shutdown — run on threadpool with timeout to avoid stalling exit
1038+ await DisposeActiveClientAsync ( ) ;
1039+ // Stop tunnel on app shutdown with timeout to avoid stalling exit.
9781040 if ( _tunnelManager ? . IsActive == true )
9791041 {
980- try { Task . Run ( ( ) => _tunnelManager . StopAsync ( ) ) . Wait ( TimeSpan . FromSeconds ( 3 ) ) ; }
981- catch { /* shutting down — best effort */ }
1042+ try { await WaitWithTimeoutAsync ( _tunnelManager . StopAsync ( ) , TimeSpan . FromSeconds ( 3 ) , "Tunnel stop" ) ; }
1043+ catch ( Exception ex ) { _logger . Warn ( $ "[ConnMgr] Tunnel stop error during dispose: { ex . Message } " ) ; }
9821044 }
983- _operationCts ? . Cancel ( ) ;
9841045 _operationCts ? . Dispose ( ) ;
1046+ _operationCts = null ;
9851047 }
9861048 finally
9871049 {
988- try { _transitionSemaphore . Release ( ) ; } catch { }
989- _transitionSemaphore . Dispose ( ) ;
1050+ if ( semaphoreEntered )
1051+ {
1052+ try { _transitionSemaphore . Release ( ) ; } catch { }
1053+ _transitionSemaphore . Dispose ( ) ;
1054+ }
1055+
1056+ GC . SuppressFinalize ( this ) ;
1057+ }
1058+ }
1059+
1060+ private void ObserveBackgroundFault ( Task task , string message )
1061+ {
1062+ if ( task . IsFaulted )
1063+ {
1064+ _logger . Warn ( $ "{ message } : { task . Exception . GetBaseException ( ) . Message } ") ;
1065+ return ;
1066+ }
1067+
1068+ if ( task . IsCanceled )
1069+ {
1070+ _logger . Warn ( $ "{ message } : canceled") ;
1071+ return ;
1072+ }
1073+
1074+ if ( ! task . IsCompleted )
1075+ {
1076+ _ = task . ContinueWith (
1077+ t => _logger . Warn ( $ "{ message } : { t . Exception ! . GetBaseException ( ) . Message } ") ,
1078+ CancellationToken . None ,
1079+ TaskContinuationOptions . OnlyOnFaulted | TaskContinuationOptions . ExecuteSynchronously ,
1080+ TaskScheduler . Default ) ;
9901081 }
9911082 }
9921083}
0 commit comments