Skip to content

Commit 55ec4d0

Browse files
authored
fix(security): add SSH session token expiry, connection limits, and lifecycle cleanup (#182)
* fix(security): add SSH session token expiry, connection limits, and lifecycle cleanup Closes #22 SSH session tokens previously had no TTL and remained valid indefinitely. This adds configurable token expiry (default 24h), per-token (10) and per-sandbox (20) concurrent connection limits, session cleanup on sandbox deletion, and a background reaper for expired/revoked sessions. * fix(security): lower per-token concurrent connection limit from 10 to 3 --------- Co-authored-by: John Myers <johntmyers@users.noreply.github.com>
1 parent 86335c4 commit 55ec4d0

File tree

6 files changed

+409
-6
lines changed

6 files changed

+409
-6
lines changed

architecture/gateway-security.md

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,31 @@ SSH connections into sandboxes pass through the gateway's HTTP CONNECT tunnel at
229229
| `x-sandbox-id` | Identifies the target sandbox |
230230
| `x-sandbox-token` | Session token (created via `CreateSshSession` RPC) |
231231

232-
The gateway validates the token against the stored `SshSession` record, checks that it has not been revoked, and confirms the `sandbox_id` matches.
232+
The gateway validates the token against the stored `SshSession` record and checks:
233+
234+
1. The token has not been revoked.
235+
2. The `sandbox_id` matches the request header.
236+
3. The token has not expired (`expires_at_ms` check; 0 means no expiry for backward compatibility).
237+
238+
### Session Lifecycle
239+
240+
SSH session tokens have a configurable TTL (`ssh_session_ttl_secs`, default 24 hours). The `expires_at_ms` field is set at creation time and checked on every tunnel request. Setting the TTL to 0 disables expiry.
241+
242+
Sessions are cleaned up automatically:
243+
244+
- **On sandbox deletion**: all SSH sessions for the deleted sandbox are removed from the store.
245+
- **Background reaper**: a periodic task (hourly) deletes expired and revoked session records to prevent unbounded database growth.
246+
247+
### Connection Limits
248+
249+
The gateway enforces two concurrent connection limits to bound the impact of credential misuse:
250+
251+
| Limit | Value | Purpose |
252+
|---|---|---|
253+
| Per-token | 10 concurrent tunnels | Limits damage from a single leaked token |
254+
| Per-sandbox | 20 concurrent tunnels | Prevents bypass via creating many tokens for one sandbox |
255+
256+
These limits are tracked in-memory and decremented when tunnels close. Exceeding either limit returns HTTP 429 (Too Many Requests).
233257

234258
### NSSH1 Handshake
235259

@@ -362,6 +386,7 @@ This section defines the primary attacker profiles, what the current design prot
362386
| Weak cryptoperiod | Certificates are effectively non-expiring by default |
363387
| Limited fine-grained revocation | CA private key is not persisted; rotation is coarse-grained |
364388
| Local credential theft risk | CLI mTLS key material is stored on developer filesystem |
389+
| SSH token + mTLS = persistent access within trust boundary | SSH tokens expire after 24h (configurable) and are capped at 3 concurrent connections per token / 20 per sandbox, but within the mTLS trust boundary a stolen token remains usable until TTL expires |
365390

366391
### Out of Scope / Not Defended By This Layer
367392

crates/navigator-core/src/config.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ pub struct Config {
6161
#[serde(default = "default_ssh_handshake_skew_secs")]
6262
pub ssh_handshake_skew_secs: u64,
6363

64+
/// TTL for SSH session tokens, in seconds. 0 disables expiry.
65+
#[serde(default = "default_ssh_session_ttl_secs")]
66+
pub ssh_session_ttl_secs: u64,
67+
6468
/// Kubernetes secret name containing client TLS materials for sandbox pods.
6569
/// When set, sandbox pods get this secret mounted so they can connect to
6670
/// the server over mTLS.
@@ -103,6 +107,7 @@ impl Config {
103107
sandbox_ssh_port: default_sandbox_ssh_port(),
104108
ssh_handshake_secret: String::new(),
105109
ssh_handshake_skew_secs: default_ssh_handshake_skew_secs(),
110+
ssh_session_ttl_secs: default_ssh_session_ttl_secs(),
106111
client_tls_secret_name: String::new(),
107112
}
108113
}
@@ -191,6 +196,13 @@ impl Config {
191196
self
192197
}
193198

199+
/// Create a new configuration with the SSH session TTL.
200+
#[must_use]
201+
pub const fn with_ssh_session_ttl_secs(mut self, secs: u64) -> Self {
202+
self.ssh_session_ttl_secs = secs;
203+
self
204+
}
205+
194206
/// Set the Kubernetes secret name for sandbox client TLS materials.
195207
#[must_use]
196208
pub fn with_client_tls_secret_name(mut self, name: impl Into<String>) -> Self {
@@ -230,3 +242,7 @@ const fn default_sandbox_ssh_port() -> u16 {
230242
const fn default_ssh_handshake_skew_secs() -> u64 {
231243
300
232244
}
245+
246+
const fn default_ssh_session_ttl_secs() -> u64 {
247+
86400 // 24 hours
248+
}

crates/navigator-server/src/grpc.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,33 @@ impl Navigator for NavigatorService {
544544
self.state.sandbox_index.update_from_sandbox(&sandbox);
545545
self.state.sandbox_watch_bus.notify(&id);
546546

547+
// Clean up SSH sessions associated with this sandbox.
548+
if let Ok(records) = self
549+
.state
550+
.store
551+
.list(SshSession::object_type(), 1000, 0)
552+
.await
553+
{
554+
for record in records {
555+
if let Ok(session) = SshSession::decode(record.payload.as_slice()) {
556+
if session.sandbox_id == id {
557+
if let Err(e) = self
558+
.state
559+
.store
560+
.delete(SshSession::object_type(), &session.id)
561+
.await
562+
{
563+
warn!(
564+
session_id = %session.id,
565+
error = %e,
566+
"Failed to delete SSH session during sandbox cleanup"
567+
);
568+
}
569+
}
570+
}
571+
}
572+
}
573+
547574
let deleted = match self.state.sandbox_client.delete(&sandbox.name).await {
548575
Ok(deleted) => deleted,
549576
Err(err) => {
@@ -787,14 +814,21 @@ impl Navigator for NavigatorService {
787814
}
788815

789816
let token = uuid::Uuid::new_v4().to_string();
817+
let now_ms = current_time_ms()
818+
.map_err(|e| Status::internal(format!("timestamp generation failed: {e}")))?;
819+
let expires_at_ms = if self.state.config.ssh_session_ttl_secs > 0 {
820+
now_ms + (self.state.config.ssh_session_ttl_secs as i64 * 1000)
821+
} else {
822+
0
823+
};
790824
let session = SshSession {
791825
id: token.clone(),
792826
sandbox_id: req.sandbox_id.clone(),
793827
token: token.clone(),
794-
created_at_ms: current_time_ms()
795-
.map_err(|e| Status::internal(format!("timestamp generation failed: {e}")))?,
828+
created_at_ms: now_ms,
796829
revoked: false,
797830
name: generate_name(),
831+
expires_at_ms,
798832
};
799833

800834
self.state
@@ -814,6 +848,7 @@ impl Navigator for NavigatorService {
814848
gateway_scheme: scheme.to_string(),
815849
connect_path: self.state.config.ssh_connect_path.clone(),
816850
host_key_fingerprint: String::new(),
851+
expires_at_ms,
817852
}))
818853
}
819854

crates/navigator-server/src/lib.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ mod tls;
2222
pub mod tracing_bus;
2323

2424
use navigator_core::{Config, Error, Result};
25-
use std::sync::Arc;
25+
use std::collections::HashMap;
26+
use std::sync::{Arc, Mutex};
2627
use tokio::net::TcpListener;
2728
use tracing::{error, info};
2829

@@ -56,6 +57,12 @@ pub struct ServerState {
5657

5758
/// In-memory bus for server process logs.
5859
pub tracing_log_bus: TracingLogBus,
60+
61+
/// Active SSH tunnel connection counts per session token.
62+
pub ssh_connections_by_token: Mutex<HashMap<String, u32>>,
63+
64+
/// Active SSH tunnel connection counts per sandbox id.
65+
pub ssh_connections_by_sandbox: Mutex<HashMap<String, u32>>,
5966
}
6067

6168
impl ServerState {
@@ -76,6 +83,8 @@ impl ServerState {
7683
sandbox_index,
7784
sandbox_watch_bus,
7885
tracing_log_bus,
86+
ssh_connections_by_token: Mutex::new(HashMap::new()),
87+
ssh_connections_by_sandbox: Mutex::new(HashMap::new()),
7988
}
8089
}
8190
}
@@ -138,6 +147,7 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul
138147
state.tracing_log_bus.clone(),
139148
);
140149
spawn_kube_event_tailer(state.clone());
150+
ssh_tunnel::spawn_session_reaper(store.clone(), std::time::Duration::from_secs(3600));
141151

142152
// Create the multiplexed service
143153
let service = MultiplexService::new(state.clone());

0 commit comments

Comments
 (0)