Skip to content

Commit 1c659c1

Browse files
authored
fix(sandbox/bootstrap): GPU Landlock baseline paths and CDI spec missing diagnosis (#710)
* fix(sandbox): add GPU device nodes and nvidia-persistenced to landlock baseline Landlock READ_FILE/WRITE_FILE restricts open(2) on character device files even when DAC permissions would otherwise allow it. GPU sandboxes need /dev/nvidiactl, /dev/nvidia-uvm, /dev/nvidia-uvm-tools, /dev/nvidia-modeset, and per-GPU /dev/nvidiaX nodes in the policy to allow NVML initialization. Additionally, CDI bind-mounts /run/nvidia-persistenced/socket into the container. NVML tries to connect to this socket at init time; if the directory is not in the landlock policy, it receives EACCES (not ECONNREFUSED), which causes NVML to abort with NVML_ERROR_INSUFFICIENT_PERMISSIONS even though nvidia-persistenced is optional. Both classes of paths are auto-added to the baseline when /dev/nvidiactl is present. Per-GPU device nodes are enumerated at runtime to handle multi-GPU configurations.
1 parent d9e8fe5 commit 1c659c1

4 files changed

Lines changed: 294 additions & 20 deletions

File tree

crates/openshell-bootstrap/src/errors.rs

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,21 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
169169
match_mode: MatchMode::Any,
170170
diagnose: diagnose_docker_not_running,
171171
},
172+
// CDI specs missing — Docker daemon has CDI configured but no spec files exist
173+
// or the requested device ID (nvidia.com/gpu=all) is not in any spec.
174+
// Matches errors from Docker 25+ and containerd CDI injection paths.
175+
FailurePattern {
176+
matchers: &[
177+
"CDI device not found",
178+
"unknown CDI device",
179+
"failed to inject CDI devices",
180+
"no CDI devices found",
181+
"CDI device injection failed",
182+
"unresolvable CDI devices",
183+
],
184+
match_mode: MatchMode::Any,
185+
diagnose: diagnose_cdi_specs_missing,
186+
},
172187
];
173188

174189
fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis {
@@ -396,6 +411,29 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
396411
}
397412
}
398413

414+
fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis {
415+
GatewayFailureDiagnosis {
416+
summary: "CDI specs not found on host".to_string(),
417+
explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \
418+
directories configured) but no CDI device specs were found on the host. \
419+
Specs must be pre-generated before OpenShell can inject the GPU into the \
420+
cluster container."
421+
.to_string(),
422+
recovery_steps: vec![
423+
RecoveryStep::with_command(
424+
"Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)",
425+
"sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml",
426+
),
427+
RecoveryStep::with_command(
428+
"Verify the specs were generated and include nvidia.com/gpu entries",
429+
"nvidia-ctk cdi list",
430+
),
431+
RecoveryStep::new("Then retry: openshell gateway start --gpu"),
432+
],
433+
retryable: false,
434+
}
435+
}
436+
399437
fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis {
400438
GatewayFailureDiagnosis {
401439
summary: "Docker is not running".to_string(),
@@ -925,4 +963,76 @@ mod tests {
925963
"commands should include gateway name, got: {all_commands}"
926964
);
927965
}
966+
967+
#[test]
968+
fn test_diagnose_cdi_device_not_found() {
969+
let diagnosis = diagnose_failure(
970+
"test",
971+
"could not run container: CDI device not found: nvidia.com/gpu=all",
972+
None,
973+
);
974+
assert!(diagnosis.is_some());
975+
let d = diagnosis.unwrap();
976+
assert!(
977+
d.summary.contains("CDI"),
978+
"expected CDI diagnosis, got: {}",
979+
d.summary
980+
);
981+
assert!(!d.retryable);
982+
}
983+
984+
#[test]
985+
fn test_diagnose_cdi_injection_failed_unresolvable() {
986+
// Exact error observed from Docker 500 response
987+
let diagnosis = diagnose_failure(
988+
"test",
989+
"Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all",
990+
None,
991+
);
992+
assert!(diagnosis.is_some());
993+
let d = diagnosis.unwrap();
994+
assert!(
995+
d.summary.contains("CDI"),
996+
"expected CDI diagnosis, got: {}",
997+
d.summary
998+
);
999+
assert!(!d.retryable);
1000+
}
1001+
1002+
#[test]
1003+
fn test_diagnose_unknown_cdi_device() {
1004+
// containerd error path
1005+
let diagnosis = diagnose_failure(
1006+
"test",
1007+
"unknown CDI device requested: nvidia.com/gpu=all",
1008+
None,
1009+
);
1010+
assert!(diagnosis.is_some());
1011+
let d = diagnosis.unwrap();
1012+
assert!(
1013+
d.summary.contains("CDI"),
1014+
"expected CDI diagnosis, got: {}",
1015+
d.summary
1016+
);
1017+
}
1018+
1019+
#[test]
1020+
fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() {
1021+
let d = diagnose_failure("test", "CDI device not found", None)
1022+
.expect("should match CDI pattern");
1023+
let all_steps: String = d
1024+
.recovery_steps
1025+
.iter()
1026+
.map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or("")))
1027+
.collect::<Vec<_>>()
1028+
.join("\n");
1029+
assert!(
1030+
all_steps.contains("nvidia-ctk cdi generate"),
1031+
"recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}"
1032+
);
1033+
assert!(
1034+
all_steps.contains("/etc/cdi/"),
1035+
"recovery steps should mention /etc/cdi/, got: {all_steps}"
1036+
);
1037+
}
9281038
}

crates/openshell-sandbox/src/lib.rs

Lines changed: 182 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -877,13 +877,106 @@ pub(crate) fn spawn_route_refresh(
877877

878878
/// Minimum read-only paths required for a proxy-mode sandbox child process to
879879
/// function: dynamic linker, shared libraries, DNS resolution, CA certs,
880-
/// Python venv, and openshell logs.
881-
const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/var/log"];
880+
/// Python venv, openshell logs, process info, and random bytes.
881+
///
882+
/// `/proc` and `/dev/urandom` are included here for the same reasons they
883+
/// appear in `restrictive_default_policy()`: virtually every process needs
884+
/// them. Before the Landlock per-path fix (#677) these were effectively free
885+
/// because a missing path silently disabled the entire ruleset; now they must
886+
/// be explicit.
887+
const PROXY_BASELINE_READ_ONLY: &[&str] = &[
888+
"/usr",
889+
"/lib",
890+
"/etc",
891+
"/app",
892+
"/var/log",
893+
"/proc",
894+
"/dev/urandom",
895+
];
882896

883897
/// Minimum read-write paths required for a proxy-mode sandbox child process:
884898
/// user working directory and temporary files.
885899
const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"];
886900

901+
/// GPU read-only paths.
902+
///
903+
/// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced
904+
/// socket at init time. If the directory exists but Landlock denies traversal
905+
/// (EACCES vs ECONNREFUSED), NVML returns `NVML_ERROR_INSUFFICIENT_PERMISSIONS`
906+
/// even though the daemon is optional. Only read/traversal access is needed.
907+
const GPU_BASELINE_READ_ONLY: &[&str] = &["/run/nvidia-persistenced"];
908+
909+
/// GPU read-write paths (static).
910+
///
911+
/// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`,
912+
/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI.
913+
/// Landlock restricts `open(2)` on device files even when DAC allows it;
914+
/// these need read-write because NVML/CUDA opens them with `O_RDWR`.
915+
///
916+
/// `/proc`: CUDA writes to `/proc/<pid>/task/<tid>/comm` during `cuInit()`
917+
/// to set thread names. Without write access, `cuInit()` returns error 304.
918+
/// Must use `/proc` (not `/proc/self/task`) because Landlock rules bind to
919+
/// inodes and child processes have different procfs inodes than the parent.
920+
///
921+
/// Per-GPU device files (`/dev/nvidia0`, …) are enumerated at runtime by
922+
/// `enumerate_gpu_device_nodes()` since the count varies.
923+
const GPU_BASELINE_READ_WRITE: &[&str] = &[
924+
"/dev/nvidiactl",
925+
"/dev/nvidia-uvm",
926+
"/dev/nvidia-uvm-tools",
927+
"/dev/nvidia-modeset",
928+
"/proc",
929+
];
930+
931+
/// Returns true if GPU devices are present in the container.
932+
fn has_gpu_devices() -> bool {
933+
std::path::Path::new("/dev/nvidiactl").exists()
934+
}
935+
936+
/// Enumerate per-GPU device nodes (`/dev/nvidia0`, `/dev/nvidia1`, …).
937+
fn enumerate_gpu_device_nodes() -> Vec<String> {
938+
let mut paths = Vec::new();
939+
if let Ok(entries) = std::fs::read_dir("/dev") {
940+
for entry in entries.flatten() {
941+
let name = entry.file_name();
942+
let name = name.to_string_lossy();
943+
if let Some(suffix) = name.strip_prefix("nvidia") {
944+
if suffix.is_empty() || !suffix.chars().all(|c| c.is_ascii_digit()) {
945+
continue;
946+
}
947+
paths.push(entry.path().to_string_lossy().into_owned());
948+
}
949+
}
950+
}
951+
paths
952+
}
953+
954+
/// Collect all baseline paths for enrichment: proxy defaults + GPU (if present).
955+
/// Returns `(read_only, read_write)` as owned `String` vecs.
956+
fn baseline_enrichment_paths() -> (Vec<String>, Vec<String>) {
957+
let mut ro: Vec<String> = PROXY_BASELINE_READ_ONLY
958+
.iter()
959+
.map(|&s| s.to_string())
960+
.collect();
961+
let mut rw: Vec<String> = PROXY_BASELINE_READ_WRITE
962+
.iter()
963+
.map(|&s| s.to_string())
964+
.collect();
965+
966+
if has_gpu_devices() {
967+
ro.extend(GPU_BASELINE_READ_ONLY.iter().map(|&s| s.to_string()));
968+
rw.extend(GPU_BASELINE_READ_WRITE.iter().map(|&s| s.to_string()));
969+
rw.extend(enumerate_gpu_device_nodes());
970+
}
971+
972+
// A path promoted to read_write (e.g. /proc for GPU) should not also
973+
// appear in read_only — Landlock handles the overlap correctly but the
974+
// duplicate is confusing when inspecting the effective policy.
975+
ro.retain(|p| !rw.contains(p));
976+
977+
(ro, rw)
978+
}
979+
887980
/// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths
888981
/// required for proxy-mode sandboxes. Paths are only added if missing;
889982
/// user-specified paths are never removed.
@@ -902,35 +995,37 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy)
902995
..Default::default()
903996
});
904997

998+
let (ro, rw) = baseline_enrichment_paths();
999+
1000+
// Baseline paths are system-injected, not user-specified. Skip paths
1001+
// that do not exist in this container image to avoid noisy warnings from
1002+
// Landlock and, more critically, to prevent a single missing baseline
1003+
// path from abandoning the entire Landlock ruleset under best-effort
1004+
// mode (see issue #664).
9051005
let mut modified = false;
906-
for &path in PROXY_BASELINE_READ_ONLY {
907-
if !fs.read_only.iter().any(|p| p.as_str() == path) {
908-
// Baseline paths are system-injected, not user-specified. Skip
909-
// paths that do not exist in this container image to avoid noisy
910-
// warnings from Landlock and, more critically, to prevent a single
911-
// missing baseline path from abandoning the entire Landlock
912-
// ruleset under best-effort mode (see issue #664).
1006+
for path in &ro {
1007+
if !fs.read_only.iter().any(|p| p == path) && !fs.read_write.iter().any(|p| p == path) {
9131008
if !std::path::Path::new(path).exists() {
9141009
debug!(
9151010
path,
9161011
"Baseline read-only path does not exist, skipping enrichment"
9171012
);
9181013
continue;
9191014
}
920-
fs.read_only.push(path.to_string());
1015+
fs.read_only.push(path.clone());
9211016
modified = true;
9221017
}
9231018
}
924-
for &path in PROXY_BASELINE_READ_WRITE {
925-
if !fs.read_write.iter().any(|p| p.as_str() == path) {
1019+
for path in &rw {
1020+
if !fs.read_write.iter().any(|p| p == path) {
9261021
if !std::path::Path::new(path).exists() {
9271022
debug!(
9281023
path,
9291024
"Baseline read-write path does not exist, skipping enrichment"
9301025
);
9311026
continue;
9321027
}
933-
fs.read_write.push(path.to_string());
1028+
fs.read_write.push(path.clone());
9341029
modified = true;
9351030
}
9361031
}
@@ -950,12 +1045,12 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) {
9501045
return;
9511046
}
9521047

1048+
let (ro, rw) = baseline_enrichment_paths();
1049+
9531050
let mut modified = false;
954-
for &path in PROXY_BASELINE_READ_ONLY {
1051+
for path in &ro {
9551052
let p = std::path::PathBuf::from(path);
956-
if !policy.filesystem.read_only.contains(&p) {
957-
// Baseline paths are system-injected — skip non-existent paths to
958-
// avoid Landlock ruleset abandonment (issue #664).
1053+
if !policy.filesystem.read_only.contains(&p) && !policy.filesystem.read_write.contains(&p) {
9591054
if !p.exists() {
9601055
debug!(
9611056
path,
@@ -967,7 +1062,7 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) {
9671062
modified = true;
9681063
}
9691064
}
970-
for &path in PROXY_BASELINE_READ_WRITE {
1065+
for path in &rw {
9711066
let p = std::path::PathBuf::from(path);
9721067
if !policy.filesystem.read_write.contains(&p) {
9731068
if !p.exists() {
@@ -987,6 +1082,75 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) {
9871082
}
9881083
}
9891084

1085+
#[cfg(test)]
1086+
mod baseline_tests {
1087+
use super::*;
1088+
1089+
#[test]
1090+
fn proc_not_in_both_read_only_and_read_write_when_gpu_present() {
1091+
// When GPU devices are present, /proc is promoted to read_write
1092+
// (CUDA needs to write /proc/<pid>/task/<tid>/comm). It should
1093+
// NOT also appear in read_only.
1094+
if !has_gpu_devices() {
1095+
// Can't test GPU dedup without GPU devices; skip silently.
1096+
return;
1097+
}
1098+
let (ro, rw) = baseline_enrichment_paths();
1099+
assert!(
1100+
rw.contains(&"/proc".to_string()),
1101+
"/proc should be in read_write when GPU is present"
1102+
);
1103+
assert!(
1104+
!ro.contains(&"/proc".to_string()),
1105+
"/proc should NOT be in read_only when it is already in read_write"
1106+
);
1107+
}
1108+
1109+
#[test]
1110+
fn proc_in_read_only_without_gpu() {
1111+
if has_gpu_devices() {
1112+
// On a GPU host we can't test the non-GPU path; skip silently.
1113+
return;
1114+
}
1115+
let (ro, _rw) = baseline_enrichment_paths();
1116+
assert!(
1117+
ro.contains(&"/proc".to_string()),
1118+
"/proc should be in read_only when GPU is not present"
1119+
);
1120+
}
1121+
1122+
#[test]
1123+
fn baseline_read_write_always_includes_sandbox_and_tmp() {
1124+
let (_ro, rw) = baseline_enrichment_paths();
1125+
assert!(rw.contains(&"/sandbox".to_string()));
1126+
assert!(rw.contains(&"/tmp".to_string()));
1127+
}
1128+
1129+
#[test]
1130+
fn enumerate_gpu_device_nodes_skips_bare_nvidia() {
1131+
// "nvidia" (without a trailing digit) is a valid /dev entry on some
1132+
// systems but is not a per-GPU device node. The enumerator must
1133+
// not match it.
1134+
let nodes = enumerate_gpu_device_nodes();
1135+
assert!(
1136+
!nodes.contains(&"/dev/nvidia".to_string()),
1137+
"bare /dev/nvidia should not be enumerated: {nodes:?}"
1138+
);
1139+
}
1140+
1141+
#[test]
1142+
fn no_duplicate_paths_in_baseline() {
1143+
let (ro, rw) = baseline_enrichment_paths();
1144+
// No path should appear in both lists.
1145+
for path in &ro {
1146+
assert!(
1147+
!rw.contains(path),
1148+
"path {path} appears in both read_only and read_write"
1149+
);
1150+
}
1151+
}
1152+
}
1153+
9901154
/// Load sandbox policy from local files or gRPC.
9911155
///
9921156
/// Priority:

e2e/python/test_inference_routing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
_BASE_FILESYSTEM = sandbox_pb2.FilesystemPolicy(
3434
include_workdir=True,
35-
read_only=["/usr", "/lib", "/etc", "/app", "/var/log"],
35+
read_only=["/usr", "/lib", "/etc", "/app", "/var/log", "/proc", "/dev/urandom"],
3636
read_write=["/sandbox", "/tmp"],
3737
)
3838
_BASE_LANDLOCK = sandbox_pb2.LandlockPolicy(compatibility="best_effort")

0 commit comments

Comments
 (0)