From 7b82b6884b0055c08eb32379333f7f41b6c3710d Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Tue, 24 Mar 2026 11:34:41 +0100
Subject: [PATCH 1/4] fix(gpu): add Tegra host-files bind-mount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bind-mount /etc/nvidia-container-runtime/host-files-for-container.d
(read-only) into the gateway container when it exists, so the nvidia
runtime running inside k3s can apply the same host-file injection
config as on the host — required for Jetson/Tegra platforms.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 crates/openshell-bootstrap/src/docker.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs
index 9c365bfe..f5cdc8a5 100644
--- a/crates/openshell-bootstrap/src/docker.rs
+++ b/crates/openshell-bootstrap/src/docker.rs
@@ -559,6 +559,19 @@ pub async fn ensure_container(
         }]);
     }
 
+    // On Tegra platforms the NVIDIA container runtime (or CDI spec generation)
+    // reads host-file injection config from
+    // /etc/nvidia-container-runtime/host-files-for-container.d on the host.
+    // Bind-mount that directory (read-only) into the gateway so that the
+    // runtime or CDI spec generation running inside k3s (for sandbox pods)
+    // applies the same config.
+    const HOST_FILES_DIR: &str = "/etc/nvidia-container-runtime/host-files-for-container.d";
+    if std::path::Path::new(HOST_FILES_DIR).is_dir() {
+        let mut binds = host_config.binds.take().unwrap_or_default();
+        binds.push(format!("{HOST_FILES_DIR}:{HOST_FILES_DIR}:ro"));
+        host_config.binds = Some(binds);
+    }
+
     let mut cmd = vec![
         "server".to_string(),
         "--disable=traefik".to_string(),

From 2b0f4d75edee0f0c0745b3cad8fa6a7c24c44168 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Wed, 25 Mar 2026 19:58:42 +0100
Subject: [PATCH 2/4] fix(gpu): pin device plugin to development image

Use ghcr.io/nvidia/k8s-device-plugin:2ab68c16 which includes support for
mounting /etc/nvidia-container-runtime/host-files-for-container.d into the
device plugin pod, required for correct CDI spec generation on Tegra-based
systems.

The image also carries an nvcdi API bump that ensures additional GIDs are
included in the generated CDI spec.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
index 088562ac..c179798d 100644
--- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
+++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
@@ -27,6 +27,9 @@ spec:
   targetNamespace: nvidia-device-plugin
   createNamespace: true
   valuesContent: |-
+    image:
+      repository: ghcr.io/nvidia/k8s-device-plugin
+      tag: "2ab68c16"
     runtimeClassName: nvidia
     gfd:
       enabled: false

From 8716adc6470874a5e4afd31fa33e33921454d1aa Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Thu, 26 Mar 2026 14:30:58 +0100
Subject: [PATCH 3/4] fix(sandbox): preserve CDI-injected GIDs across privilege
 drop

initgroups(3) replaces all supplemental groups with the user's entries
from /etc/group, discarding GIDs injected by the container runtime via
CDI (e.g. GID 44/video needed for /dev/nvmap on Tegra). Snapshot the
container-level GIDs before initgroups runs and merge them back
afterwards, excluding GID 0 (root) to avoid privilege retention.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 crates/openshell-sandbox/src/process.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs
index b93d125a..9726e4b7 100644
--- a/crates/openshell-sandbox/src/process.rs
+++ b/crates/openshell-sandbox/src/process.rs
@@ -414,7 +414,26 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
             target_os = "redox"
         )))]
         {
+            // Snapshot the container-level supplemental GIDs (e.g. injected by
+            // CDI for GPU device access) before initgroups replaces them.
+            // Exclude GID 0 (root) to avoid inadvertent privilege retention.
+            let root_gid = nix::unistd::Gid::from_raw(0);
+            let container_gids: Vec<nix::unistd::Gid> = nix::unistd::getgroups()
+                .unwrap_or_default()
+                .into_iter()
+                .filter(|&g| g != root_gid)
+                .collect();
             nix::unistd::initgroups(user_cstr.as_c_str(), group.gid).into_diagnostic()?;
+            // Merge back the snapshotted GIDs that initgroups dropped (this
+            // includes any injected via CDI) so that exec'd processes retain GPU
+            // device access (e.g. /dev/nvmap on Tegra requires the video GID).
+            let mut merged: Vec<nix::unistd::Gid> = nix::unistd::getgroups().unwrap_or_default();
+            for gid in container_gids {
+                if !merged.contains(&gid) {
+                    merged.push(gid);
+                }
+            }
+            nix::unistd::setgroups(&merged).into_diagnostic()?;
         }
     }
 

From d39c1bc603c9f2f0f481144849623ad77999fe17 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Thu, 26 Mar 2026 14:29:01 +0100
Subject: [PATCH 4/4] test(e2e): fall back to /usr/sbin/nvidia-smi on Tegra

On Jetson/Tegra platforms nvidia-smi is installed at /usr/sbin/nvidia-smi
rather than /usr/bin/nvidia-smi and may not be on PATH inside the sandbox.
Fall back to the full path when the bare command exits with a non-zero
status (for example, because it is not found).

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 e2e/python/test_sandbox_gpu.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/e2e/python/test_sandbox_gpu.py b/e2e/python/test_sandbox_gpu.py
index 510b3d92..472d0e38 100644
--- a/e2e/python/test_sandbox_gpu.py
+++ b/e2e/python/test_sandbox_gpu.py
@@ -20,11 +20,13 @@ def test_gpu_sandbox_reports_available_gpu(
     sandbox: Callable[..., Sandbox],
     gpu_sandbox_spec: datamodel_pb2.SandboxSpec,
 ) -> None:
+    nvidia_smi_args = ["--query-gpu=name", "--format=csv,noheader"]
     with sandbox(spec=gpu_sandbox_spec, delete_on_exit=True) as sb:
-        result = sb.exec(
-            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
-            timeout_seconds=30,
-        )
+        result = sb.exec(["nvidia-smi", *nvidia_smi_args], timeout_seconds=30)
+        if result.exit_code != 0:
+            # On some platforms (e.g. Tegra/Jetson) nvidia-smi lives in
+            # /usr/sbin rather than /usr/bin and may not be on PATH.
+            result = sb.exec(["/usr/sbin/nvidia-smi", *nvidia_smi_args], timeout_seconds=30)
 
         assert result.exit_code == 0, result.stderr
         assert result.stdout.strip()