From b6f1fc5c4a37437d2469e541e325133a16fdb3e1 Mon Sep 17 00:00:00 2001
From: Nuno Marques <nuno.msm1@gmail.com>
Date: Tue, 12 May 2026 14:42:31 -0700
Subject: [PATCH] docker: make --nvidia actually work; clear error when toolkit
 is missing

Three closely related changes to docker_run.sh so the --nvidia path
actually renders Gazebo on the discrete GPU:

1. Always forward /dev/dri (instead of only in the non-nvidia branch).
   The nvidia runtime by itself only ships NVIDIA's GL stack; without
   /dev/dri the Mesa iris driver fails to query DRM and Ogre cannot
   create a GLX/EGL screen, producing
       MESA: error: Failed to query drm device.
       libGL error: glx: failed to create dri3 screen
       libGL error: failed to load driver: iris

2. Pass __NV_PRIME_RENDER_OFFLOAD=1 and __GLX_VENDOR_LIBRARY_NAME=nvidia
   so GLX is routed through NVIDIA's vendor library on hybrid laptops.
   With this set, glxinfo -B reports
       OpenGL vendor:   NVIDIA Corporation
       OpenGL renderer: NVIDIA GeForce GTX 1070 ...

3. Pre-flight check for the 'nvidia' runtime via `docker info`. If
   the NVIDIA Container Toolkit is not installed/registered, the
   bare docker error is the cryptic
       unknown or invalid runtime name: nvidia
   Print a clear message pointing to the install guide and the
   `nvidia-ctk runtime configure --runtime=docker` + restart step,
   and mention the fallback (omit --nvidia for the integrated GPU).

docs/setup.md is updated to mention the runtime registration step
alongside the toolkit install.
---
 docker/docker_run.sh | 53 ++++++++++++++++++++++++++++++++++----------
 docs/setup.md        |  2 +-
 2 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/docker/docker_run.sh b/docker/docker_run.sh
index da78e86..0becf8e 100755
--- a/docker/docker_run.sh
+++ b/docker/docker_run.sh
@@ -31,22 +31,51 @@ if [ "$NO_GUI" = false ]; then
     DOCKER_CMD="$DOCKER_CMD -v /tmp/.X11-unix:/tmp/.X11-unix:ro"
     DOCKER_CMD="$DOCKER_CMD -e DISPLAY=$DISPLAY"
 
+    # Always forward /dev/dri so Mesa has a working DRM path. The nvidia
+    # runtime by itself only provides NVIDIA's GL stack; it does not expose
+    # the integrated GPU. Without /dev/dri the iris/i915 Mesa driver fails
+    # to query DRM and Gazebo's renderer cannot create a GLX/EGL screen.
+    DOCKER_CMD="$DOCKER_CMD --device /dev/dri:/dev/dri"
+    # /dev/dri/* on the host is mode 660, owned by host groups (typically
+    # 'video' for card* and 'render' for renderD*). The container's
+    # 'ubuntu' user is not in those groups by default, so EGL/Vulkan/DRI
+    # would fall back with "Permission denied". Pass the host GIDs so the
+    # container user can open the GPU device nodes.
+    DRI_GIDS=$(stat -c %g /dev/dri/card* /dev/dri/renderD* 2>/dev/null | sort -u)
+    for gid in $DRI_GIDS; do
+        DOCKER_CMD="$DOCKER_CMD --group-add $gid"
+    done
+
     # Add nvidia runtime if --nvidia is specified
     if [ "$NVIDIA" = true ]; then
+        # Check the nvidia runtime is actually registered with the Docker
+        # daemon. Without nvidia-container-toolkit installed and configured,
+        # `--runtime nvidia` fails with the unhelpful
+        #     "unknown or invalid runtime name: nvidia"
+        if ! docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q '"nvidia"'; then
+            echo "Error: --nvidia requested but the 'nvidia' Docker runtime is not registered." >&2
+            echo "" >&2
+            echo "Install the NVIDIA Container Toolkit on the host:" >&2
+            echo "  https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html" >&2
+            echo "" >&2
+            echo "After installing, run:" >&2
+            echo "  sudo nvidia-ctk runtime configure --runtime=docker" >&2
+            echo "  sudo systemctl restart docker" >&2
+            echo "" >&2
+            echo "Or, if you don't need the NVIDIA GPU, omit --nvidia and the script" >&2
+            echo "will fall back to the integrated GPU via /dev/dri." >&2
+            exit 1
+        fi
         DOCKER_CMD="$DOCKER_CMD --runtime nvidia"
         DOCKER_CMD="$DOCKER_CMD -e NVIDIA_VISIBLE_DEVICES=all"
         DOCKER_CMD="$DOCKER_CMD -e NVIDIA_DRIVER_CAPABILITIES=all"
-    else
-        DOCKER_CMD="$DOCKER_CMD --device /dev/dri:/dev/dri"
-        # /dev/dri/* on the host is mode 660, owned by host groups (typically
-        # 'video' for card* and 'render' for renderD*). The container's
-        # 'ubuntu' user is not in those groups, so EGL/Vulkan/DRI fall back
-        # with "Permission denied". Pass the host GIDs so the container user
-        # can access the GPU device nodes.
-        DRI_GIDS=$(stat -c %g /dev/dri/card* /dev/dri/renderD* 2>/dev/null | sort -u)
-        for gid in $DRI_GIDS; do
-            DOCKER_CMD="$DOCKER_CMD --group-add $gid"
-        done
+        # Route GLX through NVIDIA's vendor library so Gazebo's Ogre
+        # renderer actually uses the discrete GPU instead of trying the
+        # Mesa iris driver first (which causes "failed to create dri3
+        # screen" / "failed to load driver: iris" warnings on hybrid
+        # laptops where both the integrated GPU and the dGPU are visible).
+        DOCKER_CMD="$DOCKER_CMD -e __NV_PRIME_RENDER_OFFLOAD=1"
+        DOCKER_CMD="$DOCKER_CMD -e __GLX_VENDOR_LIBRARY_NAME=nvidia"
     fi
 fi
 
@@ -59,4 +88,4 @@ DOCKER_CMD="$DOCKER_CMD -w /home/ubuntu/ossna-26-workshop_ws"
 DOCKER_CMD="$DOCKER_CMD dronecode/ossna-26-workshop bash"
 
 # Execute the command
-eval $DOCKER_CMD
\ No newline at end of file
+eval $DOCKER_CMD
diff --git a/docs/setup.md b/docs/setup.md
index 27abfc0..e55ceee 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -91,7 +91,7 @@ You can use also use two options:
 
 - `--no-gui` to disable GUI in the container.
 This option also forwards port `18570` to allow external (Host) QGC connection.
-- `--nvidia` to run the container with the `nvidia` runtime (it requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed on the host).
+- `--nvidia` to run the container with the `nvidia` runtime (it requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed and registered with Docker on the host; run `sudo nvidia-ctk runtime configure --runtime=docker` followed by `sudo systemctl restart docker`).
 
 When using this method you can attach new shell to your container by running