From b6f1fc5c4a37437d2469e541e325133a16fdb3e1 Mon Sep 17 00:00:00 2001 From: Nuno Marques Date: Tue, 12 May 2026 14:42:31 -0700 Subject: [PATCH] docker: make --nvidia actually work; clear error when toolkit is missing Three closely related changes to docker_run.sh so the --nvidia path actually renders Gazebo on the discrete GPU: 1. Always forward /dev/dri (instead of only in the non-nvidia branch). The nvidia runtime by itself only ships NVIDIA's GL stack; without /dev/dri the Mesa iris driver fails to query DRM and Ogre cannot create a GLX/EGL screen, producing MESA: error: Failed to query drm device. libGL error: glx: failed to create dri3 screen libGL error: failed to load driver: iris 2. Pass __NV_PRIME_RENDER_OFFLOAD=1 and __GLX_VENDOR_LIBRARY_NAME=nvidia so GLX is routed through NVIDIA's vendor library on hybrid laptops. With this set, glxinfo -B reports OpenGL vendor: NVIDIA Corporation OpenGL renderer: NVIDIA GeForce GTX 1070 ... 3. Pre-flight check for the 'nvidia' runtime via `docker info`. If the NVIDIA Container Toolkit is not installed/registered, the bare docker error is the cryptic unknown or invalid runtime name: nvidia Print a clear message pointing to the install guide and the `nvidia-ctk runtime configure --runtime=docker` + restart step, and mention the fallback (omit --nvidia for the integrated GPU). docs/setup.md is updated to mention the runtime registration step alongside the toolkit install. --- docker/docker_run.sh | 53 ++++++++++++++++++++++++++++++++++---------- docs/setup.md | 2 +- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/docker/docker_run.sh b/docker/docker_run.sh index da78e86..0becf8e 100755 --- a/docker/docker_run.sh +++ b/docker/docker_run.sh @@ -31,22 +31,51 @@ if [ "$NO_GUI" = false ]; then DOCKER_CMD="$DOCKER_CMD -v /tmp/.X11-unix:/tmp/.X11-unix:ro" DOCKER_CMD="$DOCKER_CMD -e DISPLAY=$DISPLAY" + # Always forward /dev/dri so Mesa has a working DRM path. 
The nvidia + # runtime by itself only provides NVIDIA's GL stack; it does not expose + # the integrated GPU. Without /dev/dri the iris/i915 Mesa driver fails + # to query DRM and Gazebo's renderer cannot create a GLX/EGL screen. + DOCKER_CMD="$DOCKER_CMD --device /dev/dri:/dev/dri" + # /dev/dri/* on the host is mode 660, owned by host groups (typically + # 'video' for card* and 'render' for renderD*). The container's + # 'ubuntu' user is not in those groups by default, so EGL/Vulkan/DRI + # would fall back with "Permission denied". Pass the host GIDs so the + # container user can open the GPU device nodes. + DRI_GIDS=$(stat -c %g /dev/dri/card* /dev/dri/renderD* 2>/dev/null | sort -u) + for gid in $DRI_GIDS; do + DOCKER_CMD="$DOCKER_CMD --group-add $gid" + done + # Add nvidia runtime if --nvidia is specified if [ "$NVIDIA" = true ]; then + # Check the nvidia runtime is actually registered with the Docker + # daemon. Without nvidia-container-toolkit installed and configured, + # `--runtime nvidia` fails with the unhelpful + # "unknown or invalid runtime name: nvidia" + if ! docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q '"nvidia"'; then + echo "Error: --nvidia requested but the 'nvidia' Docker runtime is not registered." >&2 + echo "" >&2 + echo "Install the NVIDIA Container Toolkit on the host:" >&2 + echo " https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html" >&2 + echo "" >&2 + echo "After installing, run:" >&2 + echo " sudo nvidia-ctk runtime configure --runtime=docker" >&2 + echo " sudo systemctl restart docker" >&2 + echo "" >&2 + echo "Or, if you don't need the NVIDIA GPU, omit --nvidia and the script" >&2 + echo "will fall back to the integrated GPU via /dev/dri." 
>&2 + exit 1 + fi DOCKER_CMD="$DOCKER_CMD --runtime nvidia" DOCKER_CMD="$DOCKER_CMD -e NVIDIA_VISIBLE_DEVICES=all" DOCKER_CMD="$DOCKER_CMD -e NVIDIA_DRIVER_CAPABILITIES=all" - else - DOCKER_CMD="$DOCKER_CMD --device /dev/dri:/dev/dri" - # /dev/dri/* on the host is mode 660, owned by host groups (typically - # 'video' for card* and 'render' for renderD*). The container's - # 'ubuntu' user is not in those groups, so EGL/Vulkan/DRI fall back - # with "Permission denied". Pass the host GIDs so the container user - # can access the GPU device nodes. - DRI_GIDS=$(stat -c %g /dev/dri/card* /dev/dri/renderD* 2>/dev/null | sort -u) - for gid in $DRI_GIDS; do - DOCKER_CMD="$DOCKER_CMD --group-add $gid" - done + # Route GLX through NVIDIA's vendor library so Gazebo's Ogre + # renderer actually uses the discrete GPU instead of trying the + # Mesa iris driver first (which causes "failed to create dri3 + # screen" / "failed to load driver: iris" warnings on hybrid + # laptops where both the integrated GPU and the dGPU are visible). + DOCKER_CMD="$DOCKER_CMD -e __NV_PRIME_RENDER_OFFLOAD=1" + DOCKER_CMD="$DOCKER_CMD -e __GLX_VENDOR_LIBRARY_NAME=nvidia" fi fi @@ -59,4 +88,4 @@ DOCKER_CMD="$DOCKER_CMD -w /home/ubuntu/ossna-26-workshop_ws" DOCKER_CMD="$DOCKER_CMD dronecode/ossna-26-workshop bash" # Execute the command -eval $DOCKER_CMD \ No newline at end of file +eval $DOCKER_CMD diff --git a/docs/setup.md b/docs/setup.md index 27abfc0..e55ceee 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -91,7 +91,7 @@ You can use also use two options: - `--no-gui` to disable GUI in the container. This option also forwards port `18570` to allow external (Host) QGC connection. -- `--nvidia` to run the container with the `nvidia` runtime (it requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed on the host). 
+- `--nvidia` to run the container with the `nvidia` runtime (it requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed and registered with Docker on the host; see `sudo nvidia-ctk runtime configure --runtime=docker` followed by `sudo systemctl restart docker`). When using this method you can attach new shell to your container by running