From 66b50d27107dd613eea265f145a5846709829043 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Thu, 14 May 2026 12:07:59 -0700 Subject: [PATCH 1/2] Update systemd unit conditions to execute nvidia-smi This ensures that the nvidia-cdi-refresh.service does not start until after the driver is loaded and available. The added benefit of running nvidia-smi is that it will load the module (if not already loaded) and create the device nodes (if not already created). Signed-off-by: Christopher Desiniotis --- deployments/systemd/nvidia-cdi-refresh.service | 1 + 1 file changed, 1 insertion(+) diff --git a/deployments/systemd/nvidia-cdi-refresh.service b/deployments/systemd/nvidia-cdi-refresh.service index 14adb3d23..27ea78451 100644 --- a/deployments/systemd/nvidia-cdi-refresh.service +++ b/deployments/systemd/nvidia-cdi-refresh.service @@ -25,6 +25,7 @@ Type=oneshot Environment=NVIDIA_CTK_CDI_OUTPUT_FILE_PATH=/var/run/cdi/nvidia.yaml EnvironmentFile=-/etc/nvidia-container-toolkit/nvidia-cdi-refresh.env ExecCondition=/bin/sh -c '/usr/bin/grep -qE "/(nvidia|nvidia-current)[.]ko" /lib/modules/%v/modules.dep || [ -e /dev/dxg ]' +ExecCondition=/bin/sh -c '/usr/bin/nvidia-smi -L || /usr/sbin/nvidia-smi -L || /usr/lib/wsl/lib/nvidia-smi -L' ExecStart=/usr/bin/nvidia-ctk cdi generate CapabilityBoundingSet=CAP_SYS_MODULE CAP_SYS_ADMIN CAP_MKNOD From d5177c73c0956bc75d26ed80a3837bcc52d550ed Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Thu, 14 May 2026 13:09:59 -0700 Subject: [PATCH 2/2] Reintroduce restart logic to nvidia-cdi-refresh.service The restart logic was removed when a dependency on the multi-user.target was added in https://github.com/NVIDIA/nvidia-container-toolkit/commit/5fe6b42eab11f144866de07df4f755845c4b94cf. However, this change had unforeseen consequences and was later reverted in https://github.com/NVIDIA/nvidia-container-toolkit/commit/5eee5ce7b8ab19012f5aabdafd525f21b3a54496.
When reverting this change, the restart logic was not re-added to the service. Signed-off-by: Christopher Desiniotis --- deployments/systemd/nvidia-cdi-refresh.service | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/deployments/systemd/nvidia-cdi-refresh.service b/deployments/systemd/nvidia-cdi-refresh.service index 27ea78451..9a6e0f5b7 100644 --- a/deployments/systemd/nvidia-cdi-refresh.service +++ b/deployments/systemd/nvidia-cdi-refresh.service @@ -18,6 +18,9 @@ ConditionPathExists=|/usr/bin/nvidia-smi ConditionPathExists=|/usr/sbin/nvidia-smi ConditionPathExists=|/usr/lib/wsl/lib/nvidia-smi ConditionPathExists=/usr/bin/nvidia-ctk +# Limit the number of successive restarts to 5 in 10 seconds. +StartLimitBurst=5 +StartLimitIntervalSec=10s [Service] Type=oneshot @@ -28,6 +31,10 @@ ExecCondition=/bin/sh -c '/usr/bin/grep -qE "/(nvidia|nvidia-current)[.]ko" /lib ExecCondition=/bin/sh -c '/usr/bin/nvidia-smi -L || /usr/sbin/nvidia-smi -L || /usr/lib/wsl/lib/nvidia-smi -L' ExecStart=/usr/bin/nvidia-ctk cdi generate CapabilityBoundingSet=CAP_SYS_MODULE CAP_SYS_ADMIN CAP_MKNOD +# We set the service to restart on failure to ensure that a CDI spec is +# eventually generated. +Restart=on-failure +RestartSec=1s [Install] WantedBy=multi-user.target