Update nvidia-cdi-refresh.service conditions and reintroduce restart logic

Summary of this patch:
  * CHANGELOG.md: add two entries describing the systemd unit changes.
  * deployments/systemd/nvidia-cdi-refresh.service:
      - [Unit]: rate-limit starts with StartLimitBurst=5 and
        StartLimitIntervalSec=10s.
      - [Service]: add a second ExecCondition that runs `nvidia-smi -L` from
        /usr/bin, /usr/sbin or /usr/lib/wsl/lib (first one to succeed wins).
      - [Service]: add Restart=on-failure with RestartSec=1s.

NOTE(review): per systemd.service(5), an ExecCondition= command exiting with
status 1-254 causes the unit to be skipped without being marked failed, so
Restart=on-failure will presumably not retry when `nvidia-smi -L` fails (only
when ExecStart fails) -- confirm this is the intended behaviour.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b0d16fb13..d51ee6bc2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@
 - ensure the toolkit exits if NRI Plugin init fails
 - feat(nvcdi): Allow IPC sockets to not be discovered
 - fix(cudacompat): Fix handling of CUDA compat on Orin
+- Update systemd unit conditions to execute nvidia-smi
+- Reintroduce restart logic to nvidia-cdi-refresh.service
 
 ## v1.19.0
 - Promote v1.19.0-rc.7 to v1.19.0
diff --git a/deployments/systemd/nvidia-cdi-refresh.service b/deployments/systemd/nvidia-cdi-refresh.service
index 14adb3d23..9a6e0f5b7 100644
--- a/deployments/systemd/nvidia-cdi-refresh.service
+++ b/deployments/systemd/nvidia-cdi-refresh.service
@@ -18,6 +18,9 @@ ConditionPathExists=|/usr/bin/nvidia-smi
 ConditionPathExists=|/usr/sbin/nvidia-smi
 ConditionPathExists=|/usr/lib/wsl/lib/nvidia-smi
 ConditionPathExists=/usr/bin/nvidia-ctk
+# Limit the number of successive restarts to 5 in 10 seconds.
+StartLimitBurst=5
+StartLimitIntervalSec=10s
 
 [Service]
 Type=oneshot
@@ -25,8 +28,13 @@ Type=oneshot
 Environment=NVIDIA_CTK_CDI_OUTPUT_FILE_PATH=/var/run/cdi/nvidia.yaml
 EnvironmentFile=-/etc/nvidia-container-toolkit/nvidia-cdi-refresh.env
 ExecCondition=/bin/sh -c '/usr/bin/grep -qE "/(nvidia|nvidia-current)[.]ko" /lib/modules/%v/modules.dep || [ -e /dev/dxg ]'
+ExecCondition=/bin/sh -c '/usr/bin/nvidia-smi -L || /usr/sbin/nvidia-smi -L || /usr/lib/wsl/lib/nvidia-smi -L'
 ExecStart=/usr/bin/nvidia-ctk cdi generate
 CapabilityBoundingSet=CAP_SYS_MODULE CAP_SYS_ADMIN CAP_MKNOD
+# We set the service to restart on failure to ensure that a CDI spec is
+# eventually generated.
+Restart=on-failure
+RestartSec=1s
 
 [Install]
 WantedBy=multi-user.target