From c29c81ffbd5cd76ee06f208741b6605d6e08ca82 Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Fri, 15 May 2026 08:53:18 +0200
Subject: [PATCH] Fix nvidia-cdi-refresh systemd packaging

Signed-off-by: Evan Lezar
---
Review notes (this section sits below the "---" separator, so it is not part
of the commit message):

* deployments/systemd: adds a preset file with "enable" entries for
  nvidia-cdi-refresh.path and nvidia-cdi-refresh.service.
* docker/: the openSUSE build image additionally installs systemd-rpm-macros
  and now copies deployments/systemd/ into ${DIST_DIR}/ instead of the
  working directory; the yum build image additionally installs systemd, plus
  systemd-rpm-macros when "yum info" can find that package.
* packaging/debian: the two unit files are installed under
  /lib/systemd/system instead of /etc/systemd/system; a new maintscript
  carries rm_conffile entries for the old /etc paths (debian/prepare now
  substitutes @VERSION@ in that file as well); the hand-written systemctl
  block is removed from postinst; dh is invoked with "--with systemd" and the
  debhelper build dependency is raised to >= 9.20160709.
  NOTE(review): presumably the debhelper systemd addon now generates the
  enable/start handling that the removed postinst block performed -- confirm
  against the maintainer scripts of a built package.
* packaging/rpm: the unit files are installed to %{_unitdir} and the preset to
  %{_presetdir}; the scriptlet that generates the default config now calls
  %systemd_post and then makes best-effort "systemctl start" calls whose
  output and exit status are discarded; %preun/%postun scriptlets for "base"
  call %systemd_preun/%systemd_postun.
  NOTE(review): the removed scriptlet code ran "systemctl daemon-reload"
  before touching the units and the replacement does not -- confirm that an
  upgrade from a release that shipped the units under /etc/systemd/system
  picks up the relocated unit files.
* tests/e2e: install and purge are factored into the installToolkit and
  purgeToolkit closures; the outer BeforeAll removes /usr/sbin/policy-rc.d;
  "systemctl is-enabled" checks are added for both units; the context that
  asserts a "degraded" system state now installs and purges the toolkit
  itself; the last hunk replaces the "should generate the nvidia.yaml file"
  spec with one that runs nvidiaCdiRefreshUpgradeTemplate.

 .../90-nvidia-container-toolkit.preset        |  2 +
 docker/Dockerfile.opensuse-leap               |  5 +-
 docker/Dockerfile.rpm-yum                     |  6 +-
 packaging/debian/control                      |  3 +-
 .../nvidia-container-toolkit-base.install     |  4 +-
 .../nvidia-container-toolkit-base.maintscript |  2 +
 .../nvidia-container-toolkit-base.postinst    | 17 -----
 packaging/debian/prepare                      |  2 +
 packaging/debian/rules                        |  6 +-
 .../rpm/SPECS/nvidia-container-toolkit.spec   | 47 ++++++++------
 tests/e2e/nvidia-cdi-refresh_test.go          | 63 +++++++++++++++----
 11 files changed, 101 insertions(+), 56 deletions(-)
 create mode 100644 deployments/systemd/90-nvidia-container-toolkit.preset
 create mode 100644 packaging/debian/nvidia-container-toolkit-base.maintscript

diff --git a/deployments/systemd/90-nvidia-container-toolkit.preset b/deployments/systemd/90-nvidia-container-toolkit.preset
new file mode 100644
index 000000000..17f336b5f
--- /dev/null
+++ b/deployments/systemd/90-nvidia-container-toolkit.preset
@@ -0,0 +1,2 @@
+enable nvidia-cdi-refresh.path
+enable nvidia-cdi-refresh.service
diff --git a/docker/Dockerfile.opensuse-leap b/docker/Dockerfile.opensuse-leap
index c545f3809..945155dc1 100644
--- a/docker/Dockerfile.opensuse-leap
+++ b/docker/Dockerfile.opensuse-leap
@@ -5,7 +5,8 @@ RUN zypper install -y \
         ca-certificates \
         wget \
         git \
-        rpm-build && \
+        rpm-build \
+        systemd-rpm-macros && \
     rm -rf /var/cache/zypp/*
 
 ARG GOLANG_VERSION=0.0.0
@@ -46,7 +47,7 @@ RUN make PREFIX=${DIST_DIR} cmds
 WORKDIR $DIST_DIR/..
 COPY packaging/rpm .
 
-COPY deployments/systemd/ .
+COPY deployments/systemd/ ${DIST_DIR}/
 
 CMD arch=$(uname -m) && \
     rpmbuild --clean --target=$arch -bb \
diff --git a/docker/Dockerfile.rpm-yum b/docker/Dockerfile.rpm-yum
index 8df1d667b..6a54355c6 100644
--- a/docker/Dockerfile.rpm-yum
+++ b/docker/Dockerfile.rpm-yum
@@ -30,7 +30,11 @@ RUN yum install -y \
         wget \
         git \
         make \
-        rpm-build && \
+        rpm-build \
+        systemd && \
+    if yum info systemd-rpm-macros >/dev/null 2>&1; then \
+        yum install -y systemd-rpm-macros; \
+    fi && \
     rm -rf /var/cache/yum/*
 
 ARG GOLANG_VERSION=0.0.0
diff --git a/packaging/debian/control b/packaging/debian/control
index 88ad8a9e7..3c3c1756e 100644
--- a/packaging/debian/control
+++ b/packaging/debian/control
@@ -6,7 +6,7 @@ Standards-Version: 3.9.8
 Homepage: https://github.com/NVIDIA/nvidia-container-toolkit
 Vcs-Git: https://github.com/NVIDIA/nvidia-container-toolkit
 Vcs-Browser: https://github.com/NVIDIA/nvidia-container-toolkit
-Build-Depends: debhelper (>= 9)
+Build-Depends: debhelper (>= 9.20160709)
 
 Package: nvidia-container-toolkit
 Architecture: any
@@ -18,6 +18,7 @@ Description: NVIDIA Container toolkit
 
 Package: nvidia-container-toolkit-base
 Architecture: any
+Pre-Depends: ${misc:Pre-Depends}
 Depends: ${misc:Depends}
 Breaks: nvidia-container-runtime (<= 3.5.0-1), nvidia-container-runtime-hook, nvidia-container-toolkit (<= 1.10.0-1)
 Replaces: nvidia-container-runtime (<= 3.5.0-1), nvidia-container-runtime-hook
diff --git a/packaging/debian/nvidia-container-toolkit-base.install b/packaging/debian/nvidia-container-toolkit-base.install
index 58cdafb64..09a3eb8b4 100644
--- a/packaging/debian/nvidia-container-toolkit-base.install
+++ b/packaging/debian/nvidia-container-toolkit-base.install
@@ -1,6 +1,6 @@
 nvidia-container-runtime /usr/bin
 nvidia-ctk /usr/bin
 nvidia-cdi-hook /usr/bin
-nvidia-cdi-refresh.service /etc/systemd/system/
-nvidia-cdi-refresh.path /etc/systemd/system/
+nvidia-cdi-refresh.service /lib/systemd/system/
+nvidia-cdi-refresh.path /lib/systemd/system/
 nvidia-cdi-refresh.env /etc/nvidia-container-toolkit/
diff --git a/packaging/debian/nvidia-container-toolkit-base.maintscript b/packaging/debian/nvidia-container-toolkit-base.maintscript
new file mode 100644
index 000000000..c38b494e6
--- /dev/null
+++ b/packaging/debian/nvidia-container-toolkit-base.maintscript
@@ -0,0 +1,2 @@
+rm_conffile /etc/systemd/system/nvidia-cdi-refresh.service @VERSION@~ nvidia-container-toolkit-base
+rm_conffile /etc/systemd/system/nvidia-cdi-refresh.path @VERSION@~ nvidia-container-toolkit-base
diff --git a/packaging/debian/nvidia-container-toolkit-base.postinst b/packaging/debian/nvidia-container-toolkit-base.postinst
index 953b78b83..7ee72e46f 100644
--- a/packaging/debian/nvidia-container-toolkit-base.postinst
+++ b/packaging/debian/nvidia-container-toolkit-base.postinst
@@ -5,23 +5,6 @@ set -e
 case "$1" in
     configure)
         /usr/bin/nvidia-ctk --quiet config --config-file=/etc/nvidia-container-runtime/config.toml --in-place
-
-        # Enable nvidia-cdi-refresh services on both install and upgrade
-        # Support running and degraded systemd states
-        if command -v systemctl >/dev/null 2>&1; then
-            SYSTEMD_STATE=$(systemctl is-system-running 2>/dev/null || true)
-            case "$SYSTEMD_STATE" in
-                running|degraded)
-                    systemctl daemon-reload || echo "Warning: Failed to reload systemd daemon" >&2
-                    systemctl enable --now nvidia-cdi-refresh.path || echo "Warning: Failed to enable nvidia-cdi-refresh.path" >&2
-                    systemctl enable --now nvidia-cdi-refresh.service || echo "Warning: Failed to enable nvidia-cdi-refresh.service" >&2
-
-                    # Trigger CDI spec regeneration immediately after install/upgrade
-                    echo "Regenerating NVIDIA CDI specification..."
-                    systemctl start nvidia-cdi-refresh.service || echo "Warning: Failed to trigger CDI refresh" >&2
-                    ;;
-            esac
-        fi
         ;;
 
     abort-upgrade|abort-remove|abort-deconfigure)
diff --git a/packaging/debian/prepare b/packaging/debian/prepare
index 542ff830b..4b01c8b7a 100755
--- a/packaging/debian/prepare
+++ b/packaging/debian/prepare
@@ -5,6 +5,8 @@ set -e
 sed -i "s;@SECTION@;${SECTION:+$SECTION/};g" debian/control
 sed -i "s;@VERSION@;${VERSION:+$VERSION};g" debian/control
 
+sed -i "s;@VERSION@;${VERSION:+$VERSION};g" debian/nvidia-container-toolkit-base.maintscript
+
 if [ -n "$DISTRIB" ]; then
     sed -i "s;UNRELEASED;$DISTRIB;" debian/changelog
 fi
diff --git a/packaging/debian/rules b/packaging/debian/rules
index d57bd1d43..dd5e0966b 100755
--- a/packaging/debian/rules
+++ b/packaging/debian/rules
@@ -4,7 +4,7 @@
 #export DH_VERBOSE=1
 
 %:
-	dh $@
+	dh $@ --with systemd
 
 override_dh_fixperms:
 	dh_fixperms
@@ -14,5 +14,5 @@ override_dh_fixperms:
 	chmod 755 debian/$(shell dh_listpackages)/usr/bin/nvidia-container-runtime.legacy || true
 	chmod 755 debian/$(shell dh_listpackages)/usr/bin/nvidia-ctk || true
 	chmod 755 debian/$(shell dh_listpackages)/usr/bin/nvidia-cdi-hook || true
-	chmod 644 debian/$(shell dh_listpackages)/etc/systemd/system/nvidia-cdi-refresh.service || true
-	chmod 644 debian/$(shell dh_listpackages)/etc/systemd/system/nvidia-cdi-refresh.path || true
+	chmod 644 debian/$(shell dh_listpackages)/lib/systemd/system/nvidia-cdi-refresh.service || true
+	chmod 644 debian/$(shell dh_listpackages)/lib/systemd/system/nvidia-cdi-refresh.path || true
diff --git a/packaging/rpm/SPECS/nvidia-container-toolkit.spec b/packaging/rpm/SPECS/nvidia-container-toolkit.spec
index 8d726f34e..9222d7034 100644
--- a/packaging/rpm/SPECS/nvidia-container-toolkit.spec
+++ b/packaging/rpm/SPECS/nvidia-container-toolkit.spec
@@ -20,6 +20,13 @@ Source6: nvidia-cdi-hook
 Source7: nvidia-cdi-refresh.service
 Source8: nvidia-cdi-refresh.path
 Source9: nvidia-cdi-refresh.env
+Source10: 90-nvidia-container-toolkit.preset
+
+%if 0%{?rhel} == 7 || 0%{?amzn} == 2
+BuildRequires: systemd
+%else
+BuildRequires: systemd-rpm-macros
+%endif
 
 Obsoletes: nvidia-container-runtime <= 3.5.0-1, nvidia-container-runtime-hook <= 1.4.0-2
 Provides: nvidia-container-runtime
@@ -31,11 +38,12 @@ Requires: nvidia-container-toolkit-base == %{version}-%{release}
 Provides tools and utilities to enable GPU support in containers.
 
 %prep
-cp %{SOURCE0} %{SOURCE1} %{SOURCE2} %{SOURCE3} %{SOURCE4} %{SOURCE5} %{SOURCE6} %{SOURCE7} %{SOURCE8} %{SOURCE9} .
+cp %{SOURCE0} %{SOURCE1} %{SOURCE2} %{SOURCE3} %{SOURCE4} %{SOURCE5} %{SOURCE6} %{SOURCE7} %{SOURCE8} %{SOURCE9} %{SOURCE10} .
 
 %install
 mkdir -p %{buildroot}%{_bindir}
-mkdir -p %{buildroot}%{_sysconfdir}/systemd/system/
+mkdir -p %{buildroot}%{_unitdir}
+mkdir -p %{buildroot}%{_presetdir}
 mkdir -p %{buildroot}%{_sysconfdir}/nvidia-container-toolkit
 
 install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime-hook
@@ -44,8 +52,9 @@ install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime.cdi
 install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime.legacy
 install -m 755 -t %{buildroot}%{_bindir} nvidia-ctk
 install -m 755 -t %{buildroot}%{_bindir} nvidia-cdi-hook
-install -m 644 -t %{buildroot}%{_sysconfdir}/systemd/system nvidia-cdi-refresh.service
-install -m 644 -t %{buildroot}%{_sysconfdir}/systemd/system nvidia-cdi-refresh.path
+install -m 644 -t %{buildroot}%{_unitdir} nvidia-cdi-refresh.service
+install -m 644 -t %{buildroot}%{_unitdir} nvidia-cdi-refresh.path
+install -m 644 -t %{buildroot}%{_presetdir} 90-nvidia-container-toolkit.preset
 install -m 644 -t %{buildroot}%{_sysconfdir}/nvidia-container-toolkit nvidia-cdi-refresh.env
 
 %post
@@ -94,29 +103,29 @@ Provides tools such as the NVIDIA Container Runtime and NVIDIA Container Toolkit
 # Generate the default config; If this file already exists no changes are made.
 %{_bindir}/nvidia-ctk --quiet config --config-file=%{_sysconfdir}/nvidia-container-runtime/config.toml --in-place
 
-# Reload systemd unit cache and enable nvidia-cdi-refresh services on both install and upgrade
+%systemd_post nvidia-cdi-refresh.path nvidia-cdi-refresh.service
+
+# Trigger CDI refresh on running systemd hosts without making install depend on
+# the current system state.
 if command -v systemctl >/dev/null 2>&1; then
-  SYSTEMD_STATE=$(systemctl is-system-running 2>/dev/null || true)
-  case "$SYSTEMD_STATE" in
-    running|degraded)
-      systemctl daemon-reload || echo "Warning: Failed to reload systemd daemon" >&2
-      systemctl enable --now nvidia-cdi-refresh.path || echo "Warning: Failed to enable nvidia-cdi-refresh.path" >&2
-      systemctl enable --now nvidia-cdi-refresh.service || echo "Warning: Failed to enable nvidia-cdi-refresh.service" >&2
-
-      # Trigger CDI spec regeneration immediately after install/upgrade
-      echo "Regenerating NVIDIA CDI specification..."
-      systemctl start nvidia-cdi-refresh.service || echo "Warning: Failed to trigger CDI refresh" >&2
-      ;;
-  esac
+  systemctl start nvidia-cdi-refresh.path >/dev/null 2>&1 || :
+  systemctl start nvidia-cdi-refresh.service >/dev/null 2>&1 || :
 fi
 
+%preun base
+%systemd_preun nvidia-cdi-refresh.path nvidia-cdi-refresh.service
+
+%postun base
+%systemd_postun nvidia-cdi-refresh.path nvidia-cdi-refresh.service
+
 %files base
 %license LICENSE
 %{_bindir}/nvidia-container-runtime
 %{_bindir}/nvidia-ctk
 %{_bindir}/nvidia-cdi-hook
-%{_sysconfdir}/systemd/system/nvidia-cdi-refresh.service
-%{_sysconfdir}/systemd/system/nvidia-cdi-refresh.path
+%{_unitdir}/nvidia-cdi-refresh.service
+%{_unitdir}/nvidia-cdi-refresh.path
+%{_presetdir}/90-nvidia-container-toolkit.preset
 %config(noreplace) %{_sysconfdir}/nvidia-container-toolkit/nvidia-cdi-refresh.env
 
 # The OPERATOR EXTENSIONS package consists of components that are required to enable GPU support in Kubernetes.
diff --git a/tests/e2e/nvidia-cdi-refresh_test.go b/tests/e2e/nvidia-cdi-refresh_test.go
index 88e435f24..21265dedf 100644
--- a/tests/e2e/nvidia-cdi-refresh_test.go
+++ b/tests/e2e/nvidia-cdi-refresh_test.go
@@ -62,6 +62,13 @@ EOF
 	rm -rf /etc/systemd/system/dummy.service
 	systemctl daemon-reload
 	`
+	disablePolicyRcDScript = `#!/usr/bin/env bash
+	# The kindest/base image carries a Docker-oriented policy-rc.d that blocks
+	# deb-systemd-invoke from starting units during package installation. These
+	# tests run a real systemd instance, so package-triggered unit starts should
+	# be allowed.
+	rm -f /usr/sbin/policy-rc.d
+	`
 
 	nvidiaCdiRefreshPathActiveTemplate = `
 	if ! systemctl status nvidia-cdi-refresh.path | grep "Active: active"; then
@@ -69,6 +76,16 @@ EOF
 		exit 1
 	fi
 	`
+	nvidiaCdiRefreshUnitsEnabledTemplate = `
+	if ! systemctl is-enabled --quiet nvidia-cdi-refresh.path; then
+		echo "nvidia-cdi-refresh.path is not enabled"
+		exit 1
+	fi
+	if ! systemctl is-enabled --quiet nvidia-cdi-refresh.service; then
+		echo "nvidia-cdi-refresh.service is not enabled"
+		exit 1
+	fi
+	`
 	nvidiaCdiRefreshServiceLoadedTemplate = `
 	if ! systemctl status nvidia-cdi-refresh.service | grep "Loaded: loaded"; then
 		echo "nvidia-cdi-refresh.service is not loaded"
@@ -130,6 +147,20 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
 		outerContainerImage = "docker.io/kindest/base:v20250521-31a79fd4"
 	)
 
+	installToolkit := func() {
+		_, _, err := toolkitInstaller.Install(systemdRunner)
+		Expect(err).ToNot(HaveOccurred())
+
+		output, _, err := systemdRunner.Run("nvidia-ctk --version")
+		Expect(err).ToNot(HaveOccurred())
+		GinkgoLogr.Info("using nvidia-ctk", "version", strings.TrimSpace(output))
+	}
+
+	purgeToolkit := func() {
+		_, _, err := systemdRunner.Run("apt-get purge -y libnvidia-container* nvidia-container-toolkit*")
+		Expect(err).ToNot(HaveOccurred())
+	}
+
 	BeforeAll(func(ctx context.Context) {
 		var err error
 		systemdRunner, err = NewNestedContainerRunner(runner, outerContainerImage, false, containerName, localCacheDir, true)
@@ -143,6 +174,9 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
 			GinkgoLogr.Error(err, "systemctl state")
 			time.Sleep(1 * time.Second)
 		}
+
+		_, _, err = systemdRunner.Run(disablePolicyRcDScript)
+		Expect(err).ToNot(HaveOccurred())
 	})
 
 	AfterAll(func(ctx context.Context) {
@@ -153,18 +187,11 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
 
 	When("installing nvidia-container-toolkit", Ordered, func() {
 		BeforeAll(func(ctx context.Context) {
-
-			_, _, err := toolkitInstaller.Install(systemdRunner)
-			Expect(err).ToNot(HaveOccurred())
-
-			output, _, err := systemdRunner.Run("nvidia-ctk --version")
-			Expect(err).ToNot(HaveOccurred())
-			GinkgoLogr.Info("using nvidia-ctk", "version", strings.TrimSpace(output))
+			installToolkit()
 		})
 
 		AfterAll(func(ctx context.Context) {
-			_, _, err := systemdRunner.Run("apt-get purge -y libnvidia-container* nvidia-container-toolkit*")
-			Expect(err).ToNot(HaveOccurred())
+			purgeToolkit()
 		})
 
 		It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
@@ -172,6 +199,11 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
 			Expect(err).ToNot(HaveOccurred())
 		})
 
+		It("should enable the nvidia-cdi-refresh units", func(ctx context.Context) {
+			_, _, err := systemdRunner.Run(nvidiaCdiRefreshUnitsEnabledTemplate)
+			Expect(err).ToNot(HaveOccurred())
+		})
+
 		It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
 			_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
 			Expect(err).ToNot(HaveOccurred())
@@ -196,9 +228,13 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
 			_, _, err = systemdRunner.Run(getSystemStateScript)
 			Expect(err).To(HaveOccurred())
 			Expect(err.Error()).To(ContainSubstring("degraded"))
+
+			installToolkit()
 		})
 
 		AfterAll(func(ctx context.Context) {
+			purgeToolkit()
+
 			_, _, err := systemdRunner.Run(fixSystemDegradedScript)
 			Expect(err).ToNot(HaveOccurred())
 
@@ -212,6 +248,11 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
 			Expect(err).ToNot(HaveOccurred())
 		})
 
+		It("should enable the nvidia-cdi-refresh units", func(ctx context.Context) {
+			_, _, err := systemdRunner.Run(nvidiaCdiRefreshUnitsEnabledTemplate)
+			Expect(err).ToNot(HaveOccurred())
+		})
+
 		It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
 			_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
 			Expect(err).ToNot(HaveOccurred())
@@ -222,8 +263,8 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
 			Expect(err).ToNot(HaveOccurred())
 		})
 
-		It("should generate the nvidia.yaml file", func(ctx context.Context) {
-			_, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
+		It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
+			_, _, err := systemdRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
 			Expect(err).ToNot(HaveOccurred())
 		})
 	})