Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions deployments/systemd/90-nvidia-container-toolkit.preset
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
enable nvidia-cdi-refresh.path
enable nvidia-cdi-refresh.service
5 changes: 3 additions & 2 deletions docker/Dockerfile.opensuse-leap
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ RUN zypper install -y \
ca-certificates \
wget \
git \
rpm-build && \
rpm-build \
systemd-rpm-macros && \
rm -rf /var/cache/zypp/*

ARG GOLANG_VERSION=0.0.0
Expand Down Expand Up @@ -46,7 +47,7 @@ RUN make PREFIX=${DIST_DIR} cmds

WORKDIR $DIST_DIR/..
COPY packaging/rpm .
COPY deployments/systemd/ .
COPY deployments/systemd/ ${DIST_DIR}/

CMD arch=$(uname -m) && \
rpmbuild --clean --target=$arch -bb \
Expand Down
6 changes: 5 additions & 1 deletion docker/Dockerfile.rpm-yum
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ RUN yum install -y \
wget \
git \
make \
rpm-build && \
rpm-build \
systemd && \
if yum info systemd-rpm-macros >/dev/null 2>&1; then \
yum install -y systemd-rpm-macros; \
fi && \
rm -rf /var/cache/yum/*

ARG GOLANG_VERSION=0.0.0
Expand Down
3 changes: 2 additions & 1 deletion packaging/debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Standards-Version: 3.9.8
Homepage: https://github.com/NVIDIA/nvidia-container-toolkit
Vcs-Git: https://github.com/NVIDIA/nvidia-container-toolkit
Vcs-Browser: https://github.com/NVIDIA/nvidia-container-toolkit
Build-Depends: debhelper (>= 9)
Build-Depends: debhelper (>= 9.20160709)

Package: nvidia-container-toolkit
Architecture: any
Expand All @@ -18,6 +18,7 @@ Description: NVIDIA Container toolkit

Package: nvidia-container-toolkit-base
Architecture: any
Pre-Depends: ${misc:Pre-Depends}
Depends: ${misc:Depends}
Breaks: nvidia-container-runtime (<= 3.5.0-1), nvidia-container-runtime-hook, nvidia-container-toolkit (<= 1.10.0-1)
Replaces: nvidia-container-runtime (<= 3.5.0-1), nvidia-container-runtime-hook
Expand Down
4 changes: 2 additions & 2 deletions packaging/debian/nvidia-container-toolkit-base.install
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
nvidia-container-runtime /usr/bin
nvidia-ctk /usr/bin
nvidia-cdi-hook /usr/bin
nvidia-cdi-refresh.service /etc/systemd/system/
nvidia-cdi-refresh.path /etc/systemd/system/
nvidia-cdi-refresh.service /lib/systemd/system/
nvidia-cdi-refresh.path /lib/systemd/system/
nvidia-cdi-refresh.env /etc/nvidia-container-toolkit/
2 changes: 2 additions & 0 deletions packaging/debian/nvidia-container-toolkit-base.maintscript
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
rm_conffile /etc/systemd/system/nvidia-cdi-refresh.service @VERSION@~ nvidia-container-toolkit-base
rm_conffile /etc/systemd/system/nvidia-cdi-refresh.path @VERSION@~ nvidia-container-toolkit-base
17 changes: 0 additions & 17 deletions packaging/debian/nvidia-container-toolkit-base.postinst
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,6 @@ set -e
case "$1" in
configure)
/usr/bin/nvidia-ctk --quiet config --config-file=/etc/nvidia-container-runtime/config.toml --in-place

# Enable nvidia-cdi-refresh services on both install and upgrade
# Support running and degraded systemd states
if command -v systemctl >/dev/null 2>&1; then
SYSTEMD_STATE=$(systemctl is-system-running 2>/dev/null || true)
case "$SYSTEMD_STATE" in
running|degraded)
systemctl daemon-reload || echo "Warning: Failed to reload systemd daemon" >&2
systemctl enable --now nvidia-cdi-refresh.path || echo "Warning: Failed to enable nvidia-cdi-refresh.path" >&2
systemctl enable --now nvidia-cdi-refresh.service || echo "Warning: Failed to enable nvidia-cdi-refresh.service" >&2

# Trigger CDI spec regeneration immediately after install/upgrade
echo "Regenerating NVIDIA CDI specification..."
systemctl start nvidia-cdi-refresh.service || echo "Warning: Failed to trigger CDI refresh" >&2
;;
esac
fi
;;

abort-upgrade|abort-remove|abort-deconfigure)
Expand Down
2 changes: 2 additions & 0 deletions packaging/debian/prepare
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ set -e
sed -i "s;@SECTION@;${SECTION:+$SECTION/};g" debian/control
sed -i "s;@VERSION@;${VERSION:+$VERSION};g" debian/control

sed -i "s;@VERSION@;${VERSION:+$VERSION};g" debian/nvidia-container-toolkit-base.maintscript

if [ -n "$DISTRIB" ]; then
sed -i "s;UNRELEASED;$DISTRIB;" debian/changelog
fi
6 changes: 3 additions & 3 deletions packaging/debian/rules
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#export DH_VERBOSE=1

%:
dh $@
dh $@ --with systemd

override_dh_fixperms:
dh_fixperms
Expand All @@ -14,5 +14,5 @@ override_dh_fixperms:
chmod 755 debian/$(shell dh_listpackages)/usr/bin/nvidia-container-runtime.legacy || true
chmod 755 debian/$(shell dh_listpackages)/usr/bin/nvidia-ctk || true
chmod 755 debian/$(shell dh_listpackages)/usr/bin/nvidia-cdi-hook || true
chmod 644 debian/$(shell dh_listpackages)/etc/systemd/system/nvidia-cdi-refresh.service || true
chmod 644 debian/$(shell dh_listpackages)/etc/systemd/system/nvidia-cdi-refresh.path || true
chmod 644 debian/$(shell dh_listpackages)/lib/systemd/system/nvidia-cdi-refresh.service || true
chmod 644 debian/$(shell dh_listpackages)/lib/systemd/system/nvidia-cdi-refresh.path || true
47 changes: 28 additions & 19 deletions packaging/rpm/SPECS/nvidia-container-toolkit.spec
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ Source6: nvidia-cdi-hook
Source7: nvidia-cdi-refresh.service
Source8: nvidia-cdi-refresh.path
Source9: nvidia-cdi-refresh.env
Source10: 90-nvidia-container-toolkit.preset

%if 0%{?rhel} == 7 || 0%{?amzn} == 2
BuildRequires: systemd
%else
BuildRequires: systemd-rpm-macros
%endif

Obsoletes: nvidia-container-runtime <= 3.5.0-1, nvidia-container-runtime-hook <= 1.4.0-2
Provides: nvidia-container-runtime
Expand All @@ -31,11 +38,12 @@ Requires: nvidia-container-toolkit-base == %{version}-%{release}
Provides tools and utilities to enable GPU support in containers.

%prep
cp %{SOURCE0} %{SOURCE1} %{SOURCE2} %{SOURCE3} %{SOURCE4} %{SOURCE5} %{SOURCE6} %{SOURCE7} %{SOURCE8} %{SOURCE9} .
cp %{SOURCE0} %{SOURCE1} %{SOURCE2} %{SOURCE3} %{SOURCE4} %{SOURCE5} %{SOURCE6} %{SOURCE7} %{SOURCE8} %{SOURCE9} %{SOURCE10} .

%install
mkdir -p %{buildroot}%{_bindir}
mkdir -p %{buildroot}%{_sysconfdir}/systemd/system/
mkdir -p %{buildroot}%{_unitdir}
mkdir -p %{buildroot}%{_presetdir}
mkdir -p %{buildroot}%{_sysconfdir}/nvidia-container-toolkit

install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime-hook
Expand All @@ -44,8 +52,9 @@ install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime.cdi
install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime.legacy
install -m 755 -t %{buildroot}%{_bindir} nvidia-ctk
install -m 755 -t %{buildroot}%{_bindir} nvidia-cdi-hook
install -m 644 -t %{buildroot}%{_sysconfdir}/systemd/system nvidia-cdi-refresh.service
install -m 644 -t %{buildroot}%{_sysconfdir}/systemd/system nvidia-cdi-refresh.path
install -m 644 -t %{buildroot}%{_unitdir} nvidia-cdi-refresh.service
install -m 644 -t %{buildroot}%{_unitdir} nvidia-cdi-refresh.path
install -m 644 -t %{buildroot}%{_presetdir} 90-nvidia-container-toolkit.preset
install -m 644 -t %{buildroot}%{_sysconfdir}/nvidia-container-toolkit nvidia-cdi-refresh.env

%post
Expand Down Expand Up @@ -94,29 +103,29 @@ Provides tools such as the NVIDIA Container Runtime and NVIDIA Container Toolkit
# Generate the default config; If this file already exists no changes are made.
%{_bindir}/nvidia-ctk --quiet config --config-file=%{_sysconfdir}/nvidia-container-runtime/config.toml --in-place

# Reload systemd unit cache and enable nvidia-cdi-refresh services on both install and upgrade
%systemd_post nvidia-cdi-refresh.path nvidia-cdi-refresh.service

# Trigger CDI refresh on running systemd hosts without making install depend on
# the current system state.
if command -v systemctl >/dev/null 2>&1; then
SYSTEMD_STATE=$(systemctl is-system-running 2>/dev/null || true)
case "$SYSTEMD_STATE" in
running|degraded)
systemctl daemon-reload || echo "Warning: Failed to reload systemd daemon" >&2
systemctl enable --now nvidia-cdi-refresh.path || echo "Warning: Failed to enable nvidia-cdi-refresh.path" >&2
systemctl enable --now nvidia-cdi-refresh.service || echo "Warning: Failed to enable nvidia-cdi-refresh.service" >&2

# Trigger CDI spec regeneration immediately after install/upgrade
echo "Regenerating NVIDIA CDI specification..."
systemctl start nvidia-cdi-refresh.service || echo "Warning: Failed to trigger CDI refresh" >&2
;;
esac
systemctl start nvidia-cdi-refresh.path >/dev/null 2>&1 || :
systemctl start nvidia-cdi-refresh.service >/dev/null 2>&1 || :
fi

%preun base
%systemd_preun nvidia-cdi-refresh.path nvidia-cdi-refresh.service

%postun base
%systemd_postun nvidia-cdi-refresh.path nvidia-cdi-refresh.service

%files base
%license LICENSE
%{_bindir}/nvidia-container-runtime
%{_bindir}/nvidia-ctk
%{_bindir}/nvidia-cdi-hook
%{_sysconfdir}/systemd/system/nvidia-cdi-refresh.service
%{_sysconfdir}/systemd/system/nvidia-cdi-refresh.path
%{_unitdir}/nvidia-cdi-refresh.service
%{_unitdir}/nvidia-cdi-refresh.path
%{_presetdir}/90-nvidia-container-toolkit.preset
%config(noreplace) %{_sysconfdir}/nvidia-container-toolkit/nvidia-cdi-refresh.env

# The OPERATOR EXTENSIONS package consists of components that are required to enable GPU support in Kubernetes.
Expand Down
63 changes: 52 additions & 11 deletions tests/e2e/nvidia-cdi-refresh_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,30 @@ EOF
rm -rf /etc/systemd/system/dummy.service
systemctl daemon-reload
`
// disablePolicyRcDScript is a bash script that deletes /usr/sbin/policy-rc.d.
// It uses rm -f, so a missing file is not treated as an error. The rationale
// for removing the file is given in the script's own comments.
disablePolicyRcDScript = `#!/usr/bin/env bash
# The kindest/base image carries a Docker-oriented policy-rc.d that blocks
# deb-systemd-invoke from starting units during package installation. These
# tests run a real systemd instance, so package-triggered unit starts should
# be allowed.
rm -f /usr/sbin/policy-rc.d
`

// nvidiaCdiRefreshPathActiveTemplate is a shell snippet that greps the output
// of `systemctl status nvidia-cdi-refresh.path` for "Active: active". When the
// text is absent it prints a diagnostic and exits with status 1.
nvidiaCdiRefreshPathActiveTemplate = `
if ! systemctl status nvidia-cdi-refresh.path | grep "Active: active"; then
echo "nvidia-cdi-refresh.path is not Active"
exit 1
fi
`
// nvidiaCdiRefreshUnitsEnabledTemplate is a shell snippet that checks, via
// `systemctl is-enabled --quiet`, first nvidia-cdi-refresh.path and then
// nvidia-cdi-refresh.service. For the first unit that is not reported as
// enabled it prints a diagnostic naming that unit and exits with status 1.
nvidiaCdiRefreshUnitsEnabledTemplate = `
if ! systemctl is-enabled --quiet nvidia-cdi-refresh.path; then
echo "nvidia-cdi-refresh.path is not enabled"
exit 1
fi
if ! systemctl is-enabled --quiet nvidia-cdi-refresh.service; then
echo "nvidia-cdi-refresh.service is not enabled"
exit 1
fi
`
nvidiaCdiRefreshServiceLoadedTemplate = `
if ! systemctl status nvidia-cdi-refresh.service | grep "Loaded: loaded"; then
echo "nvidia-cdi-refresh.service is not loaded"
Expand Down Expand Up @@ -130,6 +147,20 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
outerContainerImage = "docker.io/kindest/base:v20250521-31a79fd4"
)

installToolkit := func() {
_, _, err := toolkitInstaller.Install(systemdRunner)
Expect(err).ToNot(HaveOccurred())

output, _, err := systemdRunner.Run("nvidia-ctk --version")
Expect(err).ToNot(HaveOccurred())
GinkgoLogr.Info("using nvidia-ctk", "version", strings.TrimSpace(output))
}

purgeToolkit := func() {
_, _, err := systemdRunner.Run("apt-get purge -y libnvidia-container* nvidia-container-toolkit*")
Expect(err).ToNot(HaveOccurred())
}

BeforeAll(func(ctx context.Context) {
var err error
systemdRunner, err = NewNestedContainerRunner(runner, outerContainerImage, false, containerName, localCacheDir, true)
Expand All @@ -143,6 +174,9 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
GinkgoLogr.Error(err, "systemctl state")
time.Sleep(1 * time.Second)
}

_, _, err = systemdRunner.Run(disablePolicyRcDScript)
Expect(err).ToNot(HaveOccurred())
})

AfterAll(func(ctx context.Context) {
Expand All @@ -153,25 +187,23 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system

When("installing nvidia-container-toolkit", Ordered, func() {
BeforeAll(func(ctx context.Context) {

_, _, err := toolkitInstaller.Install(systemdRunner)
Expect(err).ToNot(HaveOccurred())

output, _, err := systemdRunner.Run("nvidia-ctk --version")
Expect(err).ToNot(HaveOccurred())
GinkgoLogr.Info("using nvidia-ctk", "version", strings.TrimSpace(output))
installToolkit()
})

AfterAll(func(ctx context.Context) {
_, _, err := systemdRunner.Run("apt-get purge -y libnvidia-container* nvidia-container-toolkit*")
Expect(err).ToNot(HaveOccurred())
purgeToolkit()
})

It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
_, _, err := systemdRunner.Run(nvidiaCdiRefreshPathActiveTemplate)
Expect(err).ToNot(HaveOccurred())
})

It("should enable the nvidia-cdi-refresh units", func(ctx context.Context) {
_, _, err := systemdRunner.Run(nvidiaCdiRefreshUnitsEnabledTemplate)
Expect(err).ToNot(HaveOccurred())
})

It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
Expect(err).ToNot(HaveOccurred())
Expand All @@ -196,9 +228,13 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
_, _, err = systemdRunner.Run(getSystemStateScript)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("degraded"))

installToolkit()
})

AfterAll(func(ctx context.Context) {
purgeToolkit()

_, _, err := systemdRunner.Run(fixSystemDegradedScript)
Expect(err).ToNot(HaveOccurred())

Expand All @@ -212,6 +248,11 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
Expect(err).ToNot(HaveOccurred())
})

It("should enable the nvidia-cdi-refresh units", func(ctx context.Context) {
_, _, err := systemdRunner.Run(nvidiaCdiRefreshUnitsEnabledTemplate)
Expect(err).ToNot(HaveOccurred())
})

It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
Expect(err).ToNot(HaveOccurred())
Expand All @@ -222,8 +263,8 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system
Expect(err).ToNot(HaveOccurred())
})

It("should generate the nvidia.yaml file", func(ctx context.Context) {
_, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
_, _, err := systemdRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
Expect(err).ToNot(HaveOccurred())
})
})
Expand Down
Loading