Add NVLink5+ fabric manager support to the RHEL 9 and RHEL 10 precompiled
driver images.

Dockerfile (both variants): for non-vGPU, non-arm64 builds, choose the fabric
manager and NSCQ package names by driver branch (branch >= 580 drops the
branch number from the package names), install nvidia-imex and libnvsdm for
branch >= 570, and install infiniband-diags and nvlsm for branch >= 550.
Each nested 'if ... fi' is terminated with ';' because the RUN instruction is
joined into a single shell line, where 'fi if' would be a syntax error.

nvidia-driver (both variants): add _assert_nvlink5_system and
_ensure_nvlink5_prerequisites, and make _load_driver start the fabric manager
through nvidia-fabricmanager-start.sh when an NVLink5+ system is detected,
falling back to the existing nv-fabricmanager invocation otherwise.

diff --git a/rhel10/precompiled/Dockerfile b/rhel10/precompiled/Dockerfile
index 3175967e5..caaf081fd 100644
--- a/rhel10/precompiled/Dockerfile
+++ b/rhel10/precompiled/Dockerfile
@@ -162,7 +162,19 @@ RUN --mount=type=secret,id=RHSM_ORG,target=/run/secrets/RHSM_ORG \
     && if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGETARCH" != "arm64" ]; then \
         versionArray=(${DRIVER_VERSION//./ }); \
         DRIVER_BRANCH=${versionArray[0]}; \
+        if [ "$DRIVER_BRANCH" -ge "580" ]; then \
+            dnf install -y nvidia-fabricmanager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_VERSION}; \
+        else \
         dnf install -y nvidia-fabric-manager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
+        fi; \
+        if [ "$DRIVER_BRANCH" -ge "580" ]; then \
+            dnf install -y nvidia-imex-${DRIVER_VERSION} libnvsdm-${DRIVER_VERSION}; \
+        elif [ "$DRIVER_BRANCH" -ge "570" ]; then \
+            dnf install -y nvidia-imex-${DRIVER_BRANCH}-${DRIVER_VERSION} libnvsdm-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
+        fi; \
+        if [ "$DRIVER_BRANCH" -ge "550" ]; then \
+            dnf install -y infiniband-diags nvlsm ; \
+        fi; \
     fi \
     && dnf clean all \
     && subscription-manager unregister ; \
diff --git a/rhel10/precompiled/nvidia-driver b/rhel10/precompiled/nvidia-driver
index 8b9c29a9f..1c82533d2 100755
--- a/rhel10/precompiled/nvidia-driver
+++ b/rhel10/precompiled/nvidia-driver
@@ -122,6 +122,36 @@ _assert_nvswitch_system() {
     return 0
 }
 
+# Detect an NVLink5+ system: succeeds when the 'vpd' file of any device under
+# /sys/class/infiniband contains the string 'SW_MNG'.
+# Returns 0 (after logging a message) on detection, 1 otherwise.
+_assert_nvlink5_system() (
+    for dir in /sys/class/infiniband/*/device; do
+        # Define the path to the VPD file
+        vpd_file="$dir/vpd"
+
+        # Check if the VPD file exists
+        if [ -f "$vpd_file" ]; then
+            # Search for 'SW_MNG' in the VPD file
+            if grep -q "SW_MNG" "$vpd_file"; then
+                echo "Detected NVLink5+ system"
+                return 0
+            fi
+        fi
+    done
+    return 1
+)
+
+# Block until both mlx5_core and ib_umad are listed by lsmod, polling every
+# 10 seconds. There is no timeout: the loop only ends once both are loaded.
+_ensure_nvlink5_prerequisites() (
+    until lsmod | grep mlx5_core > /dev/null 2>&1 && lsmod | grep ib_umad > /dev/null 2>&1;
+    do
+        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
+        sleep 10
+    done
+)
+
 # For each kernel module configuration file mounted into the container,
 # parse the file contents and extract the custom module parameters that
 # are to be passed as input to 'modprobe'.
@@ -224,7 +254,22 @@ _load_driver() {
         _start_vgpu_topology_daemon
     fi
 
-    if _assert_nvswitch_system; then
+    if _assert_nvlink5_system; then
+        _ensure_nvlink5_prerequisites || return 1
+        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
+
+        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
+        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
+        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
+        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
+        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
+            --fm-config-file "$fm_config_file" \
+            --fm-pid-file "$fm_pid_file" \
+            --nvlsm-config-file "$nvlsm_config_file" \
+            --nvlsm-pid-file "$nvlsm_pid_file"
+
+    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
+    elif _assert_nvswitch_system; then
         echo "Starting NVIDIA fabric manager daemon..."
         nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
     fi
diff --git a/rhel9/precompiled/Dockerfile b/rhel9/precompiled/Dockerfile
index 3175967e5..caaf081fd 100644
--- a/rhel9/precompiled/Dockerfile
+++ b/rhel9/precompiled/Dockerfile
@@ -162,7 +162,19 @@ RUN --mount=type=secret,id=RHSM_ORG,target=/run/secrets/RHSM_ORG \
     && if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGETARCH" != "arm64" ]; then \
         versionArray=(${DRIVER_VERSION//./ }); \
         DRIVER_BRANCH=${versionArray[0]}; \
+        if [ "$DRIVER_BRANCH" -ge "580" ]; then \
+            dnf install -y nvidia-fabricmanager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_VERSION}; \
+        else \
         dnf install -y nvidia-fabric-manager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
+        fi; \
+        if [ "$DRIVER_BRANCH" -ge "580" ]; then \
+            dnf install -y nvidia-imex-${DRIVER_VERSION} libnvsdm-${DRIVER_VERSION}; \
+        elif [ "$DRIVER_BRANCH" -ge "570" ]; then \
+            dnf install -y nvidia-imex-${DRIVER_BRANCH}-${DRIVER_VERSION} libnvsdm-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
+        fi; \
+        if [ "$DRIVER_BRANCH" -ge "550" ]; then \
+            dnf install -y infiniband-diags nvlsm ; \
+        fi; \
     fi \
     && dnf clean all \
     && subscription-manager unregister ; \
diff --git a/rhel9/precompiled/nvidia-driver b/rhel9/precompiled/nvidia-driver
index 8b22e440a..4e18e22e0 100755
--- a/rhel9/precompiled/nvidia-driver
+++ b/rhel9/precompiled/nvidia-driver
@@ -124,6 +124,36 @@ _assert_nvswitch_system() {
     return 0
 }
 
+# Detect an NVLink5+ system: succeeds when the 'vpd' file of any device under
+# /sys/class/infiniband contains the string 'SW_MNG'.
+# Returns 0 (after logging a message) on detection, 1 otherwise.
+_assert_nvlink5_system() (
+    for dir in /sys/class/infiniband/*/device; do
+        # Define the path to the VPD file
+        vpd_file="$dir/vpd"
+
+        # Check if the VPD file exists
+        if [ -f "$vpd_file" ]; then
+            # Search for 'SW_MNG' in the VPD file
+            if grep -q "SW_MNG" "$vpd_file"; then
+                echo "Detected NVLink5+ system"
+                return 0
+            fi
+        fi
+    done
+    return 1
+)
+
+# Block until both mlx5_core and ib_umad are listed by lsmod, polling every
+# 10 seconds. There is no timeout: the loop only ends once both are loaded.
+_ensure_nvlink5_prerequisites() (
+    until lsmod | grep mlx5_core > /dev/null 2>&1 && lsmod | grep ib_umad > /dev/null 2>&1;
+    do
+        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
+        sleep 10
+    done
+)
+
 # For each kernel module configuration file mounted into the container,
 # parse the file contents and extract the custom module parameters that
 # are to be passed as input to 'modprobe'.
@@ -250,7 +280,22 @@ _load_driver() {
         _start_vgpu_topology_daemon
     fi
 
-    if _assert_nvswitch_system; then
+    if _assert_nvlink5_system; then
+        _ensure_nvlink5_prerequisites || return 1
+        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
+
+        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
+        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
+        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
+        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
+        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
+            --fm-config-file "$fm_config_file" \
+            --fm-pid-file "$fm_pid_file" \
+            --nvlsm-config-file "$nvlsm_config_file" \
+            --nvlsm-pid-file "$nvlsm_pid_file"
+
+    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
+    elif _assert_nvswitch_system; then
         echo "Starting NVIDIA fabric manager daemon..."
         nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
     fi