From fe1b7ff1fe50517d87eedcc9b2b800e595a6277c Mon Sep 17 00:00:00 2001 From: Ankit Joju Date: Wed, 11 Mar 2026 12:05:51 -0700 Subject: [PATCH 1/2] moved MIG discovery logic to its own function --- pkg/gpu/nvidia/mig/mig.go | 68 +++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/pkg/gpu/nvidia/mig/mig.go b/pkg/gpu/nvidia/mig/mig.go index ab2ce4a8e..4394fff69 100644 --- a/pkg/gpu/nvidia/mig/mig.go +++ b/pkg/gpu/nvidia/mig/mig.go @@ -130,17 +130,43 @@ func (d *DeviceManager) Start(partitionSize string) error { d.gpuPartitionSpecs = make(map[string][]pluginapi.DeviceSpec) + numPartitionsPerGPU, err := d.discoverMIGDevices() + if err != nil { + return err + } + + numPartitionedGPUs := 0 + for gpuID, numPartitions := range numPartitionsPerGPU { + if numPartitions != maxPartitionCount { + return fmt.Errorf("Number of partitions (%d) for GPU %s does not match expected partition count (%d)", numPartitions, gpuID, maxPartitionCount) + } + numPartitionedGPUs++ + } + + numGPUs, err := d.discoverNumGPUs() + if err != nil { + return err + } + if numPartitionedGPUs != numGPUs { + return fmt.Errorf("Not all GPUs are partitioned as expected. Total number of GPUs: %d, number of partitioned GPUs: %d", numGPUs, numPartitionedGPUs) + } + + return nil +} + +// discoverMIGDevices discovers all the MIG devices on the node and returns a map of GPU ID to the number of partitions. +func (d *DeviceManager) discoverMIGDevices() (map[string]int, error) { nvidiaCapDir := path.Join(d.procDirectory, "driver/nvidia/capabilities") capFiles, err := ioutil.ReadDir(nvidiaCapDir) if err != nil { - return fmt.Errorf("failed to read capabilities directory (%s): %v", nvidiaCapDir, err) + return nil, fmt.Errorf("failed to read capabilities directory (%s): %v", nvidiaCapDir, err) } gpuFileRegexp := regexp.MustCompile("gpu([0-9]+)") giFileRegexp := regexp.MustCompile("gi([0-9]+)") deviceRegexp := regexp.MustCompile("DeviceFileMinor: ([0-9]+)") - numPartitionedGPUs := 0 + numPartitionsPerGPU := make(map[string]int) for _, capFile := range capFiles { m := gpuFileRegexp.FindStringSubmatch(capFile.Name()) @@ -150,12 +176,10 @@ func (d *DeviceManager) Start(partitionSize string) error { } gpuID := m[1] - numPartitionedGPUs++ - giBasePath := path.Join(nvidiaCapDir, capFile.Name(), "mig") giFiles, err := ioutil.ReadDir(giBasePath) if err != nil { - return fmt.Errorf("failed to read GPU instance capabilities dir (%s): %v", giBasePath, err) + return nil, fmt.Errorf("failed to read GPU instance capabilities dir (%s): %v", giBasePath, err) } numPartitions := 0 @@ -170,46 +194,46 @@ func (d *DeviceManager) Start(partitionSize string) error { giAccessFile := path.Join(giBasePath, giFile.Name(), "access") content, err := ioutil.ReadFile(giAccessFile) if err != nil { - return fmt.Errorf("failed to read GPU Instance access file (%s): %v", giAccessFile, err) + return nil, fmt.Errorf("failed to read GPU Instance access file (%s): %v", giAccessFile, err) } m := deviceRegexp.FindStringSubmatch(string(content)) if len(m) != 2 { - return fmt.Errorf("unexpected contents in GPU instance access file(%s): %v", giAccessFile, err) + return nil, fmt.Errorf("unexpected contents in GPU instance access file(%s): %v", giAccessFile, err) } giMinorDevice, err := strconv.Atoi(m[1]) if err != nil { - return fmt.Errorf("failed to find minor device from GPU instance access file(%s): %v", giAccessFile, err) + return nil, fmt.Errorf("failed to find minor device from GPU instance access file(%s): %v", giAccessFile, err) } ciAccessFile := path.Join(giBasePath, giFile.Name(), "ci0", "access") content, err = ioutil.ReadFile(ciAccessFile) if err != nil { - return fmt.Errorf("unable to read Compute Instance access file (%s): %v", ciAccessFile, err) + return nil, fmt.Errorf("unable to read Compute Instance access file (%s): %v", ciAccessFile, err) } m = deviceRegexp.FindStringSubmatch(string(content)) if len(m) != 2 { - return fmt.Errorf("unexpected contents in compute instance access file(%s): %v", ciAccessFile, err) + return nil, fmt.Errorf("unexpected contents in compute instance access file(%s): %v", ciAccessFile, err) } ciMinorDevice, err := strconv.Atoi(m[1]) if err != nil { - return fmt.Errorf("failed to find minor device from compute instance access file(%s): %v", ciAccessFile, err) + return nil, fmt.Errorf("failed to find minor device from compute instance access file(%s): %v", ciAccessFile, err) } gpuDevice := path.Join(d.devDirectory, "nvidia"+gpuID) if _, err := os.Stat(gpuDevice); err != nil { - return fmt.Errorf("GPU device (%s) not fount: %v", gpuDevice, err) + return nil, fmt.Errorf("GPU device (%s) not fount: %v", gpuDevice, err) } giDevice := path.Join(d.devDirectory, "nvidia-caps", "nvidia-cap"+strconv.Itoa(giMinorDevice)) if _, err := os.Stat(giDevice); err != nil { - return fmt.Errorf("GPU instance device (%s) not fount: %v", giDevice, err) + return nil, fmt.Errorf("GPU instance device (%s) not fount: %v", giDevice, err) } ciDevice := path.Join(d.devDirectory, "nvidia-caps", "nvidia-cap"+strconv.Itoa(ciMinorDevice)) if _, err := os.Stat(ciDevice); err != nil { - return fmt.Errorf("Compute instance device (%s) not fount: %v", ciDevice, err) + return nil, fmt.Errorf("Compute instance device (%s) not fount: %v", ciDevice, err) } glog.Infof("Discovered GPU partition: %s", gpuInstanceID) @@ -236,21 +260,9 @@ func (d *DeviceManager) Start(partitionSize string) error { } d.gpuPartitions[gpuInstanceID] = pluginapi.Device{ID: gpuInstanceID, Health: pluginapi.Healthy, Topology: topologyInfo} } - - if numPartitions != maxPartitionCount { - return fmt.Errorf("Number of partitions (%d) for GPU %s does not match expected partition count (%d)", numPartitions, gpuID, maxPartitionCount) - } + numPartitionsPerGPU[gpuID] = numPartitions } - - numGPUs, err := d.discoverNumGPUs() - if err != nil { - return err - } - if numPartitionedGPUs != numGPUs { - return fmt.Errorf("Not all GPUs are partitioned as expected. Total number of GPUs: %d, number of partitioned GPUs: %d", numGPUs, numPartitionedGPUs) - } - - return nil + return numPartitionsPerGPU, nil } // SetDeviceHealth sets the health status for a GPU partition From efc102fee6ec1db4784a484efecf43bba6e7c627 Mon Sep 17 00:00:00 2001 From: Ankit Joju Date: Wed, 11 Mar 2026 13:23:22 -0700 Subject: [PATCH 2/2] Add RTX PRO 6000 as a vgpu machine --- pkg/gpu/nvidia/manager.go | 42 ++++++++++++++++++++++++++++ pkg/gpu/nvidia/mig/mig.go | 6 ++-- pkg/gpu/nvidia/nvmlutil/nvml_mock.go | 4 +++ pkg/gpu/nvidia/nvmlutil/nvmlutil.go | 5 ++++ 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/pkg/gpu/nvidia/manager.go b/pkg/gpu/nvidia/manager.go index 532393a21..d6374d9b3 100644 --- a/pkg/gpu/nvidia/manager.go +++ b/pkg/gpu/nvidia/manager.go @@ -61,6 +61,7 @@ const ( mpsActiveThreadCmd = "get_default_active_thread_percentage" mpsMemLimitEnv = "CUDA_MPS_PINNED_DEVICE_MEM_LIMIT" mpsThreadLimitEnv = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE" + NvidiaRtxPro6000 = "NVIDIA RTX PRO 6000" ) var ( @@ -393,6 +394,11 @@ func (ngm *nvidiaGPUManager) Start() error { if err := ngm.migDeviceManager.Start(ngm.gpuConfig.GPUPartitionSize); err != nil { return fmt.Errorf("failed to start mig device manager: %v", err) } + } else if supportsVGPU() { + // For vGPU, we don't initialize the MIG manager as vGPU is already built on top of MIG at the GCE-level. + if _, err := ngm.migDeviceManager.DiscoverMIGDevices(); err != nil { + return fmt.Errorf("failed to discover MIG devices: %v", err) + } } if ngm.gpuConfig.GPUSharingConfig.GPUSharingStrategy == "mps" { @@ -429,6 +435,42 @@ func totalMemPerGPU() (uint64, error) { return memory.Total, nil } +// supportsVGPU checks if any of the attached GPUs support vGPU. +func supportsVGPU() bool { + supportedModels := map[string]bool{ + NvidiaRtxPro6000: true, + } + + if nvmlutil.NvmlDeviceInfo == nil { + nvmlutil.NvmlDeviceInfo = &nvmlutil.DeviceInfo{} + } + + devicesCount, ret := nvmlutil.NvmlDeviceInfo.DeviceCount() + if ret != nvml.SUCCESS { + return false + } + + for i := 0; i < devicesCount; i++ { + device, ret := nvmlutil.NvmlDeviceInfo.DeviceHandleByIndex(i) + if ret != nvml.SUCCESS { + continue + } + + name, ret := nvmlutil.NvmlDeviceInfo.Name(device) + if ret != nvml.SUCCESS { + continue + } + + for model := range supportedModels { + if strings.Contains(name, model) { + return true + } + } + } + + return false +} + func (ngm *nvidiaGPUManager) Serve(pMountPath, kEndpoint, pluginEndpoint string) { registerWithKubelet := false // Check if the unix socket device-plugin/kubelet.sock is at the host path. diff --git a/pkg/gpu/nvidia/mig/mig.go b/pkg/gpu/nvidia/mig/mig.go index 4394fff69..445884869 100644 --- a/pkg/gpu/nvidia/mig/mig.go +++ b/pkg/gpu/nvidia/mig/mig.go @@ -130,7 +130,7 @@ func (d *DeviceManager) Start(partitionSize string) error { d.gpuPartitionSpecs = make(map[string][]pluginapi.DeviceSpec) - numPartitionsPerGPU, err := d.discoverMIGDevices() + numPartitionsPerGPU, err := d.DiscoverMIGDevices() if err != nil { return err } @@ -154,8 +154,8 @@ func (d *DeviceManager) Start(partitionSize string) error { return nil } -// discoverMIGDevices discovers all the MIG devices on the node and returns a map of GPU ID to the number of partitions. -func (d *DeviceManager) discoverMIGDevices() (map[string]int, error) { +// DiscoverMIGDevices discovers all the MIG devices on the node and returns a map of GPU ID to the number of partitions. +func (d *DeviceManager) DiscoverMIGDevices() (map[string]int, error) { nvidiaCapDir := path.Join(d.procDirectory, "driver/nvidia/capabilities") capFiles, err := ioutil.ReadDir(nvidiaCapDir) if err != nil { diff --git a/pkg/gpu/nvidia/nvmlutil/nvml_mock.go b/pkg/gpu/nvidia/nvmlutil/nvml_mock.go index 14e9e13bd..b8359fa70 100644 --- a/pkg/gpu/nvidia/nvmlutil/nvml_mock.go +++ b/pkg/gpu/nvidia/nvmlutil/nvml_mock.go @@ -65,6 +65,10 @@ func (gpuDeviceInfo *MockDeviceInfo) MinorNumber(d nvml.Device) (int, nvml.Retur return gpuDeviceInfo.CurrentDevice, nvml.SUCCESS } +func (m *MockDeviceInfo) Name(device nvml.Device) (string, nvml.Return) { + return "", nvml.SUCCESS +} + func (gpuDeviceInfo *MockDeviceInfo) PciInfo(d nvml.Device) (nvml.PciInfo, nvml.Return) { return nvml.PciInfo{BusId: gpuDeviceInfo.BusID}, nvml.SUCCESS } diff --git a/pkg/gpu/nvidia/nvmlutil/nvmlutil.go b/pkg/gpu/nvidia/nvmlutil/nvmlutil.go index a47ae0db1..64450de98 100644 --- a/pkg/gpu/nvidia/nvmlutil/nvmlutil.go +++ b/pkg/gpu/nvidia/nvmlutil/nvmlutil.go @@ -33,6 +33,7 @@ type NvmlOperations interface { MigDeviceHandleByIndex(nvml.Device, int) (nvml.Device, nvml.Return) MigMode(nvml.Device) (int, int, nvml.Return) MinorNumber(nvml.Device) (int, nvml.Return) + Name(nvml.Device) (string, nvml.Return) PciInfo(d nvml.Device) (nvml.PciInfo, nvml.Return) } @@ -73,6 +74,10 @@ func (gpuDeviceInfo *DeviceInfo) PciInfo(d nvml.Device) (nvml.PciInfo, nvml.Retu return d.GetPciInfo() } +func (di *DeviceInfo) Name(device nvml.Device) (string, nvml.Return) { + return device.GetName() +} + // topology determines the NUMA topology information for a GPU device. // Returns a TopologyInfo containing the NUMA node ID for the GPU device // if NUMA is enabled, nil otherwise.