Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions pkg/gpu/nvidia/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ const (
mpsActiveThreadCmd = "get_default_active_thread_percentage"
mpsMemLimitEnv = "CUDA_MPS_PINNED_DEVICE_MEM_LIMIT"
mpsThreadLimitEnv = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
NvidiaRtxPro6000 = "NVIDIA RTX PRO 6000"
)

var (
Expand Down Expand Up @@ -393,6 +394,11 @@ func (ngm *nvidiaGPUManager) Start() error {
if err := ngm.migDeviceManager.Start(ngm.gpuConfig.GPUPartitionSize); err != nil {
return fmt.Errorf("failed to start mig device manager: %v", err)
}
} else if supportsVGPU() {
// For vGPU, we don't initialize the MIG manager as vGPU is already built on top of MIG at the GCE-level.
if _, err := ngm.migDeviceManager.DiscoverMIGDevices(); err != nil {
return fmt.Errorf("failed to discover MIG devices: %v", err)
}
}

if ngm.gpuConfig.GPUSharingConfig.GPUSharingStrategy == "mps" {
Expand Down Expand Up @@ -429,6 +435,42 @@ func totalMemPerGPU() (uint64, error) {
return memory.Total, nil
}

// supportsVGPU checks if any of the attached GPUs support vGPU.
func supportsVGPU() bool {
supportedModels := map[string]bool{
NvidiaRtxPro6000: true,
}

if nvmlutil.NvmlDeviceInfo == nil {
nvmlutil.NvmlDeviceInfo = &nvmlutil.DeviceInfo{}
}

devicesCount, ret := nvmlutil.NvmlDeviceInfo.DeviceCount()
if ret != nvml.SUCCESS {
return false
}

for i := 0; i < devicesCount; i++ {
device, ret := nvmlutil.NvmlDeviceInfo.DeviceHandleByIndex(i)
if ret != nvml.SUCCESS {
continue
}

name, ret := nvmlutil.NvmlDeviceInfo.Name(device)
if ret != nvml.SUCCESS {
continue
}

for model := range supportedModels {
if strings.Contains(name, model) {
return true
}
}
}

return false
}

func (ngm *nvidiaGPUManager) Serve(pMountPath, kEndpoint, pluginEndpoint string) {
registerWithKubelet := false
// Check if the unix socket device-plugin/kubelet.sock is at the host path.
Expand Down
68 changes: 40 additions & 28 deletions pkg/gpu/nvidia/mig/mig.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,17 +130,43 @@ func (d *DeviceManager) Start(partitionSize string) error {

d.gpuPartitionSpecs = make(map[string][]pluginapi.DeviceSpec)

numPartitionsPerGPU, err := d.DiscoverMIGDevices()
if err != nil {
return err
}

numPartitionedGPUs := 0
for gpuID, numPartitions := range numPartitionsPerGPU {
if numPartitions != maxPartitionCount {
return fmt.Errorf("Number of partitions (%d) for GPU %s does not match expected partition count (%d)", numPartitions, gpuID, maxPartitionCount)
}
numPartitionedGPUs++
}

numGPUs, err := d.discoverNumGPUs()
if err != nil {
return err
}
if numPartitionedGPUs != numGPUs {
return fmt.Errorf("Not all GPUs are partitioned as expected. Total number of GPUs: %d, number of partitioned GPUs: %d", numGPUs, numPartitionedGPUs)
}

return nil
}

// DiscoverMIGDevices discovers all the MIG devices on the node and returns a map of GPU ID to the number of partitions.
func (d *DeviceManager) DiscoverMIGDevices() (map[string]int, error) {
nvidiaCapDir := path.Join(d.procDirectory, "driver/nvidia/capabilities")
capFiles, err := ioutil.ReadDir(nvidiaCapDir)
if err != nil {
return fmt.Errorf("failed to read capabilities directory (%s): %v", nvidiaCapDir, err)
return nil, fmt.Errorf("failed to read capabilities directory (%s): %v", nvidiaCapDir, err)
}

gpuFileRegexp := regexp.MustCompile("gpu([0-9]+)")
giFileRegexp := regexp.MustCompile("gi([0-9]+)")
deviceRegexp := regexp.MustCompile("DeviceFileMinor: ([0-9]+)")

numPartitionedGPUs := 0
numPartitionsPerGPU := make(map[string]int)

for _, capFile := range capFiles {
m := gpuFileRegexp.FindStringSubmatch(capFile.Name())
Expand All @@ -150,12 +176,10 @@ func (d *DeviceManager) Start(partitionSize string) error {
}

gpuID := m[1]
numPartitionedGPUs++

giBasePath := path.Join(nvidiaCapDir, capFile.Name(), "mig")
giFiles, err := ioutil.ReadDir(giBasePath)
if err != nil {
return fmt.Errorf("failed to read GPU instance capabilities dir (%s): %v", giBasePath, err)
return nil, fmt.Errorf("failed to read GPU instance capabilities dir (%s): %v", giBasePath, err)
}

numPartitions := 0
Expand All @@ -170,46 +194,46 @@ func (d *DeviceManager) Start(partitionSize string) error {
giAccessFile := path.Join(giBasePath, giFile.Name(), "access")
content, err := ioutil.ReadFile(giAccessFile)
if err != nil {
return fmt.Errorf("failed to read GPU Instance access file (%s): %v", giAccessFile, err)
return nil, fmt.Errorf("failed to read GPU Instance access file (%s): %v", giAccessFile, err)
}

m := deviceRegexp.FindStringSubmatch(string(content))
if len(m) != 2 {
return fmt.Errorf("unexpected contents in GPU instance access file(%s): %v", giAccessFile, err)
return nil, fmt.Errorf("unexpected contents in GPU instance access file(%s): %v", giAccessFile, err)
}
giMinorDevice, err := strconv.Atoi(m[1])
if err != nil {
return fmt.Errorf("failed to find minor device from GPU instance access file(%s): %v", giAccessFile, err)
return nil, fmt.Errorf("failed to find minor device from GPU instance access file(%s): %v", giAccessFile, err)
}

ciAccessFile := path.Join(giBasePath, giFile.Name(), "ci0", "access")
content, err = ioutil.ReadFile(ciAccessFile)
if err != nil {
return fmt.Errorf("unable to read Compute Instance access file (%s): %v", ciAccessFile, err)
return nil, fmt.Errorf("unable to read Compute Instance access file (%s): %v", ciAccessFile, err)
}

m = deviceRegexp.FindStringSubmatch(string(content))
if len(m) != 2 {
return fmt.Errorf("unexpected contents in compute instance access file(%s): %v", ciAccessFile, err)
return nil, fmt.Errorf("unexpected contents in compute instance access file(%s): %v", ciAccessFile, err)
}
ciMinorDevice, err := strconv.Atoi(m[1])
if err != nil {
return fmt.Errorf("failed to find minor device from compute instance access file(%s): %v", ciAccessFile, err)
return nil, fmt.Errorf("failed to find minor device from compute instance access file(%s): %v", ciAccessFile, err)
}

gpuDevice := path.Join(d.devDirectory, "nvidia"+gpuID)
if _, err := os.Stat(gpuDevice); err != nil {
return fmt.Errorf("GPU device (%s) not fount: %v", gpuDevice, err)
return nil, fmt.Errorf("GPU device (%s) not fount: %v", gpuDevice, err)
}

giDevice := path.Join(d.devDirectory, "nvidia-caps", "nvidia-cap"+strconv.Itoa(giMinorDevice))
if _, err := os.Stat(giDevice); err != nil {
return fmt.Errorf("GPU instance device (%s) not fount: %v", giDevice, err)
return nil, fmt.Errorf("GPU instance device (%s) not fount: %v", giDevice, err)
}

ciDevice := path.Join(d.devDirectory, "nvidia-caps", "nvidia-cap"+strconv.Itoa(ciMinorDevice))
if _, err := os.Stat(ciDevice); err != nil {
return fmt.Errorf("Compute instance device (%s) not fount: %v", ciDevice, err)
return nil, fmt.Errorf("Compute instance device (%s) not fount: %v", ciDevice, err)
}

glog.Infof("Discovered GPU partition: %s", gpuInstanceID)
Expand All @@ -236,21 +260,9 @@ func (d *DeviceManager) Start(partitionSize string) error {
}
d.gpuPartitions[gpuInstanceID] = pluginapi.Device{ID: gpuInstanceID, Health: pluginapi.Healthy, Topology: topologyInfo}
}

if numPartitions != maxPartitionCount {
return fmt.Errorf("Number of partitions (%d) for GPU %s does not match expected partition count (%d)", numPartitions, gpuID, maxPartitionCount)
}
numPartitionsPerGPU[gpuID] = numPartitions
}

numGPUs, err := d.discoverNumGPUs()
if err != nil {
return err
}
if numPartitionedGPUs != numGPUs {
return fmt.Errorf("Not all GPUs are partitioned as expected. Total number of GPUs: %d, number of partitioned GPUs: %d", numGPUs, numPartitionedGPUs)
}

return nil
return numPartitionsPerGPU, nil
}

// SetDeviceHealth sets the health status for a GPU partition
Expand Down
4 changes: 4 additions & 0 deletions pkg/gpu/nvidia/nvmlutil/nvml_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ func (gpuDeviceInfo *MockDeviceInfo) MinorNumber(d nvml.Device) (int, nvml.Retur
return gpuDeviceInfo.CurrentDevice, nvml.SUCCESS
}

// Name is the mock implementation of the NVML device-name lookup. It ignores
// the device argument and always returns an empty name with nvml.SUCCESS.
func (gpuDeviceInfo *MockDeviceInfo) Name(device nvml.Device) (string, nvml.Return) {
	return "", nvml.SUCCESS
}

// PciInfo is the mock implementation of the NVML PCI-info lookup. It ignores
// the device argument and returns a PciInfo whose BusId is the mock's
// configured BusID, together with nvml.SUCCESS.
func (gpuDeviceInfo *MockDeviceInfo) PciInfo(d nvml.Device) (nvml.PciInfo, nvml.Return) {
	info := nvml.PciInfo{BusId: gpuDeviceInfo.BusID}
	return info, nvml.SUCCESS
}
5 changes: 5 additions & 0 deletions pkg/gpu/nvidia/nvmlutil/nvmlutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type NvmlOperations interface {
MigDeviceHandleByIndex(nvml.Device, int) (nvml.Device, nvml.Return)
MigMode(nvml.Device) (int, int, nvml.Return)
MinorNumber(nvml.Device) (int, nvml.Return)
Name(nvml.Device) (string, nvml.Return)
PciInfo(d nvml.Device) (nvml.PciInfo, nvml.Return)
}

Expand Down Expand Up @@ -73,6 +74,10 @@ func (gpuDeviceInfo *DeviceInfo) PciInfo(d nvml.Device) (nvml.PciInfo, nvml.Retu
return d.GetPciInfo()
}

// Name returns the result of calling GetName on the given NVML device,
// passing through both the name and the NVML return code unchanged.
func (gpuDeviceInfo *DeviceInfo) Name(device nvml.Device) (string, nvml.Return) {
	return device.GetName()
}

// topology determines the NUMA topology information for a GPU device.
// Returns a TopologyInfo containing the NUMA node ID for the GPU device
// if NUMA is enabled, nil otherwise.
Expand Down
Loading