File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree
Original file line number | Diff line number | Diff line change
@@ -516,18 +516,23 @@ def _check_device_memory(index):
516 516 index: int
517 517 The index of the CUDA device.
518 518 """
519- from pynvml import (
520- nvmlInit ,
521- nvmlShutdown ,
522- nvmlDeviceGetHandleByIndex ,
523- nvmlDeviceGetMemoryInfo ,
524- )
519+ try :
520+ from pynvml import (
521+ nvmlInit ,
522+ nvmlShutdown ,
523+ nvmlDeviceGetHandleByIndex ,
524+ nvmlDeviceGetMemoryInfo ,
525+ )
526+
527+ nvmlInit ()
528+ handle = nvmlDeviceGetHandleByIndex (index )
529+ info = nvmlDeviceGetMemoryInfo (handle )
530+ result = (info .used , info .free , info .total )
531+ nvmlShutdown ()
532+ except Exception as e :
533+ msg = f"Could not determine memory usage for device { index } : { e } "
534+ _logger .error (msg )
525 535
526- nvmlInit ()
527- handle = nvmlDeviceGetHandleByIndex (index )
528- info = nvmlDeviceGetMemoryInfo (handle )
529- result = (info .used , info .free , info .total )
530- nvmlShutdown ()
531 536 return result
532 537
533 538
You can’t perform that action at this time.
0 commit comments