Skip to content

Commit

Permalink
add available memory check to accelerators (microsoft#4508)
Browse files Browse the repository at this point in the history
* add available memory check to accelerator

* catch case where nvmlInit fails

* add pynvml to reqs

* fix for cpu systems

* Update accelerator/cuda_accelerator.py

Co-authored-by: Michael Wyatt <[email protected]>

* simplify

---------

Co-authored-by: Michael Wyatt <[email protected]>
  • Loading branch information
jeffra and mrwyattii committed Oct 16, 2023
1 parent 78c518e commit 12aedac
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 0 deletions.
4 changes: 4 additions & 0 deletions accelerator/abstract_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
    """Return the total memory of the device, in bytes.

    Args:
        device_index: index of the device to query; None means the
            accelerator's current/default device.
    """
    ...

@abc.abstractmethod
def available_memory(self, device_index=None):
    """Return the currently available (free) memory of the device, in bytes.

    Args:
        device_index: index of the device to query; None means the
            accelerator's current/default device.
    """
    ...

# Data types
@abc.abstractmethod
def is_bf16_supported(self):
Expand Down
3 changes: 3 additions & 0 deletions accelerator/cpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,9 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
    """Total system RAM in bytes (device_index is ignored on CPU)."""
    vm = psutil.virtual_memory()
    return vm.total

def available_memory(self, device_index=None):
    """Bytes of system RAM currently available (device_index is ignored on CPU)."""
    vm = psutil.virtual_memory()
    return vm.available

# Misc
def amp(self):
    """Return the automatic mixed precision module for the CPU backend."""
    amp_module = torch.cpu.amp
    return amp_module
Expand Down
25 changes: 25 additions & 0 deletions accelerator/cuda_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,29 @@
except ImportError:
pass

# Module-level handle for pynvml. Kept as None until CUDA_Accelerator._init_pynvml()
# successfully imports and initializes it; the delayed import avoids a hard
# failure when CUDA/NVML is not available on the host.
pynvml = None


class CUDA_Accelerator(DeepSpeedAccelerator):

def __init__(self):
    """Create the CUDA accelerator and lazily set up pynvml on first use."""
    self._communication_backend_name = 'nccl'
    self._name = 'cuda'
    # The module-global pynvml starts out as None; retry initialization on
    # each construction until an import has succeeded.
    if pynvml is None:
        self._init_pynvml()

def _init_pynvml(self):
    """Import and initialize pynvml, recording the outcome in the module global.

    After this call the module-global `pynvml` is either a usable, initialized
    module or None. If the package is not installed, the global is left as
    None. If import succeeds but nvmlInit() fails (e.g. no NVIDIA driver on
    this host), the global is reset to None so callers can simply truth-test it.
    """
    global pynvml
    try:
        # Binds the module-global `pynvml` (declared global above).
        import pynvml
    except ImportError:
        return
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        # Imported but unusable on this machine; disable NVML-based paths.
        pynvml = None
    return

def is_synchronized_device(self):
    """CUDA kernels launch asynchronously relative to the host, so report False."""
    return False
Expand Down Expand Up @@ -136,6 +153,14 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
    """Total memory in bytes of the given CUDA device (None = current device)."""
    props = torch.cuda.get_device_properties(device_index)
    return props.total_memory

def available_memory(self, device_index=None):
    """Return the number of bytes of free memory on the given CUDA device.

    Prefers NVML (pynvml) for an accurate device-wide figure that also
    accounts for memory used by other processes. Falls back to total memory
    minus this process's torch allocations when pynvml is unavailable.

    Args:
        device_index: CUDA device index; None means the current device,
            matching the convention of the other memory methods here.
    """
    if pynvml:
        # Fix: nvmlDeviceGetHandleByIndex requires a concrete integer index,
        # unlike the torch.cuda memory APIs which accept None. Default to the
        # current device so available_memory() behaves like its siblings.
        if device_index is None:
            device_index = torch.cuda.current_device()
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return info.free
    else:
        # Best-effort fallback: only reflects this process's allocations.
        return self.total_memory(device_index) - self.memory_allocated(device_index)

# Data types
def is_bf16_supported(self):
return torch.cuda.is_bf16_supported()
Expand Down
3 changes: 3 additions & 0 deletions accelerator/mps_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
    """Not implemented for the MPS backend; always returns None."""
    return None

def available_memory(self, device_index=None):
    """Not implemented for the MPS backend; always returns None."""
    return None

# Data types
def is_bf16_supported(self):
    """bfloat16 is reported as unsupported on the MPS backend."""
    return False
Expand Down
3 changes: 3 additions & 0 deletions accelerator/npu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
    """Total memory in bytes of the given NPU device."""
    props = torch.npu.get_device_properties(device_index)
    return props.total_memory

def available_memory(self, device_index=None):
    """Free device memory: total capacity minus bytes currently allocated."""
    allocated = self.memory_allocated(device_index)
    return self.total_memory(device_index) - allocated

# Data types
def is_bf16_supported(self):
    """Delegate the bfloat16 capability check to the torch NPU backend."""
    return torch.npu.is_bf16_supported()
Expand Down
1 change: 1 addition & 0 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ packaging>=20.0
psutil
py-cpuinfo
pydantic
pynvml
torch
tqdm

0 comments on commit 12aedac

Please sign in to comment.