diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 9e2e25513a72..bead98a40a51 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -10,6 +10,7 @@ from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from packaging import version as pkg_version +import subprocess from . import ops from . import module_inject @@ -116,6 +117,11 @@ def initialize(args=None, __git_branch__), ranks=[0]) + hostname_cmd = ["hostname -I"] + result = subprocess.check_output(hostname_cmd, shell=True) + hostname = result.decode('utf-8').split()[0] + print("Host: {}".format(hostname)) + # Disable zero.Init context if it's currently enabled zero.partition_parameters.shutdown_init_context() diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 6027d1076e80..f7196f5bd8ba 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -120,6 +120,13 @@ def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool self.add_export('UCX_TLS', 'tcp') + #self.add_export('LD_PRELOAD', '/usr/local/cuda-11.6/lib/libnccl.so') + self.add_export('FI_EFA_FORK_SAFE', '1') + self.add_export('NCCL_DEBUG_SUBSYS', '') + self.add_export('FI_EFA_USE_DEVICE_RDMA', '1') + self.add_export('NCCL_TREE_THRESHOLD', '0') + self.add_export('NCCL_PROTO', 'simple') + self.add_export('NCCL_SOCKET_IFNAME', '^docker0,lo') def backend_exists(self): #TODO: if IB is available we should suggestion mvapich @@ -148,12 +155,20 @@ def get_cmd(self, environment, active_resources): f'{total_process_count}', '-hostfile', f'{self.args.hostfile}', + '-x', + 'PATH', + '-x', + 'LD_LIBRARY_PATH', + '-x', + 'LD_PRELOAD', '--mca', 'btl', '^openib', '--mca', - 'btl_tcp_if_include', - 'eth0', + 'plm', + '^slurm', + '--tag-output', + '--timestamp-output', ] + split(self.args.launcher_args) export_cmd = [] @@ -238,6 +253,7 @@ def 
get_cmd(self, environment, active_resources): 'srun', '-n', f'{total_process_count}', + '--label', ] + split(self.args.launcher_args) if getattr(self.args, 'slurm_comment', ''): diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index a080559b1a2a..765733f45ae8 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -990,9 +990,7 @@ def _exec_send_grads(self, buffer_id): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [ - elt.grad for elt in inputs[1:] if elt.grad is not None - ] + inputs_grad_tail = [elt.grad for elt in inputs[1:]] elif torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = []