From 64d6c5a545b2182a680a111eced1a34f84aefbd4 Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Thu, 9 Mar 2023 18:12:50 -0500 Subject: [PATCH 1/5] Update engine.py Remove PP Grad Tail Check (until https://github.com/microsoft/DeepSpeed/pull/2538 is merged to upstream) --- deepspeed/runtime/pipe/engine.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index a080559b1a2a..765733f45ae8 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -990,9 +990,7 @@ def _exec_send_grads(self, buffer_id): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [ - elt.grad for elt in inputs[1:] if elt.grad is not None - ] + inputs_grad_tail = [elt.grad for elt in inputs[1:]] elif torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = [] From 592d34699f6f7f7d47a013a1306056d2eebb3ffc Mon Sep 17 00:00:00 2001 From: Quentin-Anthony Date: Fri, 10 Mar 2023 00:08:04 +0000 Subject: [PATCH 2/5] Set up OpenMPI runner for stability cluster --- deepspeed/launcher/multinode_runner.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 6027d1076e80..728b2024ddf1 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -120,6 +120,13 @@ def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool self.add_export('UCX_TLS', 'tcp') + #self.add_export('LD_PRELOAD', '/usr/local/cuda-11.6/lib/libnccl.so') + self.add_export('FI_EFA_FORK_SAFE', '1') + self.add_export('NCCL_DEBUG_SYBSYS', '') + self.add_export('FI_EFA_USE_DEVICE_RDMA', '1') + self.add_export('NCCL_TREE_THRESHOLD', '0') + self.add_export('NCCL_PROTO', 'simple') + self.add_export('NCCL_SOCKET_IFNAME', '^docker0,lo') def backend_exists(self): #TODO: if IB is available we should suggestion mvapich @@ -148,12 +155,19 @@ def get_cmd(self, environment, active_resources): f'{total_process_count}', '-hostfile', f'{self.args.hostfile}', + '-x', + 'PATH', + '-x', + 'LD_LIBRARY_PATH', + '-x', + 'LD_PRELOAD', '--mca', 'btl', '^openib', '--mca', - 'btl_tcp_if_include', - 'eth0', + 'plm', + '^slurm', + '--mca', ] + split(self.args.launcher_args) export_cmd = [] From a9d0263cb1b3a6288e6265411ebb4b1a07c23e93 Mon Sep 17 00:00:00 2001 From: Quentin-Anthony Date: Tue, 28 Mar 2023 20:35:09 +0000 Subject: [PATCH 3/5] tag and timestamp the output --- deepspeed/launcher/multinode_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 728b2024ddf1..959f76a4fc58 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -167,7 +167,8 @@ def get_cmd(self, environment, active_resources): '--mca', 'plm', '^slurm', - '--mca', + '--tag-output', + '--timestamp-output', ] + split(self.args.launcher_args) export_cmd = [] From 37f25c15858808f9cb8697f8d6462edecadb2886 Mon Sep 17 00:00:00 2001 From: Quentin-Anthony Date: Tue, 28 Mar 2023 20:52:04 +0000 Subject: [PATCH 4/5] add slurm label --- deepspeed/launcher/multinode_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 959f76a4fc58..f7196f5bd8ba 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -253,6 +253,7 @@ def get_cmd(self, environment, active_resources): 'srun', '-n', f'{total_process_count}', + '--label', ] + split(self.args.launcher_args) if getattr(self.args, 'slurm_comment', ''): From 67e89b7dcce083709a4bcdab70ff17f2272af0e4 Mon Sep 17 00:00:00 2001 From: Quentin-Anthony Date: Tue, 28 Mar 2023 21:28:26 +0000 Subject: [PATCH 5/5] Print the hostname to be used in conjunction with --tag-output (mpirun) or --label (srun) --- deepspeed/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 9e2e25513a72..bead98a40a51 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -10,6 +10,7 @@ from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from packaging import version as pkg_version +import subprocess from . import ops from . import module_inject @@ -116,6 +117,11 @@ def initialize(args=None, __git_branch__), ranks=[0]) + hostname_cmd = ["hostname -I"] + result = subprocess.check_output(hostname_cmd, shell=True) + hostname = result.decode('utf-8').split()[0] + print("Host: {}".format(hostname)) + # Disable zero.Init context if it's currently enabled zero.partition_parameters.shutdown_init_context()