Skip to content

Commit

Permalink
Set up OpenMPI runner for stability cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
Quentin-Anthony committed Mar 10, 2023
1 parent 64d6c5a commit 592d346
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions deepspeed/launcher/multinode_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ def __init__(self, args, world_info_base64, resource_pool):
super().__init__(args, world_info_base64)
self.resource_pool = resource_pool
self.add_export('UCX_TLS', 'tcp')
#self.add_export('LD_PRELOAD', '/usr/local/cuda-11.6/lib/libnccl.so')
self.add_export('FI_EFA_FORK_SAFE', '1')
self.add_export('NCCL_DEBUG_SYBSYS', '')
self.add_export('FI_EFA_USE_DEVICE_RDMA', '1')
self.add_export('NCCL_TREE_THRESHOLD', '0')
self.add_export('NCCL_PROTO', 'simple')
self.add_export('NCCL_SOCKET_IFNAME', '^docker0,lo')

def backend_exists(self):
#TODO: if IB is available we should suggest mvapich
Expand Down Expand Up @@ -148,12 +155,19 @@ def get_cmd(self, environment, active_resources):
f'{total_process_count}',
'-hostfile',
f'{self.args.hostfile}',
'-x',
'PATH',
'-x',
'LD_LIBRARY_PATH',
'-x',
'LD_PRELOAD',
'--mca',
'btl',
'^openib',
'--mca',
'btl_tcp_if_include',
'eth0',
'plm',
'^slurm',
'--mca',
] + split(self.args.launcher_args)

export_cmd = []
Expand Down

0 comments on commit 592d346

Please sign in to comment.