diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 9e2e25513a72..bead98a40a51 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -10,6 +10,7 @@ from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from packaging import version as pkg_version +import subprocess from . import ops from . import module_inject @@ -116,6 +117,11 @@ def initialize(args=None, __git_branch__), ranks=[0]) + hostname_cmd = ["hostname -I"] + result = subprocess.check_output(hostname_cmd, shell=True) + hostname = result.decode('utf-8').split()[0] + print("Host: {}".format(hostname)) + # Disable zero.Init context if it's currently enabled zero.partition_parameters.shutdown_init_context() diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 6027d1076e80..f7196f5bd8ba 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -120,6 +120,13 @@ def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool self.add_export('UCX_TLS', 'tcp') + #self.add_export('LD_PRELOAD', '/usr/local/cuda-11.6/lib/libnccl.so') + self.add_export('FI_EFA_FORK_SAFE', '1') + self.add_export('NCCL_DEBUG_SUBSYS', '') + self.add_export('FI_EFA_USE_DEVICE_RDMA', '1') + self.add_export('NCCL_TREE_THRESHOLD', '0') + self.add_export('NCCL_PROTO', 'simple') + self.add_export('NCCL_SOCKET_IFNAME', '^docker0,lo') def backend_exists(self): #TODO: if IB is available we should suggestion mvapich @@ -148,12 +155,20 @@ def get_cmd(self, environment, active_resources): f'{total_process_count}', '-hostfile', f'{self.args.hostfile}', + '-x', + 'PATH', + '-x', + 'LD_LIBRARY_PATH', + '-x', + 'LD_PRELOAD', '--mca', 'btl', '^openib', '--mca', - 'btl_tcp_if_include', - 'eth0', + 'plm', + '^slurm', + '--tag-output', + '--timestamp-output', ] + split(self.args.launcher_args) export_cmd = [] @@ -238,6 +253,7 @@ def 
get_cmd(self, environment, active_resources): 'srun', '-n', f'{total_process_count}', + '--label', ] + split(self.args.launcher_args) if getattr(self.args, 'slurm_comment', ''): diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index a080559b1a2a..765733f45ae8 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -990,9 +990,7 @@ def _exec_send_grads(self, buffer_id): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [ - elt.grad for elt in inputs[1:] if elt.grad is not None - ] + inputs_grad_tail = [elt.grad for elt in inputs[1:]] elif torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = []