Skip to content

Commit

Permalink
Set the NCCL env to execute the GPU test task. (intelligent-machine-learning#1148)
Browse files Browse the repository at this point in the history

* Set the NCCL env to execute the GPU test task.

* Fix test cases.

* Address review comments.
  • Loading branch information
workingloong committed May 30, 2024
1 parent 0dfeadc commit 27c965f
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 0 deletions.
8 changes: 8 additions & 0 deletions dlrover/trainer/tests/torch/node_check_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from dlrover.trainer.torch.node_check.ascend_npu import main as npu_main
from dlrover.trainer.torch.node_check.nvidia_gpu import main as gpu_main
from dlrover.trainer.torch.node_check.nvidia_gpu import set_nccl_env
from dlrover.trainer.torch.node_check.utils import mock_error


Expand Down Expand Up @@ -65,3 +66,10 @@ def test_mock_error(self):
except ValueError:
raised_error = True
self.assertTrue(raised_error)

def test_set_nccl_env(self):
    """Check that set_nccl_env exports every pair listed in NCCL_SETTINGS."""
    # Imported locally so this method does not rely on a new file-level
    # import.
    from unittest import mock

    settings = "NCCL_DEBUG=INFO,NCCL_SOCKET_IFNAME=eth0,NCCL_IB_GID_INDEX=3"
    # patch.dict restores os.environ when the block exits, so neither
    # NCCL_SETTINGS nor the variables written by set_nccl_env leak into
    # other tests running in the same process.
    with mock.patch.dict(os.environ, {"NCCL_SETTINGS": settings}):
        set_nccl_env()
        self.assertEqual(os.environ["NCCL_DEBUG"], "INFO")
        self.assertEqual(os.environ["NCCL_SOCKET_IFNAME"], "eth0")
        self.assertEqual(os.environ["NCCL_IB_GID_INDEX"], "3")
8 changes: 8 additions & 0 deletions dlrover/trainer/torch/node_check/nvidia_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@
from .utils import bm_allreduce, matmul, record_execution_time


def set_nccl_env():
    """Export the pairs listed in ``NCCL_SETTINGS`` as environment variables.

    ``NCCL_SETTINGS`` is read as a comma-separated list of ``KEY=VALUE``
    items, e.g. ``"NCCL_DEBUG=INFO,NCCL_SOCKET_IFNAME=eth0"``. Each item is
    written into ``os.environ`` of the current process.

    Nothing is changed when ``NCCL_SETTINGS`` is unset or empty. Items that
    have no ``=`` or an empty key (for example the empty item produced by a
    trailing comma) are skipped instead of raising. Surrounding whitespace
    of keys and values is stripped.
    """
    env_conf = os.getenv("NCCL_SETTINGS", "")
    for item in env_conf.split(","):
        # Split on the first "=" only, so a value may itself contain "=".
        key, sep, value = item.partition("=")
        key = key.strip()
        if not sep or not key:
            # Empty or malformed item: there is nothing sensible to export.
            continue
        os.environ[key] = value.strip()


@record_execution_time
def main():
use_cuda = torch.cuda.is_available()
Expand All @@ -39,4 +46,5 @@ def main():


# Script entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    # Export the NCCL_SETTINGS pairs into the environment before main() runs.
    set_nccl_env()
    t = main()

0 comments on commit 27c965f

Please sign in to comment.