Skip to content

Commit

Permalink
Merge pull request intelligent-machine-learning#1160 from BalaBalaYi/…
Browse files Browse the repository at this point in the history
…add_log_for_master_svc_check

add log for master svc check
  • Loading branch information
BalaBalaYi committed Jun 17, 2024
2 parents f0d5f88 + 5051fe3 commit 7fca2ba
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
16 changes: 13 additions & 3 deletions dlrover/python/master/scaler/pod_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def _get_master_addr(self):
# the service is not available.
logger.info(
f"The service {master_addr} is not available and "
"use the IP of master Pod."
f"use the IP of master Pod."
)
master_ip = os.getenv("POD_IP", "")
if not master_ip:
Expand Down Expand Up @@ -496,14 +496,24 @@ def _create_pod(self, node: Node, pod_stats: Dict[str, int], ps_addrs):

def _check_master_service_avaliable(self, host, port, timeout=15):
"""Verify that the master grpc servicer is available."""
for _ in range(timeout):
for i in range(timeout):
try:
telnetlib.Telnet(host=host, port=port, timeout=3)
return True
except socket.gaierror:
logger.warning(
f"Attempt {i}: Encountered gaierror while "
f"performing master service check."
)
return False
except Exception:
except Exception as e:
logger.warning(
f"Attempt {i}: Encountered {str(e)} while "
f"performing master service check."
)
time.sleep(1)

logger.warning(f"Master service check failed after {timeout} retries.")
return False

def _patch_tf_config_into_env(self, pod, node: Node, pod_stats, ps_addrs):
Expand Down
21 changes: 19 additions & 2 deletions dlrover/python/tests/test_pod_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,31 @@ def test_init_pod_template(self):
],
)

def test_create_pod(self):
def test_check_master_service_avaliable(self):
scaler = PodScaler("elasticjob-sample", "default")
_dlrover_ctx.config_master_port()
port = _dlrover_ctx.master_port
if 22222 == port:
wrong_port = 11111
else:
wrong_port = 22222
passed = scaler._check_master_service_avaliable(
"elasticjob-test-master", wrong_port, 2
)
self.assertFalse(passed)

passed = scaler._check_master_service_avaliable(
"elasticjob-test-master", 2222, 2
"localhost", wrong_port, 2
)
self.assertFalse(passed)

passed = scaler._check_master_service_avaliable("localhost", port, 2)
self.assertFalse(passed)

def test_create_pod(self):
scaler = PodScaler("elasticjob-sample", "default")
_dlrover_ctx.config_master_port()

scaler.start()
scaler._init_pod_config_by_job()
scaler._distribution_strategy = DistributionStrategy.PS
Expand Down

0 comments on commit 7fca2ba

Please sign in to comment.