Skip to content

Commit

Permalink
rm stop field(no need)
Browse files Browse the repository at this point in the history
  • Loading branch information
BalaBalaYi committed Jun 26, 2024
1 parent 494b046 commit 4c20d5d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
9 changes: 4 additions & 5 deletions dlrover/python/master/node/job_auto_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ class JobAutoScaler(metaclass=ABCMeta):

def __init__(self):
self._suggested_stop = False
self._stop_autoscaling = False
self._autoscaling_started = False

def suggested_stop(self):
Expand Down Expand Up @@ -157,7 +156,7 @@ def start_auto_scaling(self):
).start()

def stop_auto_scaling(self):
self._stop_autoscaling = True
self._autoscaling_started = False

def _periodic_optimize_running_resource(self):
"""Adjust job resource periodically and stop adjustment
Expand All @@ -167,7 +166,7 @@ def _periodic_optimize_running_resource(self):
last_plan_time = 0
opt_interval = _dlrover_context.seconds_interval_to_optimize
while True:
if self._stop_autoscaling:
if not self._autoscaling_started:
logger.info("Stop auto-scaling thread for PS Training.")
break
if (
Expand Down Expand Up @@ -286,13 +285,13 @@ def start_auto_scaling(self):
).start()

def stop_auto_scaling(self):
self._stop_autoscaling = True
self._autoscaling_started = False

def _periodic_adjust_worker(self):
"""Periodicaly adjust the number of worker."""
logger.info("Start auto-scaling thread for AllReduce Training.")
while True:
if self._stop_autoscaling:
if not self._autoscaling_started:
logger.info("Stop auto-scaling thread for AllReduce Training.")
break
time.sleep(self._scale_interval)
Expand Down
4 changes: 4 additions & 0 deletions dlrover/python/tests/test_job_auto_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ def test_execute_job_optimization_plan(self):
ps_addrs.append("test-edljob-ps-{}.default.svc:2222".format(i))
self.assertListEqual(scale_plan.ps_addrs, ps_addrs)
auto_scaler.start_auto_scaling()
self.assertTrue(auto_scaler._autoscaling_started)
auto_scaler.stop_auto_scaling()
self.assertFalse(auto_scaler._autoscaling_started)

def test_reduce_timeout_pending_node_resource(self):
params = MockK8sPSJobArgs()
Expand Down Expand Up @@ -142,4 +144,6 @@ def test_execute_job_optimization_plan(self):
alive_num = auto_scaler._get_alive_worker_num()
self.assertEqual(alive_num, 16)
auto_scaler.start_auto_scaling()
self.assertTrue(auto_scaler._autoscaling_started)
auto_scaler.stop_auto_scaling()
self.assertFalse(auto_scaler._autoscaling_started)

0 comments on commit 4c20d5d

Please sign in to comment.