Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[autoscaler] Autoscaler metrics #16066

Merged
merged 32 commits into from
Jun 1, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
46ea81b
initial
ckw017 May 24, 2021
7c7cd1a
sanity check
ckw017 May 24, 2021
f170974
lint and more
ckw017 May 25, 2021
4150bf0
remove extra file?
ckw017 May 25, 2021
771b304
format
ckw017 May 26, 2021
3f11bc6
store ip of machine running monitor process
ckw017 May 27, 2021
eb9edb1
autoscaler_ip -> monitor_ip
ckw017 May 27, 2021
b4130a8
lint
ckw017 May 27, 2021
09a8998
add worker startup time buckets
ckw017 May 27, 2021
849c1ba
better descriptions
ckw017 May 27, 2021
6f58b81
more lint
ckw017 May 27, 2021
9050bab
propagate exception when starting prom http
ckw017 May 27, 2021
7be69c1
lint
ckw017 May 27, 2021
8f682b4
fix redis set/get
ckw017 May 27, 2021
51f7a5c
move start_http to monitor.py
ckw017 May 27, 2021
4259ecd
break up exception types and add pending_nodes metric
ckw017 May 27, 2021
bb9896e
Adjust buckets, fix test_autoscaler failures
ckw017 May 27, 2021
091610a
Add metric_agent tests
ckw017 May 27, 2021
ee005b1
explain _AUTOSCALER_METRICS
ckw017 May 27, 2021
1d82167
add basic exception count checks
ckw017 May 27, 2021
a43b38e
more autoscaler metric tests
ckw017 May 28, 2021
25f55dc
less dangerous way to handle no prom_metrics
ckw017 May 28, 2021
f7013c2
more mock checks
ckw017 May 28, 2021
b2bd1e5
better docs
ckw017 May 28, 2021
a0c10f0
nits
ckw017 May 28, 2021
c069b8c
cases for started_nodes and worker_startup_time histogram
ckw017 May 28, 2021
2c18da5
add node_launch_exceptions case
ckw017 May 28, 2021
216060c
use waitFor
ckw017 May 28, 2021
377d2c8
don't start http server if monitor_ip isn't provided
ckw017 May 31, 2021
0f1a6b2
drop worker_startup_time
ckw017 May 31, 2021
a6cc229
lint
ckw017 May 31, 2021
2edcb1a
Hotfix [nodes -> workers] + [count failed nodes as stopped]
ijrsvt May 31, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Hotfix [nodes -> workers] + [count failed nodes as stopped]
  • Loading branch information
ijrsvt committed May 31, 2021
commit 2edcb1a244865dd13ea0ac0a975a623574ea9425
4 changes: 3 additions & 1 deletion python/ray/autoscaler/_private/autoscaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ def _update(self):
" Failed to update node."
" Node has already been terminated.")
if nodes_to_terminate:
+ self.prom_metrics.stopped_nodes.inc(
+ len(nodes_to_terminate))
self.provider.terminate_nodes(nodes_to_terminate)
nodes = self.workers()

Expand Down Expand Up @@ -750,7 +752,7 @@ def workers(self):
nodes = self.provider.non_terminated_nodes(
tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
# Update running nodes gauge whenever we check workers
- self.prom_metrics.running_nodes.set(len(nodes))
+ self.prom_metrics.running_workers.set(len(nodes))
return nodes

def unmanaged_workers(self):
Expand Down
6 changes: 3 additions & 3 deletions python/ray/autoscaler/_private/prom_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ def __init__(self, registry: CollectorRegistry = None):
unit="nodes",
namespace="autoscaler",
registry=self.registry)
- self.running_nodes: Gauge = Gauge(
- "running_nodes",
- "Number of nodes running.",
+ self.running_workers: Gauge = Gauge(
+ "running_workers",
+ "Number of worker nodes running.",
unit="nodes",
namespace="autoscaler",
registry=self.registry)
Expand Down
12 changes: 8 additions & 4 deletions python/ray/tests/test_autoscaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -901,8 +901,8 @@ def testScaleUp(self):
autoscaler.update()
self.waitForNodes(2)

- # running_nodes metric should be set to 2
- mock_metrics.running_nodes.set.assert_called_with(2)
+ # running_workers metric should be set to 2
+ mock_metrics.running_workers.set.assert_called_with(2)

def testTerminateOutdatedNodesGracefully(self):
config = SMALL_CLUSTER.copy()
Expand Down Expand Up @@ -998,7 +998,7 @@ def testDynamicScaling(self):
assert ("Removing 1 nodes of type "
"ray-legacy-worker-node-type (max workers)." in events), events
assert mock_metrics.stopped_nodes.inc.call_count == 1
- mock_metrics.running_nodes.set.assert_called_with(10)
+ mock_metrics.running_workers.set.assert_called_with(10)

def testInitialWorkers(self):
"""initial_workers is deprecated, this tests that it is ignored."""
Expand Down Expand Up @@ -2305,12 +2305,14 @@ def testNodeTerminatedDuringUpdate(self):
self.provider = MockProvider()
runner = MockProcessRunner()
lm = LoadMetrics()
+ mock_metrics = Mock(spec=AutoscalerPrometheusMetrics())
autoscaler = StandardAutoscaler(
config_path,
lm,
max_failures=0,
process_runner=runner,
- update_interval_s=0)
+ update_interval_s=0,
+ prom_metrics=mock_metrics)

# Scale up to two up-to-date workers
autoscaler.update()
Expand Down Expand Up @@ -2388,6 +2390,8 @@ def terminate_worker_zero():
assert set(autoscaler.workers()) == {2, 3},\
"Unexpected node_ids"

+ assert len(mock_metrics.stopped_nodes.mock_calls) == 1

def testProviderException(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
Expand Down
2 changes: 1 addition & 1 deletion python/ray/tests/test_metrics_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
_AUTOSCALER_METRICS = [
"autoscaler_config_validation_exceptions",
"autoscaler_node_launch_exceptions", "autoscaler_pending_nodes",
- "autoscaler_reset_exceptions", "autoscaler_running_nodes",
+ "autoscaler_reset_exceptions", "autoscaler_running_workers",
"autoscaler_started_nodes", "autoscaler_stopped_nodes",
"autoscaler_update_loop_exceptions"
]
Expand Down