Skip to content

Commit

Permalink
[Serve] Add Multiplex metrics into dashboard (ray-project#37722)
Browse files Browse the repository at this point in the history
  • Loading branch information
sihanwang41 authored and harborn committed Aug 17, 2023
1 parent 634ee80 commit b2caaba
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,108 @@
stack=False,
grid_pos=GridPos(16, 2, 8, 8),
),
Panel(
id=10,
title="Multiplexed models per replica",
description="The number of multiplexed models for each replica.",
unit="models",
targets=[
Target(
expr="sum(ray_serve_num_multiplexed_models{{{global_filters}}}) by (deployment, replica)",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(0, 3, 8, 8),
),
Panel(
id=11,
title="Multiplexed model loads per replica",
description="The number of times of multiplexed models loaded for each replica.",
unit="times",
targets=[
Target(
expr="sum(ray_serve_multiplexed_models_load_counter{{{global_filters}}}) by (deployment, replica)",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(8, 3, 8, 8),
),
Panel(
id=12,
title="Multiplexed model unloads per replica",
description="The number of times of multiplexed models unloaded for each replica.",
unit="times",
targets=[
Target(
expr="sum(ray_serve_multiplexed_models_unload_counter{{{global_filters}}}) by (deployment, replica)",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(16, 3, 8, 8),
),
Panel(
id=13,
title="P99 latency of multiplexed model loads per replica",
description="P99 latency of mutliplexed model load per replica.",
unit="ms",
targets=[
Target(
expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{{{global_filters}}}[5m])) by (deployment, replica, le))",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(0, 4, 8, 8),
),
Panel(
id=14,
title="P99 latency of multiplexed model unloads per replica",
description="P99 latency of mutliplexed model unload per replica.",
unit="ms",
targets=[
Target(
expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{{{global_filters}}}[5m])) by (deployment, replica, le))",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(8, 4, 8, 8),
),
Panel(
id=15,
title="Multiplexed model ids per replica",
description="The ids of multiplexed models for each replica.",
unit="model",
targets=[
Target(
expr="ray_serve_registered_multiplexed_model_id{{{global_filters}}}",
legend="{{replica}}:{{model_id}}",
),
],
grid_pos=GridPos(16, 4, 8, 8),
stack=False,
),
Panel(
id=16,
title="Multiplexed model cache hit rate",
description="The cache hit rate of multiplexed models for the deployment.",
unit="%",
targets=[
Target(
expr="(1 - sum(rate(ray_serve_multiplexed_models_load_counter{{{global_filters}}}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter{{{global_filters}}}[5m])))",
legend="{{replica}}",
),
],
grid_pos=GridPos(0, 5, 8, 8),
),
]

ids = []
Expand Down
15 changes: 13 additions & 2 deletions doc/source/serve/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,12 @@ The following metrics are exposed by Ray Serve:
- * route
* application
- The end-to-end latency of HTTP requests (measured from the Serve HTTP proxy).
* - ``serve_multiplexed_model_load_latency_s``
* - ``serve_multiplexed_model_load_latency_ms``
- * deployment
* replica
* application
- The time it takes to load a model.
* - ``serve_multiplexed_model_unload_latency_s``
* - ``serve_multiplexed_model_unload_latency_ms``
- * deployment
* replica
* application
Expand All @@ -368,6 +368,17 @@ The following metrics are exposed by Ray Serve:
* replica
* application
- The number of times models have been loaded on the current replica.
* - ``serve_registered_multiplexed_model_id``
- * deployment
* replica
* application
* model_id
- The multiplexed model ID registered on the current replica.
* - ``serve_multiplexed_get_model_requests_counter``
- * deployment
* replica
* application
- The number of calls to get a multiplexed model.
```
[*] - only available when using HTTP calls
[**] - only available when using Python `ServeHandle` calls
Expand Down
37 changes: 29 additions & 8 deletions python/ray/serve/multiplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from ray._private.async_compat import sync_to_async
from ray.serve._private.constants import (
DEFAULT_LATENCY_BUCKET_MS,
SERVE_LOGGER_NAME,
PUSH_MULTIPLEXED_MODEL_IDS_INTERVAL_S,
)
Expand Down Expand Up @@ -56,19 +57,30 @@ def __init__(
self.self_arg: Any = self_arg
self.max_num_models_per_replica: int = max_num_models_per_replica

self.model_load_latency_s = metrics.Gauge(
"serve_multiplexed_model_load_latency_s",
self.model_load_latency_ms = metrics.Histogram(
"serve_multiplexed_model_load_latency_ms",
description="The time it takes to load a model.",
boundaries=DEFAULT_LATENCY_BUCKET_MS,
)
self.model_unload_latency_s = metrics.Gauge(
"serve_multiplexed_model_unload_latency_s",
self.model_unload_latency_ms = metrics.Histogram(
"serve_multiplexed_model_unload_latency_ms",
description="The time it takes to unload a model.",
boundaries=DEFAULT_LATENCY_BUCKET_MS,
)
self.num_models = metrics.Gauge(
self.num_models_gauge = metrics.Gauge(
"serve_num_multiplexed_models",
description="The number of models loaded on the current replica.",
)

self.registered_model_gauge = metrics.Gauge(
"serve_registered_multiplexed_model_id",
description="The model id registered on the current replica.",
tag_keys=("model_id",),
)
self.get_model_requests_counter = metrics.Counter(
"serve_multiplexed_get_model_requests_counter",
description="The counter for get model requests on the current replica.",
)
self.models_unload_counter = metrics.Counter(
"serve_multiplexed_models_unload_counter",
description="The counter for unloaded models on the current replica.",
Expand Down Expand Up @@ -117,6 +129,12 @@ def _get_loading_and_loaded_model_ids(self) -> List[str]:
def _push_model_ids_info(self):
"""Push the multiplexed replica info to the controller."""
try:

self.num_models_gauge.set(len(self.models))

for model_id in self.models:
self.registered_model_gauge.set(1, tags={"model_id": model_id})

if self._push_multiplexed_replica_info:
get_global_client().record_multiplexed_replica_info(
MultiplexedReplicaInfo(
Expand Down Expand Up @@ -158,7 +176,7 @@ async def load_model(self, model_id: str) -> Any:
if not model_id:
raise ValueError("The model ID cannot be empty.")

self.num_models.set(len(self.models))
self.get_model_requests_counter.inc()

if model_id in self.models:
# Move the model to the end of the OrderedDict to ensure LRU caching.
Expand Down Expand Up @@ -203,7 +221,9 @@ async def load_model(self, model_id: str) -> Any:
f"Successfully loaded model '{model_id}' in {loaded_time}s."
)
self._model_load_tasks.discard(model_id)
self.model_load_latency_s.set(time.time() - load_start_time)
self.model_load_latency_ms.observe(
(time.time() - load_start_time) * 1000.0
)
return self.models[model_id]
except Exception as e:
logger.error(
Expand All @@ -229,5 +249,6 @@ async def unload_model_lru(self) -> None:
await sync_to_async(model.__del__)()
setattr(model, "__del__", lambda _: None)
unloaded_time = time.time() - unload_start_time
self.model_unload_latency_s.set(unloaded_time)
self.model_unload_latency_ms.observe(unloaded_time * 1000.0)
logger.info(f"Successfully unloaded model '{model_id}' in {unloaded_time}s.")
self.registered_model_gauge.set(0, tags={"model_id": model_id})
4 changes: 2 additions & 2 deletions python/ray/serve/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -849,8 +849,8 @@ async def __call__(self, model_id: str):
# Trigger model eviction.
handle.remote("model3")
expected_metrics = [
"serve_multiplexed_model_load_latency_s",
"serve_multiplexed_model_unload_latency_s",
"serve_multiplexed_model_load_latency_ms",
"serve_multiplexed_model_unload_latency_ms",
"serve_num_multiplexed_models",
"serve_multiplexed_models_load_counter",
"serve_multiplexed_models_unload_counter",
Expand Down

0 comments on commit b2caaba

Please sign in to comment.