Skip to content

Commit

Permalink
[Serve] Add Multiplex metrics into dashboard (ray-project#37722)
Browse files Browse the repository at this point in the history
  • Loading branch information
sihanwang41 authored and harborn committed Aug 17, 2023
1 parent 634ee80 commit b2caaba
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,108 @@
stack=False,
grid_pos=GridPos(16, 2, 8, 8),
),
Panel(
id=10,
title="Multiplexed models per replica",
description="The number of multiplexed models for each replica.",
unit="models",
targets=[
Target(
expr="sum(ray_serve_num_multiplexed_models{{{global_filters}}}) by (deployment, replica)",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(0, 3, 8, 8),
),
Panel(
id=11,
title="Multiplexed model loads per replica",
description="The number of times of multiplexed models loaded for each replica.",
unit="times",
targets=[
Target(
expr="sum(ray_serve_multiplexed_models_load_counter{{{global_filters}}}) by (deployment, replica)",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(8, 3, 8, 8),
),
Panel(
id=12,
title="Multiplexed model unloads per replica",
description="The number of times of multiplexed models unloaded for each replica.",
unit="times",
targets=[
Target(
expr="sum(ray_serve_multiplexed_models_unload_counter{{{global_filters}}}) by (deployment, replica)",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(16, 3, 8, 8),
),
Panel(
id=13,
title="P99 latency of multiplexed model loads per replica",
description="P99 latency of mutliplexed model load per replica.",
unit="ms",
targets=[
Target(
expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{{{global_filters}}}[5m])) by (deployment, replica, le))",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(0, 4, 8, 8),
),
Panel(
id=14,
title="P99 latency of multiplexed model unloads per replica",
description="P99 latency of mutliplexed model unload per replica.",
unit="ms",
targets=[
Target(
expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{{{global_filters}}}[5m])) by (deployment, replica, le))",
legend="{{replica}}",
),
],
fill=0,
stack=False,
grid_pos=GridPos(8, 4, 8, 8),
),
Panel(
id=15,
title="Multiplexed model ids per replica",
description="The ids of multiplexed models for each replica.",
unit="model",
targets=[
Target(
expr="ray_serve_registered_multiplexed_model_id{{{global_filters}}}",
legend="{{replica}}:{{model_id}}",
),
],
grid_pos=GridPos(16, 4, 8, 8),
stack=False,
),
Panel(
id=16,
title="Multiplexed model cache hit rate",
description="The cache hit rate of multiplexed models for the deployment.",
unit="%",
targets=[
Target(
expr="(1 - sum(rate(ray_serve_multiplexed_models_load_counter{{{global_filters}}}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter{{{global_filters}}}[5m])))",
legend="{{replica}}",
),
],
grid_pos=GridPos(0, 5, 8, 8),
),
]

ids = []
Expand Down
15 changes: 13 additions & 2 deletions doc/source/serve/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,12 @@ The following metrics are exposed by Ray Serve:
- * route
* application
- The end-to-end latency of HTTP requests (measured from the Serve HTTP proxy).
* - ``serve_multiplexed_model_load_latency_s``
* - ``serve_multiplexed_model_load_latency_ms``
- * deployment
* replica
* application
- The time it takes to load a model.
* - ``serve_multiplexed_model_unload_latency_s``
* - ``serve_multiplexed_model_unload_latency_ms``
- * deployment
* replica
* application
Expand All @@ -368,6 +368,17 @@ The following metrics are exposed by Ray Serve:
* replica
* application
- The number of times models have been loaded on the current replica.
* - ``serve_registered_multiplexed_model_id``
- * deployment
* replica
* application
* model_id
- The multiplexed model ID registered on the current replica.
* - ``serve_multiplexed_get_model_requests_counter``
- * deployment
* replica
* application
- The number of calls to get a multiplexed model.
```
[*] - only available when using HTTP calls
[**] - only available when using Python `ServeHandle` calls
Expand Down
37 changes: 29 additions & 8 deletions python/ray/serve/multiplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from ray._private.async_compat import sync_to_async
from ray.serve._private.constants import (
DEFAULT_LATENCY_BUCKET_MS,
SERVE_LOGGER_NAME,
PUSH_MULTIPLEXED_MODEL_IDS_INTERVAL_S,
)
Expand Down Expand Up @@ -56,19 +57,30 @@ def __init__(
self.self_arg: Any = self_arg
self.max_num_models_per_replica: int = max_num_models_per_replica

self.model_load_latency_s = metrics.Gauge(
"serve_multiplexed_model_load_latency_s",
self.model_load_latency_ms = metrics.Histogram(
"serve_multiplexed_model_load_latency_ms",
description="The time it takes to load a model.",
boundaries=DEFAULT_LATENCY_BUCKET_MS,
)
self.model_unload_latency_s = metrics.Gauge(
"serve_multiplexed_model_unload_latency_s",
self.model_unload_latency_ms = metrics.Histogram(
"serve_multiplexed_model_unload_latency_ms",
description="The time it takes to unload a model.",
boundaries=DEFAULT_LATENCY_BUCKET_MS,
)
self.num_models = metrics.Gauge(
self.num_models_gauge = metrics.Gauge(
"serve_num_multiplexed_models",
description="The number of models loaded on the current replica.",
)

self.registered_model_gauge = metrics.Gauge(
"serve_registered_multiplexed_model_id",
description="The model id registered on the current replica.",
tag_keys=("model_id",),
)
self.get_model_requests_counter = metrics.Counter(
"serve_multiplexed_get_model_requests_counter",
description="The counter for get model requests on the current replica.",
)
self.models_unload_counter = metrics.Counter(
"serve_multiplexed_models_unload_counter",
description="The counter for unloaded models on the current replica.",
Expand Down Expand Up @@ -117,6 +129,12 @@ def _get_loading_and_loaded_model_ids(self) -> List[str]:
def _push_model_ids_info(self):
"""Push the multiplexed replica info to the controller."""
try:

self.num_models_gauge.set(len(self.models))

for model_id in self.models:
self.registered_model_gauge.set(1, tags={"model_id": model_id})

if self._push_multiplexed_replica_info:
get_global_client().record_multiplexed_replica_info(
MultiplexedReplicaInfo(
Expand Down Expand Up @@ -158,7 +176,7 @@ async def load_model(self, model_id: str) -> Any:
if not model_id:
raise ValueError("The model ID cannot be empty.")

self.num_models.set(len(self.models))
self.get_model_requests_counter.inc()

if model_id in self.models:
# Move the model to the end of the OrderedDict to ensure LRU caching.
Expand Down Expand Up @@ -203,7 +221,9 @@ async def load_model(self, model_id: str) -> Any:
f"Successfully loaded model '{model_id}' in {loaded_time}s."
)
self._model_load_tasks.discard(model_id)
self.model_load_latency_s.set(time.time() - load_start_time)
self.model_load_latency_ms.observe(
(time.time() - load_start_time) * 1000.0
)
return self.models[model_id]
except Exception as e:
logger.error(
Expand All @@ -229,5 +249,6 @@ async def unload_model_lru(self) -> None:
await sync_to_async(model.__del__)()
setattr(model, "__del__", lambda _: None)
unloaded_time = time.time() - unload_start_time
self.model_unload_latency_s.set(unloaded_time)
self.model_unload_latency_ms.observe(unloaded_time * 1000.0)
logger.info(f"Successfully unloaded model '{model_id}' in {unloaded_time}s.")
self.registered_model_gauge.set(0, tags={"model_id": model_id})
4 changes: 2 additions & 2 deletions python/ray/serve/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -849,8 +849,8 @@ async def __call__(self, model_id: str):
# Trigger model eviction.
handle.remote("model3")
expected_metrics = [
"serve_multiplexed_model_load_latency_s",
"serve_multiplexed_model_unload_latency_s",
"serve_multiplexed_model_load_latency_ms",
"serve_multiplexed_model_unload_latency_ms",
"serve_num_multiplexed_models",
"serve_multiplexed_models_load_counter",
"serve_multiplexed_models_unload_counter",
Expand Down

0 comments on commit b2caaba

Please sign in to comment.