[Core][State Observability] Truncate warning message is incorrect whe…

…n filter is used (ray-project#26801) Signed-off-by: rickyyx [email protected] # Why are these changes needed? When we returned less/incomplete results to users, there could be 3 reasons: Data being truncated at the data source (raylets -> API server) Data being filtered at the API server Data being limited at the API server We are not distinguishing the those 3 scenarios, but we should. This is why we thought data being truncated when it's actually filtered/limited. This PR distinguishes these scenarios and prompt warnings accordingly. # Related issue number Closes ray-project#26570 Closes ray-project#26923
charleslai2000 · Jul 26, 2022 · 259473c · 259473c
1 parent 65563e9
commit 259473c
Show file tree

Hide file tree

Showing 6 changed files with 215 additions and 93 deletions.
diff --git a/dashboard/modules/state/state_head.py b/dashboard/modules/state/state_head.py
@@ -18,6 +18,7 @@
 from ray.dashboard.state_aggregator import StateAPIManager
 from ray.dashboard.utils import Change
 from ray.experimental.state.common import (
+ RAY_MAX_LIMIT_FROM_API_SERVER,
  ListApiOptions,
  GetLogOptions,
  SummaryApiOptions,
@@ -166,6 +167,13 @@ def _options_from_req(self, req: aiohttp.web.Request) -> ListApiOptions:
  if req.query.get("limit") is not None
  else DEFAULT_LIMIT
  )
+
+ if limit > RAY_MAX_LIMIT_FROM_API_SERVER:
+ raise ValueError(
+ f"Given limit {limit} exceeds the supported "
+ f"limit {RAY_MAX_LIMIT_FROM_API_SERVER}. Use a lower limit."
+ )
+
  timeout = int(req.query.get("timeout"))
  filter_keys = req.query.getall("filter_keys", [])
  filter_predicates = req.query.getall("filter_predicates", [])

diff --git a/dashboard/state_aggregator.py b/dashboard/state_aggregator.py
@@ -20,7 +20,7 @@
  PlacementGroupState,
  RuntimeEnvState,
  SummaryApiResponse,
- MAX_LIMIT,
+ RAY_MAX_LIMIT_FROM_API_SERVER,
  SummaryApiOptions,
  TaskSummaries,
  StateSchema,
@@ -51,7 +51,7 @@
 )
 NODE_QUERY_FAILURE_WARNING = (
  "Failed to query data from {type}. "
- "Queryed {total} {type} "
+ "Queried {total} {type} "
  "and {network_failures} {type} failed to reply. It is due to "
  "(1) {type} is unexpectedly failed. "
  "(2) {type} is overloaded. "
@@ -202,14 +202,18 @@ async def list_actors(self, *, option: ListApiOptions) -> ListApiResponse:
  message=message, fields_to_decode=["actor_id", "owner_id"]
  )
  result.append(data)
-
+ num_after_truncation = len(result)
  result = self._filter(result, option.filters, ActorState, option.detail)
+ num_filtered = len(result)
+
  # Sort to make the output deterministic.
  result.sort(key=lambda entry: entry["actor_id"])
  result = list(islice(result, option.limit))
  return ListApiResponse(
  result=result,
  total=reply.total,
+ num_after_truncation=num_after_truncation,
+ num_filtered=num_filtered,
  )
 
  async def list_placement_groups(self, *, option: ListApiOptions) -> ListApiResponse:
@@ -234,15 +238,19 @@ async def list_placement_groups(self, *, option: ListApiOptions) -> ListApiRespo
  fields_to_decode=["placement_group_id", "node_id"],
  )
  result.append(data)
+ num_after_truncation = len(result)
 
  result = self._filter(
  result, option.filters, PlacementGroupState, option.detail
  )
+ num_filtered = len(result)
  # Sort to make the output deterministic.
  result.sort(key=lambda entry: entry["placement_group_id"])
  return ListApiResponse(
  result=list(islice(result, option.limit)),
  total=reply.total,
+ num_after_truncation=num_after_truncation,
+ num_filtered=num_filtered,
  )
 
  async def list_nodes(self, *, option: ListApiOptions) -> ListApiResponse:
@@ -263,15 +271,21 @@ async def list_nodes(self, *, option: ListApiOptions) -> ListApiResponse:
  data["node_ip"] = data["node_manager_address"]
  result.append(data)
 
+ total_nodes = len(result)
+ # No reason to truncate node because they are usually small.
+ num_after_truncation = len(result)
+
  result = self._filter(result, option.filters, NodeState, option.detail)
+ num_filtered = len(result)
+
  # Sort to make the output deterministic.
  result.sort(key=lambda entry: entry["node_id"])
- total_nodes = len(result)
  result = list(islice(result, option.limit))
  return ListApiResponse(
  result=result,
- # No reason to truncate node because they are usually small.
  total=total_nodes,
+ num_after_truncation=num_after_truncation,
+ num_filtered=num_filtered,
  )
 
  async def list_workers(self, *, option: ListApiOptions) -> ListApiResponse:
@@ -296,13 +310,17 @@ async def list_workers(self, *, option: ListApiOptions) -> ListApiResponse:
  data["ip"] = data["worker_address"]["ip_address"]
  result.append(data)
 
+ num_after_truncation = len(result)
  result = self._filter(result, option.filters, WorkerState, option.detail)
+ num_filtered = len(result)
  # Sort to make the output deterministic.
  result.sort(key=lambda entry: entry["worker_id"])
  result = list(islice(result, option.limit))
  return ListApiResponse(
  result=result,
  total=reply.total,
+ num_after_truncation=num_after_truncation,
+ num_filtered=num_filtered,
  )
 
  def list_jobs(self, *, option: ListApiOptions) -> ListApiResponse:
@@ -320,6 +338,8 @@ def list_jobs(self, *, option: ListApiOptions) -> ListApiResponse:
  result=result,
  # TODO(sang): Support this.
  total=len(result),
+ num_after_truncation=len(result),
+ num_filtered=len(result),
  )
 
  async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse:
@@ -382,15 +402,18 @@ async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse:
  TaskStatus.RUNNING
  ].name
  result.append(data)
-
+ num_after_truncation = len(result)
  result = self._filter(result, option.filters, TaskState, option.detail)
+ num_filtered = len(result)
  # Sort to make the output deterministic.
  result.sort(key=lambda entry: entry["task_id"])
  result = list(islice(result, option.limit))
  return ListApiResponse(
  result=result,
  partial_failure_warning=partial_failure_warning,
  total=total_tasks,
+ num_after_truncation=num_after_truncation,
+ num_filtered=num_filtered,
  )
 
  async def list_objects(self, *, option: ListApiOptions) -> ListApiResponse:
@@ -471,14 +494,18 @@ async def list_objects(self, *, option: ListApiOptions) -> ListApiResponse:
  "and `ray.init`."
  )
 
+ num_after_truncation = len(result)
  result = self._filter(result, option.filters, ObjectState, option.detail)
+ num_filtered = len(result)
  # Sort to make the output deterministic.
  result.sort(key=lambda entry: entry["object_id"])
  result = list(islice(result, option.limit))
  return ListApiResponse(
  result=result,
  partial_failure_warning=partial_failure_warning,
  total=total_objects,
+ num_after_truncation=num_after_truncation,
+ num_filtered=num_filtered,
  warnings=callsite_warning,
  )
 
@@ -515,7 +542,7 @@ async def list_runtime_envs(self, *, option: ListApiOptions) -> ListApiResponse:
  states = reply.runtime_env_states
  for state in states:
  data = self._message_to_dict(message=state, fields_to_decode=[])
- # Need to deseiralize this field.
+ # Need to deserialize this field.
  data["runtime_env"] = RuntimeEnv.deserialize(
  data["runtime_env"]
  ).to_dict()
@@ -535,8 +562,9 @@ async def list_runtime_envs(self, *, option: ListApiOptions) -> ListApiResponse:
  partial_failure_warning = (
  f"The returned data may contain incomplete result. {warning_msg}"
  )
-
+ num_after_truncation = len(result)
  result = self._filter(result, option.filters, RuntimeEnvState, option.detail)
+ num_filtered = len(result)
 
  # Sort to make the output deterministic.
  def sort_func(entry):
@@ -556,12 +584,16 @@ def sort_func(entry):
  result=result,
  partial_failure_warning=partial_failure_warning,
  total=total_runtime_envs,
+ num_after_truncation=num_after_truncation,
+ num_filtered=num_filtered,
  )
 
  async def summarize_tasks(self, option: SummaryApiOptions) -> SummaryApiResponse:
  # For summary, try getting as many entries as possible to minimze data loss.
  result = await self.list_tasks(
- option=ListApiOptions(timeout=option.timeout, limit=MAX_LIMIT, filters=[])
+ option=ListApiOptions(
+ timeout=option.timeout, limit=RAY_MAX_LIMIT_FROM_API_SERVER, filters=[]
+ )
  )
  summary = StateSummary(
  node_id_to_summary={
@@ -573,12 +605,15 @@ async def summarize_tasks(self, option: SummaryApiOptions) -> SummaryApiResponse
  result=summary,
  partial_failure_warning=result.partial_failure_warning,
  warnings=result.warnings,
+ num_after_truncation=result.num_after_truncation,
  )
 
  async def summarize_actors(self, option: SummaryApiOptions) -> SummaryApiResponse:
  # For summary, try getting as many entries as possible to minimze data loss.
  result = await self.list_actors(
- option=ListApiOptions(timeout=option.timeout, limit=MAX_LIMIT, filters=[])
+ option=ListApiOptions(
+ timeout=option.timeout, limit=RAY_MAX_LIMIT_FROM_API_SERVER, filters=[]
+ )
  )
  summary = StateSummary(
  node_id_to_summary={
@@ -590,12 +625,15 @@ async def summarize_actors(self, option: SummaryApiOptions) -> SummaryApiRespons
  result=summary,
  partial_failure_warning=result.partial_failure_warning,
  warnings=result.warnings,
+ num_after_truncation=result.num_after_truncation,
  )
 
  async def summarize_objects(self, option: SummaryApiOptions) -> SummaryApiResponse:
- # For summary, try getting as many entries as possible to minimze data loss.
+ # For summary, try getting as many entries as possible to minimize data loss.
  result = await self.list_objects(
- option=ListApiOptions(timeout=option.timeout, limit=MAX_LIMIT, filters=[])
+ option=ListApiOptions(
+ timeout=option.timeout, limit=RAY_MAX_LIMIT_FROM_API_SERVER, filters=[]
+ )
  )
  summary = StateSummary(
  node_id_to_summary={
@@ -607,6 +645,7 @@ async def summarize_objects(self, option: SummaryApiOptions) -> SummaryApiRespon
  result=summary,
  partial_failure_warning=result.partial_failure_warning,
  warnings=result.warnings,
+ num_after_truncation=result.num_after_truncation,
  )
 
  def _message_to_dict(

diff --git a/python/ray/experimental/state/api.py b/python/ray/experimental/state/api.py
@@ -314,6 +314,12 @@ def get(
  def _print_api_warning(self, resource: StateResource, api_response: dict):
  """Print the API warnings.
 
+ We print warnings for users:
+ 1. when some data sources are not available
+ 2. when results were truncated at the data source
+ 3. when results were limited
+ 4. when callsites not enabled for listing objects
+
  Args:
  resource: Resource names, i.e. 'jobs', 'actors', 'nodes',
  see `StateResource` for details.
@@ -324,16 +330,34 @@ def _print_api_warning(self, resource: StateResource, api_response: dict):
  if warning_msgs:
  warnings.warn(warning_msgs)
 
- # Print warnings if data is truncated.
- data = api_response["result"]
+ # Print warnings if data is truncated at the data source.
+ num_after_truncation = api_response["num_after_truncation"]
  total = api_response["total"]
- if total > len(data):
+ if total > num_after_truncation:
+ # NOTE(rickyyx): For now, there's not much users could do (neither can we),
+ # with hard truncation. Unless we allow users to set a higher
+ # `RAY_MAX_LIMIT_FROM_DATA_SOURCE`, the data will always be truncated at the
+ # data source.
+ warnings.warn(
+ (
+ f"{num_after_truncation} ({total} total) {resource.value} "
+ "are retrieved from the data source. "
+ f"{total - num_after_truncation} entries have been truncated. "
+ f"Max of {num_after_truncation} entries are retrieved from data "
+ "source to prevent over-sized payloads."
+ ),
+ )
+
+ # Print warnings if return data is limited at the API server due to
+ # limit enforced at the server side
+ num_filtered = api_response["num_filtered"]
+ data = api_response["result"]
+ if num_filtered > len(data):
  warnings.warn(
  (
- f"{len(data)} ({total} total) {resource.value} "
- f"are returned. {total - len(data)} entries have been truncated. "
- "Use `--filter` to reduce the amount of data to return "
- "or increase the limit by specifying`--limit`."
+ f"{len(data)}/{num_filtered} {resource.value} returned. "
+ "Use `--filter` to reduce the amount of data to return or "
+ "setting a higher limit with `--limit` to see all data. "
  ),
  )