Skip to content

Commit

Permalink
[Core][State Observability] Truncate warning message is incorrect whe…
Browse files Browse the repository at this point in the history
…n filter is used (ray-project#26801)

Signed-off-by: rickyyx [email protected]

# Why are these changes needed?
When we returned less/incomplete results to users, there could be 3 reasons:

Data being truncated at the data source (raylets -> API server)
Data being filtered at the API server
Data being limited at the API server
We are not distinguishing the those 3 scenarios, but we should. This is why we thought data being truncated when it's actually filtered/limited.

This PR distinguishes these scenarios and prompt warnings accordingly.

# Related issue number
Closes ray-project#26570
Closes ray-project#26923
  • Loading branch information
rickyyx committed Jul 26, 2022
1 parent 65563e9 commit 259473c
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 93 deletions.
8 changes: 8 additions & 0 deletions dashboard/modules/state/state_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ray.dashboard.state_aggregator import StateAPIManager
from ray.dashboard.utils import Change
from ray.experimental.state.common import (
RAY_MAX_LIMIT_FROM_API_SERVER,
ListApiOptions,
GetLogOptions,
SummaryApiOptions,
Expand Down Expand Up @@ -166,6 +167,13 @@ def _options_from_req(self, req: aiohttp.web.Request) -> ListApiOptions:
if req.query.get("limit") is not None
else DEFAULT_LIMIT
)

if limit > RAY_MAX_LIMIT_FROM_API_SERVER:
raise ValueError(
f"Given limit {limit} exceeds the supported "
f"limit {RAY_MAX_LIMIT_FROM_API_SERVER}. Use a lower limit."
)

timeout = int(req.query.get("timeout"))
filter_keys = req.query.getall("filter_keys", [])
filter_predicates = req.query.getall("filter_predicates", [])
Expand Down
63 changes: 51 additions & 12 deletions dashboard/state_aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
PlacementGroupState,
RuntimeEnvState,
SummaryApiResponse,
MAX_LIMIT,
RAY_MAX_LIMIT_FROM_API_SERVER,
SummaryApiOptions,
TaskSummaries,
StateSchema,
Expand Down Expand Up @@ -51,7 +51,7 @@
)
NODE_QUERY_FAILURE_WARNING = (
"Failed to query data from {type}. "
"Queryed {total} {type} "
"Queried {total} {type} "
"and {network_failures} {type} failed to reply. It is due to "
"(1) {type} is unexpectedly failed. "
"(2) {type} is overloaded. "
Expand Down Expand Up @@ -202,14 +202,18 @@ async def list_actors(self, *, option: ListApiOptions) -> ListApiResponse:
message=message, fields_to_decode=["actor_id", "owner_id"]
)
result.append(data)

num_after_truncation = len(result)
result = self._filter(result, option.filters, ActorState, option.detail)
num_filtered = len(result)

# Sort to make the output deterministic.
result.sort(key=lambda entry: entry["actor_id"])
result = list(islice(result, option.limit))
return ListApiResponse(
result=result,
total=reply.total,
num_after_truncation=num_after_truncation,
num_filtered=num_filtered,
)

async def list_placement_groups(self, *, option: ListApiOptions) -> ListApiResponse:
Expand All @@ -234,15 +238,19 @@ async def list_placement_groups(self, *, option: ListApiOptions) -> ListApiRespo
fields_to_decode=["placement_group_id", "node_id"],
)
result.append(data)
num_after_truncation = len(result)

result = self._filter(
result, option.filters, PlacementGroupState, option.detail
)
num_filtered = len(result)
# Sort to make the output deterministic.
result.sort(key=lambda entry: entry["placement_group_id"])
return ListApiResponse(
result=list(islice(result, option.limit)),
total=reply.total,
num_after_truncation=num_after_truncation,
num_filtered=num_filtered,
)

async def list_nodes(self, *, option: ListApiOptions) -> ListApiResponse:
Expand All @@ -263,15 +271,21 @@ async def list_nodes(self, *, option: ListApiOptions) -> ListApiResponse:
data["node_ip"] = data["node_manager_address"]
result.append(data)

total_nodes = len(result)
# No reason to truncate node because they are usually small.
num_after_truncation = len(result)

result = self._filter(result, option.filters, NodeState, option.detail)
num_filtered = len(result)

# Sort to make the output deterministic.
result.sort(key=lambda entry: entry["node_id"])
total_nodes = len(result)
result = list(islice(result, option.limit))
return ListApiResponse(
result=result,
# No reason to truncate node because they are usually small.
total=total_nodes,
num_after_truncation=num_after_truncation,
num_filtered=num_filtered,
)

async def list_workers(self, *, option: ListApiOptions) -> ListApiResponse:
Expand All @@ -296,13 +310,17 @@ async def list_workers(self, *, option: ListApiOptions) -> ListApiResponse:
data["ip"] = data["worker_address"]["ip_address"]
result.append(data)

num_after_truncation = len(result)
result = self._filter(result, option.filters, WorkerState, option.detail)
num_filtered = len(result)
# Sort to make the output deterministic.
result.sort(key=lambda entry: entry["worker_id"])
result = list(islice(result, option.limit))
return ListApiResponse(
result=result,
total=reply.total,
num_after_truncation=num_after_truncation,
num_filtered=num_filtered,
)

def list_jobs(self, *, option: ListApiOptions) -> ListApiResponse:
Expand All @@ -320,6 +338,8 @@ def list_jobs(self, *, option: ListApiOptions) -> ListApiResponse:
result=result,
# TODO(sang): Support this.
total=len(result),
num_after_truncation=len(result),
num_filtered=len(result),
)

async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse:
Expand Down Expand Up @@ -382,15 +402,18 @@ async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse:
TaskStatus.RUNNING
].name
result.append(data)

num_after_truncation = len(result)
result = self._filter(result, option.filters, TaskState, option.detail)
num_filtered = len(result)
# Sort to make the output deterministic.
result.sort(key=lambda entry: entry["task_id"])
result = list(islice(result, option.limit))
return ListApiResponse(
result=result,
partial_failure_warning=partial_failure_warning,
total=total_tasks,
num_after_truncation=num_after_truncation,
num_filtered=num_filtered,
)

async def list_objects(self, *, option: ListApiOptions) -> ListApiResponse:
Expand Down Expand Up @@ -471,14 +494,18 @@ async def list_objects(self, *, option: ListApiOptions) -> ListApiResponse:
"and `ray.init`."
)

num_after_truncation = len(result)
result = self._filter(result, option.filters, ObjectState, option.detail)
num_filtered = len(result)
# Sort to make the output deterministic.
result.sort(key=lambda entry: entry["object_id"])
result = list(islice(result, option.limit))
return ListApiResponse(
result=result,
partial_failure_warning=partial_failure_warning,
total=total_objects,
num_after_truncation=num_after_truncation,
num_filtered=num_filtered,
warnings=callsite_warning,
)

Expand Down Expand Up @@ -515,7 +542,7 @@ async def list_runtime_envs(self, *, option: ListApiOptions) -> ListApiResponse:
states = reply.runtime_env_states
for state in states:
data = self._message_to_dict(message=state, fields_to_decode=[])
# Need to deseiralize this field.
# Need to deserialize this field.
data["runtime_env"] = RuntimeEnv.deserialize(
data["runtime_env"]
).to_dict()
Expand All @@ -535,8 +562,9 @@ async def list_runtime_envs(self, *, option: ListApiOptions) -> ListApiResponse:
partial_failure_warning = (
f"The returned data may contain incomplete result. {warning_msg}"
)

num_after_truncation = len(result)
result = self._filter(result, option.filters, RuntimeEnvState, option.detail)
num_filtered = len(result)

# Sort to make the output deterministic.
def sort_func(entry):
Expand All @@ -556,12 +584,16 @@ def sort_func(entry):
result=result,
partial_failure_warning=partial_failure_warning,
total=total_runtime_envs,
num_after_truncation=num_after_truncation,
num_filtered=num_filtered,
)

async def summarize_tasks(self, option: SummaryApiOptions) -> SummaryApiResponse:
# For summary, try getting as many entries as possible to minimze data loss.
result = await self.list_tasks(
option=ListApiOptions(timeout=option.timeout, limit=MAX_LIMIT, filters=[])
option=ListApiOptions(
timeout=option.timeout, limit=RAY_MAX_LIMIT_FROM_API_SERVER, filters=[]
)
)
summary = StateSummary(
node_id_to_summary={
Expand All @@ -573,12 +605,15 @@ async def summarize_tasks(self, option: SummaryApiOptions) -> SummaryApiResponse
result=summary,
partial_failure_warning=result.partial_failure_warning,
warnings=result.warnings,
num_after_truncation=result.num_after_truncation,
)

async def summarize_actors(self, option: SummaryApiOptions) -> SummaryApiResponse:
# For summary, try getting as many entries as possible to minimze data loss.
result = await self.list_actors(
option=ListApiOptions(timeout=option.timeout, limit=MAX_LIMIT, filters=[])
option=ListApiOptions(
timeout=option.timeout, limit=RAY_MAX_LIMIT_FROM_API_SERVER, filters=[]
)
)
summary = StateSummary(
node_id_to_summary={
Expand All @@ -590,12 +625,15 @@ async def summarize_actors(self, option: SummaryApiOptions) -> SummaryApiRespons
result=summary,
partial_failure_warning=result.partial_failure_warning,
warnings=result.warnings,
num_after_truncation=result.num_after_truncation,
)

async def summarize_objects(self, option: SummaryApiOptions) -> SummaryApiResponse:
# For summary, try getting as many entries as possible to minimze data loss.
# For summary, try getting as many entries as possible to minimize data loss.
result = await self.list_objects(
option=ListApiOptions(timeout=option.timeout, limit=MAX_LIMIT, filters=[])
option=ListApiOptions(
timeout=option.timeout, limit=RAY_MAX_LIMIT_FROM_API_SERVER, filters=[]
)
)
summary = StateSummary(
node_id_to_summary={
Expand All @@ -607,6 +645,7 @@ async def summarize_objects(self, option: SummaryApiOptions) -> SummaryApiRespon
result=summary,
partial_failure_warning=result.partial_failure_warning,
warnings=result.warnings,
num_after_truncation=result.num_after_truncation,
)

def _message_to_dict(
Expand Down
38 changes: 31 additions & 7 deletions python/ray/experimental/state/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,12 @@ def get(
def _print_api_warning(self, resource: StateResource, api_response: dict):
"""Print the API warnings.
We print warnings for users:
1. when some data sources are not available
2. when results were truncated at the data source
3. when results were limited
4. when callsites not enabled for listing objects
Args:
resource: Resource names, i.e. 'jobs', 'actors', 'nodes',
see `StateResource` for details.
Expand All @@ -324,16 +330,34 @@ def _print_api_warning(self, resource: StateResource, api_response: dict):
if warning_msgs:
warnings.warn(warning_msgs)

# Print warnings if data is truncated.
data = api_response["result"]
# Print warnings if data is truncated at the data source.
num_after_truncation = api_response["num_after_truncation"]
total = api_response["total"]
if total > len(data):
if total > num_after_truncation:
# NOTE(rickyyx): For now, there's not much users could do (neither can we),
# with hard truncation. Unless we allow users to set a higher
# `RAY_MAX_LIMIT_FROM_DATA_SOURCE`, the data will always be truncated at the
# data source.
warnings.warn(
(
f"{num_after_truncation} ({total} total) {resource.value} "
"are retrieved from the data source. "
f"{total - num_after_truncation} entries have been truncated. "
f"Max of {num_after_truncation} entries are retrieved from data "
"source to prevent over-sized payloads."
),
)

# Print warnings if return data is limited at the API server due to
# limit enforced at the server side
num_filtered = api_response["num_filtered"]
data = api_response["result"]
if num_filtered > len(data):
warnings.warn(
(
f"{len(data)} ({total} total) {resource.value} "
f"are returned. {total - len(data)} entries have been truncated. "
"Use `--filter` to reduce the amount of data to return "
"or increase the limit by specifying`--limit`."
f"{len(data)}/{num_filtered} {resource.value} returned. "
"Use `--filter` to reduce the amount of data to return or "
"setting a higher limit with `--limit` to see all data. "
),
)

Expand Down
Loading

0 comments on commit 259473c

Please sign in to comment.