diff --git a/dashboard/client/src/pages/node/NodeRow.tsx b/dashboard/client/src/pages/node/NodeRow.tsx index 5456afef45df9..1f9239925e925 100644 --- a/dashboard/client/src/pages/node/NodeRow.tsx +++ b/dashboard/client/src/pages/node/NodeRow.tsx @@ -1,6 +1,8 @@ import { Box, + createStyles, IconButton, + makeStyles, TableCell, TableRow, Tooltip, @@ -10,6 +12,7 @@ import React, { useState } from "react"; import { RiArrowDownSLine, RiArrowRightSLine } from "react-icons/ri"; import { Link } from "react-router-dom"; import useSWR from "swr"; +import { CodeDialogButtonWithPreview } from "../../common/CodeDialogButton"; import { API_REFRESH_INTERVAL_MS } from "../../common/constants"; import { NodeLink } from "../../common/links"; import rowStyles from "../../common/RowStyles"; @@ -35,6 +38,39 @@ type NodeRowProps = Pick & { onExpandButtonClick: () => void; }; +const useStyles = makeStyles((theme) => + createStyles({ + tableContainer: { + overflowX: "scroll", + }, + expandCollapseIcon: { + color: theme.palette.text.secondary, + fontSize: "1.5em", + verticalAlign: "middle", + }, + idCol: { + display: "block", + width: "50px", + overflow: "hidden", + textOverflow: "ellipsis", + whiteSpace: "nowrap", + }, + OverflowCol: { + display: "block", + width: "100px", + overflow: "hidden", + textOverflow: "ellipsis", + whiteSpace: "nowrap", + }, + helpInfo: { + marginLeft: theme.spacing(1), + }, + logicalResources: { + maxWidth: 200, + }, + }), +); + /** * A single row that represents the node information only. * Does not show any data about the node's workers. @@ -53,9 +89,10 @@ export const NodeRow = ({ networkSpeed = [0, 0], raylet, logUrl, + logicalResources, } = node; - const classes = rowStyles(); + const classes = useStyles(); const objectStoreTotalMemory = raylet.objectStoreAvailableMemory + raylet.objectStoreUsedMemory; @@ -149,6 +186,17 @@ export const NodeRow = ({ {memoryConverter(networkSpeed[0])}/s {memoryConverter(networkSpeed[1])}/s + + {logicalResources ? 
( + + ) : ( + "-" + )} + ); }; @@ -249,6 +297,7 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => { N/A N/A N/A + N/A ); }; diff --git a/dashboard/client/src/pages/node/hook/useNodeList.ts b/dashboard/client/src/pages/node/hook/useNodeList.ts index ce9c764cf1a53..26811c1325cb9 100644 --- a/dashboard/client/src/pages/node/hook/useNodeList.ts +++ b/dashboard/client/src/pages/node/hook/useNodeList.ts @@ -36,21 +36,23 @@ export const useNodeList = () => { } else { setMsg(""); } - return rspData.summary; + return rspData; }, { refreshInterval: isRefreshing ? API_REFRESH_INTERVAL_MS : 0 }, ); - const nodeList = data ?? []; + const nodeList = data?.summary ?? []; + const nodeLogicalResources = data?.nodeLogicalResources ?? {}; - const nodeListWithState = nodeList + const nodeListWithAdditionalInfo = nodeList .map((e) => ({ ...e, state: e.raylet.state, + logicalResources: nodeLogicalResources[e.raylet.nodeId], })) .sort(sorterFunc); - const sortedList = _.sortBy(nodeListWithState, [ + const sortedList = _.sortBy(nodeListWithAdditionalInfo, [ (obj) => !obj.raylet.isHeadNode, // sort by alive first, then alphabetically for other states (obj) => (obj.raylet.state === "ALIVE" ? "0" : obj.raylet.state), diff --git a/dashboard/client/src/pages/node/index.tsx b/dashboard/client/src/pages/node/index.tsx index 1bd8e3e4bb88b..e45f153feda34 100644 --- a/dashboard/client/src/pages/node/index.tsx +++ b/dashboard/client/src/pages/node/index.tsx @@ -41,6 +41,9 @@ const useStyles = makeStyles((theme) => ({ }, })); +const codeTextStyle = { + fontFamily: "Roboto Mono, monospace", +}; const columns = [ { label: "" }, // Expand button { label: "Host / Worker Process name" }, @@ -101,6 +104,19 @@ const columns = [ }, { label: "Sent" }, { label: "Received" }, + { + label: "Logical Resources", + helpInfo: ( + + + Logical resources usage + {" "} + (e.g., CPU, memory) for a node. Alternatively, you can run the CLI + command

ray status -v

+ to obtain a similar result. +
+    ),
+  },
 ];
 
 export const brpcLinkChanger = (href: string) => {
diff --git a/dashboard/client/src/type/node.d.ts b/dashboard/client/src/type/node.d.ts
index 5d239b86476d1..13bdfc79dfe06 100644
--- a/dashboard/client/src/type/node.d.ts
+++ b/dashboard/client/src/type/node.d.ts
@@ -36,11 +36,19 @@ export type NodeDetail = {
   cmdline: string[];
   state: string;
   logUrl: string;
+  logicalResources?: string;
+};
+
+// Example:
+// "27fcdbcd36f9227b88bf07d48769efb4471cb204adbfb4b077cd2bc7": "0.0/8.0 CPU\n 0B/25.75GiB memory\n 0B/12.88GiB object_store_memory"
+type NodeLogicalResourcesMap = {
+  [nodeId: string]: string;
+};
 
 export type NodeListRsp = {
   data: {
     summary: NodeDetail[];
+    nodeLogicalResources: NodeLogicalResourcesMap;
   };
   result: boolean;
   msg: string;
diff --git a/dashboard/modules/node/node_head.py b/dashboard/modules/node/node_head.py
index 61c85900f47c3..9542ab2ba4a89 100644
--- a/dashboard/modules/node/node_head.py
+++ b/dashboard/modules/node/node_head.py
@@ -7,6 +7,12 @@
 import aiohttp.web
 
 import ray._private.utils
+from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS
+
+from ray.autoscaler._private.util import (
+    LoadMetricsSummary,
+    get_per_node_breakdown_as_dict,
+)
 import ray.dashboard.consts as dashboard_consts
 import ray.dashboard.optional_utils as dashboard_optional_utils
 import ray.dashboard.utils as dashboard_utils
@@ -23,6 +29,10 @@
     FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS,
     FREQUENT_UPDATE_TIMEOUT_SECONDS,
 )
+from ray._private.ray_constants import (
+    DEBUG_AUTOSCALING_ERROR,
+    DEBUG_AUTOSCALING_STATUS,
+)
 from ray.dashboard.utils import async_loop_forever
 
 logger = logging.getLogger(__name__)
@@ -235,14 +245,45 @@ async def get_node_module_internal_state(self, req) -> aiohttp.web.Response:
             **self.get_internal_states(),
         )
 
+    async def get_nodes_logical_resources(self) -> dict:
+        (status_string, error) = await asyncio.gather(
+            *[
+                self._gcs_aio_client.internal_kv_get(
+                    key.encode(), namespace=None, timeout=GCS_RPC_TIMEOUT_SECONDS
+                )
+                for key in [
+                    DEBUG_AUTOSCALING_STATUS,
+                    DEBUG_AUTOSCALING_ERROR,
+                ]
+            ]
+        )
+
+        status_dict = json.loads(status_string) if status_string else {}
+
+        lm_summary_dict = status_dict.get("load_metrics_report")
+        if not lm_summary_dict or error is not None:
+            return {}
+        lm_summary = LoadMetricsSummary(**lm_summary_dict)
+
+        return get_per_node_breakdown_as_dict(lm_summary)
+
     @routes.get("/nodes")
     @dashboard_optional_utils.aiohttp_cache
     async def get_all_nodes(self, req) -> aiohttp.web.Response:
         view = req.query.get("view")
         if view == "summary":
-            all_node_summary = await DataOrganizer.get_all_node_summary()
+            all_node_summary_task = DataOrganizer.get_all_node_summary()
+            nodes_logical_resource_task = self.get_nodes_logical_resources()
+
+            all_node_summary, nodes_logical_resources = await asyncio.gather(
+                all_node_summary_task, nodes_logical_resource_task
+            )
+
             return dashboard_optional_utils.rest_response(
-                success=True, message="Node summary fetched.", summary=all_node_summary
+                success=True,
+                message="Node summary fetched.",
+                summary=all_node_summary,
+                node_logical_resources=nodes_logical_resources,
             )
         elif view is not None and view.lower() == "hostNameList".lower():
             alive_hostnames = set()
diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py
index 7d1d70b079460..1ef7c0ed9bdf0 100644
--- a/python/ray/autoscaler/_private/util.py
+++ b/python/ray/autoscaler/_private/util.py
@@ -725,6 +725,20 @@ def get_demand_report(lm_summary: LoadMetricsSummary):
     return demand_report
 
 
+def get_per_node_breakdown_as_dict(
+    lm_summary: LoadMetricsSummary,
+) -> dict:
+    per_node_breakdown = {}
+
+    for node_id, usage in lm_summary.usage_by_node.items():
+        usage_string = ""
+        for line in parse_usage(usage, verbose=True):
+            usage_string += f"{line}\n"
+        per_node_breakdown[node_id] = usage_string.strip()
+
+    return per_node_breakdown
+
+
 def get_per_node_breakdown(
     lm_summary: LoadMetricsSummary,
     node_type_mapping: Optional[Dict[str, float]],
diff --git a/python/ray/tests/autoscaler/util.py b/python/ray/tests/autoscaler/util.py new file mode 100644 index 0000000000000..426ff3662402b --- /dev/null +++ b/python/ray/tests/autoscaler/util.py @@ -0,0 +1,67 @@ +import unittest +from unittest.mock import Mock +from ray.autoscaler._private.util import get_per_node_breakdown_as_dict + + +class TestGetPerNodeBreakdown(unittest.TestCase): + def setUp(self): + # Create a mock LoadMetricsSummary object with the required attributes + lm_summary_mock_data = { + "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": { + "node:172.31.6.57": [0.0, 1.0], + "object_store_memory": [0.0, 13984228147.0], + "memory": [0.0, 27968456295.0], + "node:__internal_head__": [0.0, 1.0], + "CPU": [0.0, 8.0], + } + } + self.lm_summary_mock = Mock() + self.lm_summary_mock.usage_by_node = lm_summary_mock_data + + def test_get_per_node_breakdown_as_dict(self): + result = get_per_node_breakdown_as_dict(self.lm_summary_mock) + + expected_output = { + "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": ( + "0.0/8.0 CPU\n0B/26.05GiB memory\n0B/13.02GiB object_store_memory" + ) + } + + self.assertEqual(result, expected_output) + + def test_get_per_node_breakdown_as_dict_empty_summary(self): + # Test with an empty lm_summary + lm_summary_mock_data = {} + self.lm_summary_mock.usage_by_node = lm_summary_mock_data + + result = get_per_node_breakdown_as_dict(self.lm_summary_mock) + + expected_output = {} + + self.assertEqual(result, expected_output) + + def test_get_per_node_breakdown_as_dict_missing_usage(self): + # Test with missing usage data for a node + lm_summary_mock_data = { + "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": { + "node:172.31.6.57": [0.0, 1.0], + "object_store_memory": [0.0, 13984228147.0], + # 'memory': [0.0, 27968456295.0], # Missing memory data + "node:__internal_head__": [0.0, 1.0], + "CPU": [0.0, 8.0], + } + } + self.lm_summary_mock.usage_by_node = lm_summary_mock_data + + result = 
get_per_node_breakdown_as_dict(self.lm_summary_mock) + + expected_output = { + "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": "0.0/8.0 CPU\n" + "0B/13.02GiB object_store_memory" + } + + self.assertEqual(result, expected_output) + + +if __name__ == "__main__": + unittest.main()