[Cluster] Add custom resources in node list (ray-project#37118)

Frontend: Add a column Logical Resources in Cluster page, to display logical usage(CPU, Memory and Object Store) in order to help users to have a better understanding of the task scheduling info Backend: Add a field node_logical_resources in the existing api/cluster_status to get logical resources info for all nodes Signed-off-by: Victor <[email protected]>
vymao · Oct 11, 2023 · b077f0a · b077f0a
1 parent 8245c1c
commit b077f0a
Show file tree

Hide file tree

Showing 7 changed files with 204 additions and 7 deletions.
diff --git a/dashboard/client/src/pages/node/NodeRow.tsx b/dashboard/client/src/pages/node/NodeRow.tsx
@@ -1,6 +1,8 @@
 import {
  Box,
+ createStyles,
  IconButton,
+ makeStyles,
  TableCell,
  TableRow,
  Tooltip,
@@ -10,6 +12,7 @@ import React, { useState } from "react";
 import { RiArrowDownSLine, RiArrowRightSLine } from "react-icons/ri";
 import { Link } from "react-router-dom";
 import useSWR from "swr";
+import { CodeDialogButtonWithPreview } from "../../common/CodeDialogButton";
 import { API_REFRESH_INTERVAL_MS } from "../../common/constants";
 import { NodeLink } from "../../common/links";
 import rowStyles from "../../common/RowStyles";
@@ -35,6 +38,39 @@ type NodeRowProps = Pick<NodeRowsProps, "node"> & {
  onExpandButtonClick: () => void;
 };
 
+const useStyles = makeStyles((theme) =>
+ createStyles({
+ tableContainer: {
+ overflowX: "scroll",
+ },
+ expandCollapseIcon: {
+ color: theme.palette.text.secondary,
+ fontSize: "1.5em",
+ verticalAlign: "middle",
+ },
+ idCol: {
+ display: "block",
+ width: "50px",
+ overflow: "hidden",
+ textOverflow: "ellipsis",
+ whiteSpace: "nowrap",
+ },
+ OverflowCol: {
+ display: "block",
+ width: "100px",
+ overflow: "hidden",
+ textOverflow: "ellipsis",
+ whiteSpace: "nowrap",
+ },
+ helpInfo: {
+ marginLeft: theme.spacing(1),
+ },
+ logicalResources: {
+ maxWidth: 200,
+ },
+ }),
+);
+
 /**
  * A single row that represents the node information only.
  * Does not show any data about the node's workers.
@@ -53,9 +89,10 @@ export const NodeRow = ({
  networkSpeed = [0, 0],
  raylet,
  logUrl,
+ logicalResources,
  } = node;
 
- const classes = rowStyles();
+ const classes = useStyles();
 
  const objectStoreTotalMemory =
  raylet.objectStoreAvailableMemory + raylet.objectStoreUsedMemory;
@@ -149,6 +186,17 @@ export const NodeRow = ({
  </TableCell>
  <TableCell align="center">{memoryConverter(networkSpeed[0])}/s</TableCell>
  <TableCell align="center">{memoryConverter(networkSpeed[1])}/s</TableCell>
+ <TableCell align="center">
+ {logicalResources ? (
+ <CodeDialogButtonWithPreview
+ className={classes.logicalResources}
+ title="Logical Resources"
+ code={logicalResources}
+ />
+ ) : (
+ "-"
+ )}
+ </TableCell>
  </TableRow>
  );
 };
@@ -249,6 +297,7 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => {
  <TableCell>N/A</TableCell>
  <TableCell align="center">N/A</TableCell>
  <TableCell align="center">N/A</TableCell>
+ <TableCell align="center">N/A</TableCell>
  </TableRow>
  );
 };

diff --git a/dashboard/client/src/pages/node/hook/useNodeList.ts b/dashboard/client/src/pages/node/hook/useNodeList.ts
@@ -36,21 +36,23 @@ export const useNodeList = () => {
  } else {
  setMsg("");
  }
- return rspData.summary;
+ return rspData;
  },
  { refreshInterval: isRefreshing ? API_REFRESH_INTERVAL_MS : 0 },
  );
 
- const nodeList = data ?? [];
+ const nodeList = data?.summary ?? [];
+ const nodeLogicalResources = data?.nodeLogicalResources ?? {};
 
- const nodeListWithState = nodeList
+ const nodeListWithAdditionalInfo = nodeList
  .map((e) => ({
  ...e,
  state: e.raylet.state,
+ logicalResources: nodeLogicalResources[e.raylet.nodeId],
  }))
  .sort(sorterFunc);
 
- const sortedList = _.sortBy(nodeListWithState, [
+ const sortedList = _.sortBy(nodeListWithAdditionalInfo, [
  (obj) => !obj.raylet.isHeadNode,
  // sort by alive first, then alphabetically for other states
  (obj) => (obj.raylet.state === "ALIVE" ? "0" : obj.raylet.state),

diff --git a/dashboard/client/src/pages/node/index.tsx b/dashboard/client/src/pages/node/index.tsx
@@ -41,6 +41,9 @@ const useStyles = makeStyles((theme) => ({
  },
 }));
 
+const codeTextStyle = {
+ fontFamily: "Roboto Mono, monospace",
+};
 const columns = [
  { label: "" }, // Expand button
  { label: "Host / Worker Process name" },
@@ -101,6 +104,19 @@ const columns = [
  },
  { label: "Sent" },
  { label: "Received" },
+ {
+ label: "Logical Resources",
+ helpInfo: (
+ <Typography>
+ <a href="https://docs.ray.io/en/latest/ray-core/scheduling/resources.html#physical-resources-and-logical-resources">
+ Logical resources usage
+ </a>{" "}
+ (e.g., CPU, memory) for a node. Alternatively, you can run the CLI
+ command <p style={codeTextStyle}>ray status -v </p>
+ to obtain a similar result.
+ </Typography>
+ ),
+ },
 ];
 
 export const brpcLinkChanger = (href: string) => {

diff --git a/dashboard/client/src/type/node.d.ts b/dashboard/client/src/type/node.d.ts
@@ -36,11 +36,19 @@ export type NodeDetail = {
  cmdline: string[];
  state: string;
  logUrl: string;
+ logicalResources?: str;
+};
+
+// Example:
+// "27fcdbcd36f9227b88bf07d48769efb4471cb204adbfb4b077cd2bc7": "0.0/8.0 CPU\n 0B/25.75GiB memory\n 0B/12.88GiB object_store_memory"
+type NodeLogicalResourcesMap = {
+ [nodeId: string]: str;
 };
 
 export type NodeListRsp = {
  data: {
  summary: NodeDetail[];
+ nodeLogicalResources: NodeLogicalResourcesMap;
  };
  result: boolean;
  msg: string;

diff --git a/dashboard/modules/node/node_head.py b/dashboard/modules/node/node_head.py
@@ -7,6 +7,12 @@
 import aiohttp.web
 
 import ray._private.utils
+from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS
+
+from ray.autoscaler._private.util import (
+ LoadMetricsSummary,
+ get_per_node_breakdown_as_dict,
+)
 import ray.dashboard.consts as dashboard_consts
 import ray.dashboard.optional_utils as dashboard_optional_utils
 import ray.dashboard.utils as dashboard_utils
@@ -23,6 +29,10 @@
  FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS,
  FREQUENT_UPDATE_TIMEOUT_SECONDS,
 )
+from ray._private.ray_constants import (
+ DEBUG_AUTOSCALING_ERROR,
+ DEBUG_AUTOSCALING_STATUS,
+)
 from ray.dashboard.utils import async_loop_forever
 
 logger = logging.getLogger(__name__)
@@ -235,14 +245,45 @@ async def get_node_module_internal_state(self, req) -> aiohttp.web.Response:
  **self.get_internal_states(),
  )
 
+ async def get_nodes_logical_resources(self) -> dict:
+ (status_string, error) = await asyncio.gather(
+ *[
+ self._gcs_aio_client.internal_kv_get(
+ key.encode(), namespace=None, timeout=GCS_RPC_TIMEOUT_SECONDS
+ )
+ for key in [
+ DEBUG_AUTOSCALING_STATUS,
+ DEBUG_AUTOSCALING_ERROR,
+ ]
+ ]
+ )
+
+ status_dict = json.loads(status_string)
+
+ lm_summary_dict = status_dict.get("load_metrics_report")
+ if lm_summary_dict:
+ lm_summary = LoadMetricsSummary(**lm_summary_dict)
+
+ node_logical_resources = get_per_node_breakdown_as_dict(lm_summary)
+ return node_logical_resources if error is None else {}
+
  @routes.get("/nodes")
  @dashboard_optional_utils.aiohttp_cache
  async def get_all_nodes(self, req) -> aiohttp.web.Response:
  view = req.query.get("view")
  if view == "summary":
- all_node_summary = await DataOrganizer.get_all_node_summary()
+ all_node_summary_task = DataOrganizer.get_all_node_summary()
+ nodes_logical_resource_task = self.get_nodes_logical_resources()
+
+ all_node_summary, nodes_logical_resources = await asyncio.gather(
+ all_node_summary_task, nodes_logical_resource_task
+ )
+
  return dashboard_optional_utils.rest_response(
- success=True, message="Node summary fetched.", summary=all_node_summary
+ success=True,
+ message="Node summary fetched.",
+ summary=all_node_summary,
+ node_logical_resources=nodes_logical_resources,
  )
  elif view is not None and view.lower() == "hostNameList".lower():
  alive_hostnames = set()

diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py
@@ -725,6 +725,20 @@ def get_demand_report(lm_summary: LoadMetricsSummary):
  return demand_report
 
 
+def get_per_node_breakdown_as_dict(
+ lm_summary: LoadMetricsSummary,
+) -> dict:
+ per_node_breakdown = {}
+
+ for node_id, usage in lm_summary.usage_by_node.items():
+ usage_string = ""
+ for line in parse_usage(usage, verbose=True):
+ usage_string += f"{line}\n"
+ per_node_breakdown[node_id] = usage_string.strip()
+
+ return per_node_breakdown
+
+
 def get_per_node_breakdown(
  lm_summary: LoadMetricsSummary,
  node_type_mapping: Optional[Dict[str, float]],

diff --git a/python/ray/tests/autoscaler/util.py b/python/ray/tests/autoscaler/util.py
@@ -0,0 +1,67 @@
+import unittest
+from unittest.mock import Mock
+from ray.autoscaler._private.util import get_per_node_breakdown_as_dict
+
+
+class TestGetPerNodeBreakdown(unittest.TestCase):
+ def setUp(self):
+ # Create a mock LoadMetricsSummary object with the required attributes
+ lm_summary_mock_data = {
+ "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": {
+ "node:172.31.6.57": [0.0, 1.0],
+ "object_store_memory": [0.0, 13984228147.0],
+ "memory": [0.0, 27968456295.0],
+ "node:__internal_head__": [0.0, 1.0],
+ "CPU": [0.0, 8.0],
+ }
+ }
+ self.lm_summary_mock = Mock()
+ self.lm_summary_mock.usage_by_node = lm_summary_mock_data
+
+ def test_get_per_node_breakdown_as_dict(self):
+ result = get_per_node_breakdown_as_dict(self.lm_summary_mock)
+
+ expected_output = {
+ "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": (
+ "0.0/8.0 CPU\n0B/26.05GiB memory\n0B/13.02GiB object_store_memory"
+ )
+ }
+
+ self.assertEqual(result, expected_output)
+
+ def test_get_per_node_breakdown_as_dict_empty_summary(self):
+ # Test with an empty lm_summary
+ lm_summary_mock_data = {}
+ self.lm_summary_mock.usage_by_node = lm_summary_mock_data
+
+ result = get_per_node_breakdown_as_dict(self.lm_summary_mock)
+
+ expected_output = {}
+
+ self.assertEqual(result, expected_output)
+
+ def test_get_per_node_breakdown_as_dict_missing_usage(self):
+ # Test with missing usage data for a node
+ lm_summary_mock_data = {
+ "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": {
+ "node:172.31.6.57": [0.0, 1.0],
+ "object_store_memory": [0.0, 13984228147.0],
+ # 'memory': [0.0, 27968456295.0], # Missing memory data
+ "node:__internal_head__": [0.0, 1.0],
+ "CPU": [0.0, 8.0],
+ }
+ }
+ self.lm_summary_mock.usage_by_node = lm_summary_mock_data
+
+ result = get_per_node_breakdown_as_dict(self.lm_summary_mock)
+
+ expected_output = {
+ "e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": "0.0/8.0 CPU\n"
+ "0B/13.02GiB object_store_memory"
+ }
+
+ self.assertEqual(result, expected_output)
+
+
+if __name__ == "__main__":
+ unittest.main()