Skip to content

Commit

Permalink
[Cluster] Add custom resources in node list (ray-project#37118)
Browse files Browse the repository at this point in the history
Frontend: Add a column Logical Resources in Cluster page, to display logical usage(CPU, Memory and Object Store) in order to help users to have a better understanding of the task scheduling info
Backend: Add a field node_logical_resources in the existing api/cluster_status to get logical resources info for all nodes
Signed-off-by: Victor <[email protected]>
  • Loading branch information
chaowanggg authored and Victor committed Oct 11, 2023
1 parent 8245c1c commit b077f0a
Show file tree
Hide file tree
Showing 7 changed files with 204 additions and 7 deletions.
51 changes: 50 additions & 1 deletion dashboard/client/src/pages/node/NodeRow.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import {
Box,
createStyles,
IconButton,
makeStyles,
TableCell,
TableRow,
Tooltip,
Expand All @@ -10,6 +12,7 @@ import React, { useState } from "react";
import { RiArrowDownSLine, RiArrowRightSLine } from "react-icons/ri";
import { Link } from "react-router-dom";
import useSWR from "swr";
import { CodeDialogButtonWithPreview } from "../../common/CodeDialogButton";
import { API_REFRESH_INTERVAL_MS } from "../../common/constants";
import { NodeLink } from "../../common/links";
import rowStyles from "../../common/RowStyles";
Expand All @@ -35,6 +38,39 @@ type NodeRowProps = Pick<NodeRowsProps, "node"> & {
onExpandButtonClick: () => void;
};

const useStyles = makeStyles((theme) =>
createStyles({
tableContainer: {
overflowX: "scroll",
},
expandCollapseIcon: {
color: theme.palette.text.secondary,
fontSize: "1.5em",
verticalAlign: "middle",
},
idCol: {
display: "block",
width: "50px",
overflow: "hidden",
textOverflow: "ellipsis",
whiteSpace: "nowrap",
},
OverflowCol: {
display: "block",
width: "100px",
overflow: "hidden",
textOverflow: "ellipsis",
whiteSpace: "nowrap",
},
helpInfo: {
marginLeft: theme.spacing(1),
},
logicalResources: {
maxWidth: 200,
},
}),
);

/**
* A single row that represents the node information only.
* Does not show any data about the node's workers.
Expand All @@ -53,9 +89,10 @@ export const NodeRow = ({
networkSpeed = [0, 0],
raylet,
logUrl,
logicalResources,
} = node;

const classes = rowStyles();
const classes = useStyles();

const objectStoreTotalMemory =
raylet.objectStoreAvailableMemory + raylet.objectStoreUsedMemory;
Expand Down Expand Up @@ -149,6 +186,17 @@ export const NodeRow = ({
</TableCell>
<TableCell align="center">{memoryConverter(networkSpeed[0])}/s</TableCell>
<TableCell align="center">{memoryConverter(networkSpeed[1])}/s</TableCell>
<TableCell align="center">
{logicalResources ? (
<CodeDialogButtonWithPreview
className={classes.logicalResources}
title="Logical Resources"
code={logicalResources}
/>
) : (
"-"
)}
</TableCell>
</TableRow>
);
};
Expand Down Expand Up @@ -249,6 +297,7 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => {
<TableCell>N/A</TableCell>
<TableCell align="center">N/A</TableCell>
<TableCell align="center">N/A</TableCell>
<TableCell align="center">N/A</TableCell>
</TableRow>
);
};
Expand Down
10 changes: 6 additions & 4 deletions dashboard/client/src/pages/node/hook/useNodeList.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,23 @@ export const useNodeList = () => {
} else {
setMsg("");
}
return rspData.summary;
return rspData;
},
{ refreshInterval: isRefreshing ? API_REFRESH_INTERVAL_MS : 0 },
);

const nodeList = data ?? [];
const nodeList = data?.summary ?? [];
const nodeLogicalResources = data?.nodeLogicalResources ?? {};

const nodeListWithState = nodeList
const nodeListWithAdditionalInfo = nodeList
.map((e) => ({
...e,
state: e.raylet.state,
logicalResources: nodeLogicalResources[e.raylet.nodeId],
}))
.sort(sorterFunc);

const sortedList = _.sortBy(nodeListWithState, [
const sortedList = _.sortBy(nodeListWithAdditionalInfo, [
(obj) => !obj.raylet.isHeadNode,
// sort by alive first, then alphabetically for other states
(obj) => (obj.raylet.state === "ALIVE" ? "0" : obj.raylet.state),
Expand Down
16 changes: 16 additions & 0 deletions dashboard/client/src/pages/node/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ const useStyles = makeStyles((theme) => ({
},
}));

const codeTextStyle = {
fontFamily: "Roboto Mono, monospace",
};
const columns = [
{ label: "" }, // Expand button
{ label: "Host / Worker Process name" },
Expand Down Expand Up @@ -101,6 +104,19 @@ const columns = [
},
{ label: "Sent" },
{ label: "Received" },
{
label: "Logical Resources",
helpInfo: (
<Typography>
<a href="https://docs.ray.io/en/latest/ray-core/scheduling/resources.html#physical-resources-and-logical-resources">
Logical resources usage
</a>{" "}
(e.g., CPU, memory) for a node. Alternatively, you can run the CLI
command <p style={codeTextStyle}>ray status -v </p>
to obtain a similar result.
</Typography>
),
},
];

export const brpcLinkChanger = (href: string) => {
Expand Down
8 changes: 8 additions & 0 deletions dashboard/client/src/type/node.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,19 @@ export type NodeDetail = {
cmdline: string[];
state: string;
logUrl: string;
logicalResources?: str;
};

// Example:
// "27fcdbcd36f9227b88bf07d48769efb4471cb204adbfb4b077cd2bc7": "0.0/8.0 CPU\n 0B/25.75GiB memory\n 0B/12.88GiB object_store_memory"
type NodeLogicalResourcesMap = {
[nodeId: string]: str;
};

export type NodeListRsp = {
data: {
summary: NodeDetail[];
nodeLogicalResources: NodeLogicalResourcesMap;
};
result: boolean;
msg: string;
Expand Down
45 changes: 43 additions & 2 deletions dashboard/modules/node/node_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
import aiohttp.web

import ray._private.utils
from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS

from ray.autoscaler._private.util import (
LoadMetricsSummary,
get_per_node_breakdown_as_dict,
)
import ray.dashboard.consts as dashboard_consts
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.dashboard.utils as dashboard_utils
Expand All @@ -23,6 +29,10 @@
FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS,
FREQUENT_UPDATE_TIMEOUT_SECONDS,
)
from ray._private.ray_constants import (
DEBUG_AUTOSCALING_ERROR,
DEBUG_AUTOSCALING_STATUS,
)
from ray.dashboard.utils import async_loop_forever

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -235,14 +245,45 @@ async def get_node_module_internal_state(self, req) -> aiohttp.web.Response:
**self.get_internal_states(),
)

async def get_nodes_logical_resources(self) -> dict:
(status_string, error) = await asyncio.gather(
*[
self._gcs_aio_client.internal_kv_get(
key.encode(), namespace=None, timeout=GCS_RPC_TIMEOUT_SECONDS
)
for key in [
DEBUG_AUTOSCALING_STATUS,
DEBUG_AUTOSCALING_ERROR,
]
]
)

status_dict = json.loads(status_string)

lm_summary_dict = status_dict.get("load_metrics_report")
if lm_summary_dict:
lm_summary = LoadMetricsSummary(**lm_summary_dict)

node_logical_resources = get_per_node_breakdown_as_dict(lm_summary)
return node_logical_resources if error is None else {}

@routes.get("/nodes")
@dashboard_optional_utils.aiohttp_cache
async def get_all_nodes(self, req) -> aiohttp.web.Response:
view = req.query.get("view")
if view == "summary":
all_node_summary = await DataOrganizer.get_all_node_summary()
all_node_summary_task = DataOrganizer.get_all_node_summary()
nodes_logical_resource_task = self.get_nodes_logical_resources()

all_node_summary, nodes_logical_resources = await asyncio.gather(
all_node_summary_task, nodes_logical_resource_task
)

return dashboard_optional_utils.rest_response(
success=True, message="Node summary fetched.", summary=all_node_summary
success=True,
message="Node summary fetched.",
summary=all_node_summary,
node_logical_resources=nodes_logical_resources,
)
elif view is not None and view.lower() == "hostNameList".lower():
alive_hostnames = set()
Expand Down
14 changes: 14 additions & 0 deletions python/ray/autoscaler/_private/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,20 @@ def get_demand_report(lm_summary: LoadMetricsSummary):
return demand_report


def get_per_node_breakdown_as_dict(
lm_summary: LoadMetricsSummary,
) -> dict:
per_node_breakdown = {}

for node_id, usage in lm_summary.usage_by_node.items():
usage_string = ""
for line in parse_usage(usage, verbose=True):
usage_string += f"{line}\n"
per_node_breakdown[node_id] = usage_string.strip()

return per_node_breakdown


def get_per_node_breakdown(
lm_summary: LoadMetricsSummary,
node_type_mapping: Optional[Dict[str, float]],
Expand Down
67 changes: 67 additions & 0 deletions python/ray/tests/autoscaler/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import unittest
from unittest.mock import Mock
from ray.autoscaler._private.util import get_per_node_breakdown_as_dict


class TestGetPerNodeBreakdown(unittest.TestCase):
def setUp(self):
# Create a mock LoadMetricsSummary object with the required attributes
lm_summary_mock_data = {
"e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": {
"node:172.31.6.57": [0.0, 1.0],
"object_store_memory": [0.0, 13984228147.0],
"memory": [0.0, 27968456295.0],
"node:__internal_head__": [0.0, 1.0],
"CPU": [0.0, 8.0],
}
}
self.lm_summary_mock = Mock()
self.lm_summary_mock.usage_by_node = lm_summary_mock_data

def test_get_per_node_breakdown_as_dict(self):
result = get_per_node_breakdown_as_dict(self.lm_summary_mock)

expected_output = {
"e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": (
"0.0/8.0 CPU\n0B/26.05GiB memory\n0B/13.02GiB object_store_memory"
)
}

self.assertEqual(result, expected_output)

def test_get_per_node_breakdown_as_dict_empty_summary(self):
# Test with an empty lm_summary
lm_summary_mock_data = {}
self.lm_summary_mock.usage_by_node = lm_summary_mock_data

result = get_per_node_breakdown_as_dict(self.lm_summary_mock)

expected_output = {}

self.assertEqual(result, expected_output)

def test_get_per_node_breakdown_as_dict_missing_usage(self):
# Test with missing usage data for a node
lm_summary_mock_data = {
"e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": {
"node:172.31.6.57": [0.0, 1.0],
"object_store_memory": [0.0, 13984228147.0],
# 'memory': [0.0, 27968456295.0], # Missing memory data
"node:__internal_head__": [0.0, 1.0],
"CPU": [0.0, 8.0],
}
}
self.lm_summary_mock.usage_by_node = lm_summary_mock_data

result = get_per_node_breakdown_as_dict(self.lm_summary_mock)

expected_output = {
"e9919752e5e8d757765d97d8bec910a2e78e8826f20bce46fd58f92e": "0.0/8.0 CPU\n"
"0B/13.02GiB object_store_memory"
}

self.assertEqual(result, expected_output)


if __name__ == "__main__":
unittest.main()

0 comments on commit b077f0a

Please sign in to comment.