Skip to content

Commit

Permalink
[Serve] [Dashboard] Add serve controller metrics to serve system dash…
Browse files Browse the repository at this point in the history
…board page (#43797)

Adds the following Serve controller metrics to the Serve details page on the dashboard:

ray_serve_num_ongoing_http_requests
ray_serve_num_ongoing_grpc_requests
ray_serve_controller_num_starts
ray_serve_num_scheduling_tasks
ray_serve_num_scheduling_tasks_in_backoff
ray_serve_controller_num_control_loops

---------

Signed-off-by: Archit Kulkarni <[email protected]>
  • Loading branch information
architkulkarni committed Mar 11, 2024
1 parent a949436 commit 67cf06a
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 12 deletions.
10 changes: 8 additions & 2 deletions dashboard/client/src/pages/serve/ServeDeploymentsListPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ import { HelpInfo } from "../../components/Tooltip";
import { useServeDeployments } from "./hook/useServeApplications";
import { ServeApplicationRows } from "./ServeApplicationRow";
import { ServeEntityLogViewer } from "./ServeEntityLogViewer";
import { ServeMetricsSection } from "./ServeMetricsSection";
import {
APPS_METRICS_CONFIG,
ServeMetricsSection,
} from "./ServeMetricsSection";
import { ServeSystemPreview } from "./ServeSystemDetails";

const useStyles = makeStyles((theme) =>
Expand Down Expand Up @@ -172,7 +175,10 @@ export const ServeDeploymentsListPage = () => {
</CollapsibleSection>
</React.Fragment>
)}
<ServeMetricsSection className={classes.section} />
<ServeMetricsSection
className={classes.section}
metricsConfig={APPS_METRICS_CONFIG}
/>
</div>
);
};
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { render, screen, waitFor } from "@testing-library/react";
import React, { PropsWithChildren } from "react";
import { GlobalContext } from "../../App";
import { ServeMetricsSection } from "./ServeMetricsSection";
import {
APPS_METRICS_CONFIG,
SERVE_SYSTEM_METRICS_CONFIG,
ServeMetricsSection,
} from "./ServeMetricsSection";

const Wrapper = ({ children }: PropsWithChildren<{}>) => {
return (
Expand Down Expand Up @@ -54,10 +58,12 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => {
};

describe("ServeMetricsSection", () => {
it("renders", async () => {
it("renders app metrics", async () => {
expect.assertions(4);

render(<ServeMetricsSection />, { wrapper: Wrapper });
render(<ServeMetricsSection metricsConfig={APPS_METRICS_CONFIG} />, {
wrapper: Wrapper,
});
await screen.findByText(/View in Grafana/);
expect(screen.getByText(/5 minutes/)).toBeVisible();
expect(screen.getByTitle("QPS per application")).toBeInTheDocument();
Expand All @@ -67,10 +73,34 @@ describe("ServeMetricsSection", () => {
).toBeInTheDocument();
});

it("renders serve system metrics", async () => {
  // Titles of the panels expected for the Serve system metrics config, in
  // the order they are asserted.
  const expectedPanelTitles = [
    "Ongoing HTTP Requests",
    "Ongoing gRPC Requests",
    "Scheduling Tasks",
    "Scheduling Tasks in Backoff",
    "Controller Control Loop Duration",
    "Number of Control Loops",
  ];
  // One assertion per expected panel.
  expect.assertions(expectedPanelTitles.length);

  const systemMetricsSection = (
    <ServeMetricsSection metricsConfig={SERVE_SYSTEM_METRICS_CONFIG} />
  );
  render(systemMetricsSection, { wrapper: Wrapper });

  // Resolves once the "View in Grafana" text is present in the document.
  await screen.findByText(/View in Grafana/);

  for (const panelTitle of expectedPanelTitles) {
    expect(screen.getByTitle(panelTitle)).toBeInTheDocument();
  }
});

it("renders nothing when grafana is not available", async () => {
expect.assertions(5);

render(<ServeMetricsSection />, { wrapper: MetricsDisabledWrapper });
render(<ServeMetricsSection metricsConfig={APPS_METRICS_CONFIG} />, {
wrapper: MetricsDisabledWrapper,
});
// Wait .1 seconds for render to finish
await waitFor(() => new Promise((r) => setTimeout(r, 100)));

Expand Down
37 changes: 34 additions & 3 deletions dashboard/client/src/pages/serve/ServeMetricsSection.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ const useStyles = makeStyles((theme) =>
);

// NOTE: please keep the titles here in sync with dashboard/modules/metrics/dashboards/serve_dashboard_panels.py
const METRICS_CONFIG: MetricConfig[] = [
export const APPS_METRICS_CONFIG: MetricConfig[] = [
{
title: "QPS per application",
pathParams: "orgId=1&theme=light&panelId=7",
Expand All @@ -75,10 +75,41 @@ const METRICS_CONFIG: MetricConfig[] = [
},
];

type ServeMetricsSectionProps = ClassNameProps;
/**
 * Panels rendered when the metrics section is given the Serve system config.
 * Each entry pairs a panel title with the Grafana query string selecting the
 * panel by id.
 *
 * NOTE: please keep the titles here in sync with dashboard/modules/metrics/dashboards/serve_dashboard_panels.py
 */
export const SERVE_SYSTEM_METRICS_CONFIG: MetricConfig[] = (
  [
    // [panel title, Grafana panel id]
    ["Ongoing HTTP Requests", 20],
    ["Ongoing gRPC Requests", 21],
    ["Scheduling Tasks", 22],
    ["Scheduling Tasks in Backoff", 23],
    ["Controller Control Loop Duration", 24],
    ["Number of Control Loops", 25],
  ] as const
).map(([title, panelId]) => ({
  title,
  pathParams: `orgId=1&theme=light&panelId=${panelId}`,
}));

/**
 * Props accepted by ServeMetricsSection: everything in ClassNameProps
 * (presumably the optional `className` — confirm against its declaration)
 * plus the list of metric panels to render.
 */
type ServeMetricsSectionProps = ClassNameProps & {
// Panels to embed; the component maps over this list and reads each entry's
// `title` and `pathParams`.
metricsConfig: MetricConfig[];
};

export const ServeMetricsSection = ({
className,
metricsConfig,
}: ServeMetricsSectionProps) => {
const classes = useStyles();
const { grafanaHost, prometheusHealth, dashboardUids, dashboardDatasource } =
Expand Down Expand Up @@ -131,7 +162,7 @@ export const ServeMetricsSection = ({
</TextField>
</Paper>
<div className={classes.grafanaEmbedsContainer}>
{METRICS_CONFIG.map(({ title, pathParams }) => {
{metricsConfig.map(({ title, pathParams }) => {
const path =
`/d-solo/${grafanaServeDashboardUid}?${pathParams}` +
`&refresh${timeRangeParams}&var-datasource=${dashboardDatasource}`;
Expand Down
12 changes: 11 additions & 1 deletion dashboard/client/src/pages/serve/ServeSystemDetailPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ import { Outlet } from "react-router-dom";
import Loading from "../../components/Loading";
import { MainNavPageInfo } from "../layout/mainNavContext";
import { useServeDeployments } from "./hook/useServeApplications";
import {
SERVE_SYSTEM_METRICS_CONFIG,
ServeMetricsSection,
} from "./ServeMetricsSection";
import { ServeSystemDetails } from "./ServeSystemDetails";

const useStyles = makeStyles((theme) =>
createStyles({
root: {
Expand All @@ -15,6 +18,9 @@ const useStyles = makeStyles((theme) =>
serveInstanceWarning: {
marginBottom: theme.spacing(2),
},
section: {
marginTop: theme.spacing(4),
},
}),
);

Expand Down Expand Up @@ -53,6 +59,10 @@ export const ServeSystemDetailPage = () => {
setPage={setProxiesPage}
/>
)}
<ServeMetricsSection
className={classes.section}
metricsConfig={SERVE_SYSTEM_METRICS_CONFIG}
/>
</div>
);
};
Expand Down
78 changes: 78 additions & 0 deletions dashboard/modules/metrics/dashboards/serve_dashboard_panels.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,84 @@
stack=False,
grid_pos=GridPos(16, 5, 8, 8),
),
# Serve system panels (ids 20-25).
# The dashboard client embeds these panels by id and title via
# SERVE_SYSTEM_METRICS_CONFIG in
# dashboard/client/src/pages/serve/ServeMetricsSection.tsx; keep the ids and
# titles below in sync with that file.
Panel(
id=20,
title="Ongoing HTTP Requests",
description="The number of ongoing requests in the HTTP Proxy.",
unit="requests",
targets=[
Target(
expr="ray_serve_num_ongoing_http_requests{{{global_filters}}}",
legend="Ongoing HTTP Requests",
),
],
grid_pos=GridPos(0, 6, 8, 8),
),
Panel(
id=21,
title="Ongoing gRPC Requests",
description="The number of ongoing requests in the gRPC Proxy.",
unit="requests",
targets=[
Target(
expr="ray_serve_num_ongoing_grpc_requests{{{global_filters}}}",
legend="Ongoing gRPC Requests",
),
],
grid_pos=GridPos(8, 6, 8, 8),
),
Panel(
id=22,
title="Scheduling Tasks",
description="The number of request scheduling tasks in the router.",
unit="tasks",
targets=[
Target(
expr="ray_serve_num_scheduling_tasks{{{global_filters}}}",
legend="Scheduling Tasks",
),
],
grid_pos=GridPos(16, 6, 8, 8),
),
Panel(
id=23,
title="Scheduling Tasks in Backoff",
description="The number of request scheduling tasks in the router that are undergoing backoff.",
unit="tasks",
targets=[
Target(
expr="ray_serve_num_scheduling_tasks_in_backoff{{{global_filters}}}",
legend="Scheduling Tasks in Backoff",
),
],
grid_pos=GridPos(0, 7, 8, 8),
),
Panel(
id=24,
title="Controller Control Loop Duration",
description="The duration of the last control loop.",
unit="seconds",
targets=[
Target(
expr="ray_serve_controller_control_loop_duration_s{{{global_filters}}}",
legend="Control Loop Duration",
),
],
grid_pos=GridPos(8, 7, 8, 8),
),
Panel(
id=25,
title="Number of Control Loops",
description="The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.",
unit="loops",
targets=[
Target(
expr="ray_serve_controller_num_control_loops{{{global_filters}}}",
legend="Control Loops",
),
],
grid_pos=GridPos(16, 7, 8, 8),
),
]

ids = []
Expand Down
2 changes: 1 addition & 1 deletion doc/source/serve/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ The following metrics are exposed by Ray Serve:
* replica
* application
* model_id
- The mutlplexed model ID registered on the current replica.
- The multiplexed model ID registered on the current replica.
* - ``ray_serve_multiplexed_get_model_requests_counter``
- * deployment
* replica
Expand Down
2 changes: 1 addition & 1 deletion python/ray/serve/_private/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ async def __init__(
# Track the number of times the controller has started
metrics.Counter(
"serve_controller_num_starts",
description="The number of times that controller has started.",
description="The number of times the controller has started.",
).inc()

def reconfigure_global_logging_config(self, global_logging_config: LoggingConfig):
Expand Down

0 comments on commit 67cf06a

Please sign in to comment.