Skip to content

Commit

Permalink
[UI] [Job Detail Page] [Task Table] Allow profiling / stack trace view (
Browse files Browse the repository at this point in the history
ray-project#37043)

Frontend: Add CPU flame graph and stack trace to the jobDetail/taskTable for the running task.
Backend: Add two routes for task-level profiling and stack traces (task/cpu_profile and task/traceback).
Backend: After syncing with @rickyyx, removed the optimization that distributed some filters in GCS to decrease filtering latency.
Note: The profiling and stack trace views are supported for both sync and async tasks.

Job Detail/CPU profile

Signed-off-by: Victor <[email protected]>
  • Loading branch information
chaowanggg authored and Victor committed Oct 11, 2023
1 parent b077f0a commit 1dfd758
Show file tree
Hide file tree
Showing 8 changed files with 508 additions and 9 deletions.
47 changes: 46 additions & 1 deletion dashboard/client/src/common/ProfilingLink.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,52 @@ type CpuProfilingLinkProps = PropsWithChildren<
} & ClassNameProps
>;

/** Props shared by the task-level CPU flame graph and stack trace links. */
type TaskProfilingStackTraceProps = {
/** Sent as the `task_id` query parameter; the links render nothing when this is falsy. */
taskId: string | null | undefined;
/** Sent as the `attempt_number` query parameter. */
attemptNumber: number;
/** Sent as the `node_id` query parameter. */
nodeId: string;
};

/**
 * Link that opens the `task/cpu_profile` endpoint in a new tab to show a CPU
 * flame graph for one attempt of a task.
 *
 * @param taskId - Task to profile; when falsy (null, undefined or empty) the
 *   component renders nothing.
 * @param attemptNumber - Attempt of the task, sent as `attempt_number`.
 * @param nodeId - Node identifier, sent as `node_id`.
 * @returns The link element, or `null` when `taskId` is missing.
 */
export const TaskCpuProfilingLink = ({
  taskId,
  attemptNumber,
  nodeId,
}: TaskProfilingStackTraceProps) => {
  if (!taskId) {
    return null;
  }
  // Percent-encode every value so reserved characters (&, #, =, spaces) in an
  // identifier cannot truncate or corrupt the query string.
  const query = [
    `task_id=${encodeURIComponent(taskId)}`,
    `attempt_number=${encodeURIComponent(attemptNumber)}`,
    `node_id=${encodeURIComponent(nodeId)}`,
  ].join("&");
  return (
    <Link
      href={`task/cpu_profile?${query}`}
      target="_blank"
      title="Profile the Python worker for 5 seconds (default) and display a CPU flame graph."
      rel="noreferrer"
    >
      CPU&nbsp;Flame&nbsp;Graph
    </Link>
  );
};

/**
 * Link that opens the `task/traceback` endpoint in a new tab to show a sampled
 * Python stack trace for one attempt of a task.
 *
 * @param taskId - Task to inspect; when falsy (null, undefined or empty) the
 *   component renders nothing.
 * @param attemptNumber - Attempt of the task, sent as `attempt_number`.
 * @param nodeId - Node identifier, sent as `node_id`.
 * @returns The link element, or `null` when `taskId` is missing.
 */
export const TaskCpuStackTraceLink = ({
  taskId,
  attemptNumber,
  nodeId,
}: TaskProfilingStackTraceProps) => {
  if (!taskId) {
    return null;
  }
  // Percent-encode every value so reserved characters (&, #, =, spaces) in an
  // identifier cannot truncate or corrupt the query string.
  const query = [
    `task_id=${encodeURIComponent(taskId)}`,
    `attempt_number=${encodeURIComponent(attemptNumber)}`,
    `node_id=${encodeURIComponent(nodeId)}`,
  ].join("&");
  return (
    <Link
      href={`task/traceback?${query}`}
      target="_blank"
      title="Sample the current Python stack trace for this worker."
      rel="noreferrer"
    >
      Stack&nbsp;Trace
    </Link>
  );
};

export const CpuProfilingLink = ({
pid,
ip,
Expand All @@ -18,7 +64,6 @@ export const CpuProfilingLink = ({
if (!pid || !ip || typeof pid === "undefined" || typeof ip === "undefined") {
return <div></div>;
}

return (
<Link
href={`worker/traceback?pid=${pid}&ip=${ip}&native=0`}
Expand Down
27 changes: 26 additions & 1 deletion dashboard/client/src/components/TaskTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,16 @@ import { Link as RouterLink } from "react-router-dom";
import { CodeDialogButton } from "../common/CodeDialogButton";
import { DurationText } from "../common/DurationText";
import { ActorLink, NodeLink } from "../common/links";
import {
TaskCpuProfilingLink,
TaskCpuStackTraceLink,
} from "../common/ProfilingLink";
import rowStyles from "../common/RowStyles";
import { Task } from "../type/task";
import { useFilter } from "../util/hook";
import StateCounter from "./StatesCounter";
import { StatusChip } from "./StatusChip";
import { HelpInfo } from "./Tooltip";

export type TaskTableProps = {
tasks: Task[];
jobId?: string;
Expand Down Expand Up @@ -71,6 +74,10 @@ const TaskTable = ({
tasks.
<br />- Error: For tasks that have failed, show a stack trace for the
faiure.
<br /> Stack Trace: Get a stacktrace of the worker process where the
task is running.
<br />- CPU Flame Graph: Get a flame graph of the next 5 seconds of
the worker process where the task is running.
</Typography>
),
},
Expand Down Expand Up @@ -330,11 +337,29 @@ const TaskTableActions = ({ task }: TaskTableActionsProps) => {
? `Error Type: ${task.error_type}\n\n${task.error_message}`
: undefined;

const isTaskActive = task.state === "RUNNING" && task.worker_id;

return (
<React.Fragment>
<Link component={RouterLink} to={`tasks/${task.task_id}`}>
Log
</Link>
{isTaskActive && (
<React.Fragment>
<br />
<TaskCpuProfilingLink
taskId={task.task_id}
attemptNumber={task.attempt_number}
nodeId={task.node_id}
/>
<br />
<TaskCpuStackTraceLink
taskId={task.task_id}
attemptNumber={task.attempt_number}
nodeId={task.node_id}
/>
</React.Fragment>
)}
<br />

{errorDetails && (
Expand Down
29 changes: 29 additions & 0 deletions dashboard/client/src/pages/task/TaskPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ import {
MultiTabLogViewer,
MultiTabLogViewerTabDetails,
} from "../../common/MultiTabLogViewer";
import {
TaskCpuProfilingLink,
TaskCpuStackTraceLink,
} from "../../common/ProfilingLink";
import { Section } from "../../common/Section";
import Loading from "../../components/Loading";
import { MetadataSection } from "../../components/MetadataSection";
Expand Down Expand Up @@ -78,6 +82,7 @@ const TaskPageContents = ({
}

const {
attempt_number,
task_id,
actor_id,
end_time_ms,
Expand All @@ -92,6 +97,7 @@ const TaskPageContents = ({
func_or_class_name,
name,
} = task;
const isTaskActive = task.state === "RUNNING" && task.worker_id;

return (
<div>
Expand Down Expand Up @@ -217,6 +223,29 @@ const TaskPageContents = ({
}
),
},
isTaskActive
? {
label: "Actions",
content: (
<React.Fragment>
<TaskCpuProfilingLink
taskId={task_id}
attemptNumber={attempt_number}
nodeId={node_id}
/>
<br />
<TaskCpuStackTraceLink
taskId={task_id}
attemptNumber={attempt_number}
nodeId={node_id}
/>
</React.Fragment>
),
}
: {
label: "",
content: undefined,
},
]}
/>
<CollapsibleSection title="Logs" startExpanded>
Expand Down
1 change: 1 addition & 0 deletions dashboard/client/src/type/task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export enum TypeTaskType {
export type Task = {
task_id: string;
name: string;
attempt_number: number;
state: TypeTaskStatus;
job_id: string;
node_id: string;
Expand Down
2 changes: 1 addition & 1 deletion dashboard/modules/log/log_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ async def _stream_log_in_chunk(
"""
assert "b" in file.mode, "Only binary file is supported."
assert not (
keep_alive_interval_sec >= 0 and end_offset is not -1
keep_alive_interval_sec >= 0 and end_offset != -1
), "Keep-alive is not allowed when specifying an end offset"

file.seek(start_offset, 0)
Expand Down
Loading

0 comments on commit 1dfd758

Please sign in to comment.