Skip to content

Commit

Permalink
cpudist: Use finish_task_switch kprobe instead of sched_switch tracepoint
Browse files Browse the repository at this point in the history

The `sched_switch` tracepoint approach requires storing the previous
task's tgid in a map and fetching it from there, because it is not
available as a tracepoint argument. Instead, placing a kprobe on the
`finish_task_switch` function allows cleanly fetching the previous
task's pid and tgid from the task_struct.
  • Loading branch information
goldshtn committed Jun 30, 2016
1 parent 3c976bb commit 06d90d3
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 39 deletions.
5 changes: 0 additions & 5 deletions man/man8/cpudist.8
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,6 @@ This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
for efficiency. Despite this, the overhead of this tool may become significant
for some workloads: see the OVERHEAD section.

This tool uses the sched:sched_switch kernel tracepoint to determine when a
task is scheduled and descheduled. If the tracepoint arguments change in the
future, this tool will have to be updated. Still, it is more reliable than
using kprobes on the respective kernel functions directly.

Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
Expand Down
42 changes: 8 additions & 34 deletions tools/cpudist.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,9 @@
countdown = int(args.count)
debug = 0

tp = Tracepoint.enable_tracepoint("sched", "sched_switch")
bpf_text = "#include <uapi/linux/ptrace.h>\n"
bpf_text += "#include <linux/sched.h>\n"
bpf_text += tp.generate_decl()
bpf_text += tp.generate_entry_probe()
bpf_text += tp.generate_struct()
bpf_text = """#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
"""

if not args.offcpu:
bpf_text += "#define ONCPU\n"
Expand All @@ -66,17 +63,8 @@
BPF_HASH(start, u32, u64);
BPF_HASH(tgid_for_pid, u32, u32);
STORAGE
// Look up the tgid previously recorded for `pid` in the tgid_for_pid map.
// Returns the stored tgid when an entry exists; otherwise returns the
// sentinel 0xffffffff so the caller can tell that no mapping is known.
static inline u32 get_tgid(u32 pid)
{
u32 *stored_tgid = tgid_for_pid.lookup(&pid);
if (stored_tgid != 0)
return *stored_tgid;
return 0xffffffff;
}
static inline void store_start(u32 tgid, u32 pid, u64 ts)
{
if (FILTER)
Expand All @@ -99,32 +87,19 @@
STORE
}
int sched_switch(struct pt_regs *ctx)
int sched_switch(struct pt_regs *ctx, struct task_struct *prev)
{
u64 ts = bpf_ktime_get_ns();
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 tgid = pid_tgid >> 32, pid = pid_tgid;
// Keep a mapping of tgid for pid because when sched_switch hits,
// we only have the tgid information for the *current* pid, but not
// for the previous one.
tgid_for_pid.update(&pid, &tgid);
u64 *di = __trace_di.lookup(&pid_tgid);
if (di == 0)
return 0;
struct sched_switch_trace_entry args = {};
bpf_probe_read(&args, sizeof(args), (void *)*di);
#ifdef ONCPU
if (args.prev_state == TASK_RUNNING) {
if (prev->state == TASK_RUNNING) {
#else
if (1) {
#endif
u32 prev_pid = args.prev_pid;
u32 prev_tgid = get_tgid(prev_pid);
if (prev_tgid == 0xffffffff)
goto BAIL;
u32 prev_pid = prev->pid;
u32 prev_tgid = prev->tgid;
#ifdef ONCPU
update_hist(prev_tgid, prev_pid, ts);
#else
Expand Down Expand Up @@ -173,8 +148,7 @@
print(bpf_text)

b = BPF(text=bpf_text)
Tracepoint.attach(b)
b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch")
b.attach_kprobe(event="finish_task_switch", fn_name="sched_switch")

print("Tracing %s-CPU time... Hit Ctrl-C to end." %
("off" if args.offcpu else "on"))
Expand Down

0 comments on commit 06d90d3

Please sign in to comment.