From 40975ab372a49ed05c6c77695e14490f41c8bf8a Mon Sep 17 00:00:00 2001 From: Sasha Goldshtein Date: Wed, 29 Jun 2016 03:57:01 +0300 Subject: [PATCH] cpudist: Summarize task on-CPU time as histograms (#585) * cpudist: summarize on-CPU time per task as a histogram This is the initial implementation of the tool itself, which uses the sched:sched_switch tracepoint to probe task switches. This is a slightly more robust approach than using a kernel function, such as finish_task_switch. When native BCC support for tracepoints is introduced, this tool can be reimplemented and reliance on the current Tracepoint module can be removed. * cpudist: add man page and examples --- README.md | 1 + man/man8/cpudist.8 | 100 +++++++++++++++ tools/cpudist.py | 167 ++++++++++++++++++++++++ tools/cpudist_example.txt | 259 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 527 insertions(+) create mode 100644 man/man8/cpudist.8 create mode 100755 tools/cpudist.py create mode 100644 tools/cpudist_example.txt diff --git a/README.md b/README.md index 4b78876c31c3..6cbdd59b5f60 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ Examples: - tools/[btrfsdist](tools/btrfsdist.py): Summarize btrfs operation latency distribution as a histogram. [Examples](tools/btrfsdist_example.txt). - tools/[btrfsslower](tools/btrfsslower.py): Trace slow btrfs operations. [Examples](tools/btrfsslower_example.txt). - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt). +- tools/[cpudist](tools/cpudist.py): Summarize on-CPU time per task as a histogram. [Examples](tools/cpudist_example.txt) - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt). - tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt). - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt). diff --git a/man/man8/cpudist.8 b/man/man8/cpudist.8 new file mode 100644 index 000000000000..cee5165a237c --- /dev/null +++ b/man/man8/cpudist.8 @@ -0,0 +1,100 @@ +.TH cpudist 8 "2016-06-28" "USER COMMANDS" +.SH NAME +cpudist \- On-CPU task time as a histogram. +.SH SYNOPSIS +.B cpudist [\-h] [\-T] [\-m] [\-P] [\-L] [\-p PID] [interval] [count] +.SH DESCRIPTION +This measures the time a task spends on the CPU before being descheduled, and +shows the times as a histogram. Tasks that spend a very short time on the CPU +can be indicative of excessive context-switches and poor workload distribution, +and possibly point to a shared source of contention that keeps tasks switching +in and out as it becomes available (such as a mutex). + +This tool uses in-kernel eBPF maps for storing timestamps and the histogram, +for efficiency. Despite this, the overhead of this tool may become significant +for some workloads: see the OVERHEAD section. + +This tool uses the sched:sched_switch kernel tracepoint to determine when a +task is scheduled and descheduled. If the tracepoint arguments change in the +future, this tool will have to be updated. Still, it is more reliable than +using kprobes on the respective kernel functions directly. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Include timestamps on output. +.TP +\-m +Output histogram in milliseconds. +.TP +\-P +Print a histogram for each PID (tgid from the kernel's perspective). +.TP +\-L +Print a histogram for each TID (pid from the kernel's perspective). +.TP +\-p PID +Only show this PID (filtered in kernel for efficiency). +.TP +interval +Output interval, in seconds. +.TP +count +Number of outputs. +.SH EXAMPLES +.TP +Summarize task on-CPU time as a histogram: +# +.B cpudist +.TP +Print 1 second summaries, 10 times: +# +.B cpudist 1 10 +.TP +Print 1 second summaries, using milliseconds as units for the histogram, and include timestamps on output: +# +.B cpudist \-mT 1 +.TP +Trace PID 186 only, 1 second summaries: +# +.B cpudist -P 185 1 +.SH FIELDS +.TP +usecs +Microsecond range +.TP +msecs +Millisecond range +.TP +count +How many times a task event fell into this range +.TP +distribution +An ASCII bar chart to visualize the distribution (count column) +.SH OVERHEAD +This traces scheduler tracepoints, which can become very frequent. While eBPF +has very low overhead, and this tool uses in-kernel maps for efficiency, the +frequency of scheduler events for some workloads may be high enough that the +overhead of this tool becomes significant. Measure in a lab environment +to quantify the overhead before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _example.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Sasha Goldshtein +.SH SEE ALSO +pidstat(1), runqlat(8) diff --git a/tools/cpudist.py b/tools/cpudist.py new file mode 100755 index 000000000000..560ad92f10c8 --- /dev/null +++ b/tools/cpudist.py @@ -0,0 +1,167 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# cpudist Summarize on-CPU time per task as a histogram. +# +# USAGE: cpudist [-h] [-T] [-m] [-P] [-L] [-p PID] [interval] [count] +# +# This measures the time a task spends on the CPU, and shows this time as a +# histogram, optionally per-process. +# +# Copyright 2016 Sasha Goldshtein +# Licensed under the Apache License, Version 2.0 (the "License") + +from __future__ import print_function +from bcc import BPF, Tracepoint +from time import sleep, strftime +import argparse + +examples = """examples: + cpudist # summarize on-CPU time as a histogram + cpudist 1 10 # print 1 second summaries, 10 times + cpudist -mT 1 # 1s summaries, milliseconds, and timestamps + cpudist -P # show each PID separately + cpudist -p 185 # trace PID 185 only +""" +parser = argparse.ArgumentParser( + description="Summarize on-CPU time per task as a histogram.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--timestamp", action="store_true", + help="include timestamp on output") +parser.add_argument("-m", "--milliseconds", action="store_true", + help="millisecond histogram") +parser.add_argument("-P", "--pids", action="store_true", + help="print a histogram per process ID") +parser.add_argument("-L", "--tids", action="store_true", + help="print a histogram per thread ID") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("interval", nargs="?", default=99999999, + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +countdown = int(args.count) +debug = 0 + +tp = Tracepoint.enable_tracepoint("sched", "sched_switch") +bpf_text = "#include \n" +bpf_text += "#include \n" +bpf_text += tp.generate_decl() +bpf_text += tp.generate_entry_probe() +bpf_text += tp.generate_struct() + +bpf_text += """ +typedef struct pid_key { + u64 id; + u64 slot; +} pid_key_t; + +// We need to store the start time, which is when the thread got switched in, +// and the tgid for the pid because the sched_switch tracepoint doesn't provide +// that information. +BPF_HASH(start, u32, u64); +BPF_HASH(tgid_for_pid, u32, u32); +STORAGE + +int sched_switch(struct pt_regs *ctx) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + u64 *di = __trace_di.lookup(&pid_tgid); + if (di == 0) + return 0; + + struct sched_switch_trace_entry args = {}; + bpf_probe_read(&args, sizeof(args), (void *)*di); + + u32 tgid, pid; + u64 ts = bpf_ktime_get_ns(); + + if (args.prev_state == TASK_RUNNING) { + pid = args.prev_pid; + + u32 *stored_tgid = tgid_for_pid.lookup(&pid); + if (stored_tgid == 0) + goto BAIL; + tgid = *stored_tgid; + + if (FILTER) + goto BAIL; + + u64 *tsp = start.lookup(&pid); + if (tsp == 0) + goto BAIL; + + u64 delta = ts - *tsp; + FACTOR + STORE + } + +BAIL: + tgid = pid_tgid >> 32; + pid = pid_tgid; + if (FILTER) + return 0; + + start.update(&pid, &ts); + tgid_for_pid.update(&pid, &tgid); + + return 0; +} +""" + +if args.pid: + bpf_text = bpf_text.replace('FILTER', 'tgid != %s' % args.pid) +else: + bpf_text = bpf_text.replace('FILTER', '0') +if args.milliseconds: + bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;') + label = "msecs" +else: + bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;') + label = "usecs" +if args.pids or args.tids: + section = "pid" + pid = "tgid" + if args.tids: + pid = "pid" + section = "tid" + bpf_text = bpf_text.replace('STORAGE', + 'BPF_HISTOGRAM(dist, pid_key_t);') + bpf_text = bpf_text.replace('STORE', + 'pid_key_t key = {.id = ' + pid + ', .slot = bpf_log2l(delta)}; ' + + 'dist.increment(key);') +else: + section = "" + bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);') + bpf_text = bpf_text.replace('STORE', + 'dist.increment(bpf_log2l(delta));') +if debug: + print(bpf_text) + +b = BPF(text=bpf_text) +Tracepoint.attach(b) +b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch") + +print("Tracing on-CPU time... Hit Ctrl-C to end.") + +exiting = 0 if args.interval else 1 +dist = b.get_table("dist") +while (1): + try: + sleep(int(args.interval)) + except KeyboardInterrupt: + exiting = 1 + + print() + if args.timestamp: + print("%-8s\n" % strftime("%H:%M:%S"), end="") + + dist.print_log2_hist(label, section, section_print_fn=int) + dist.clear() + + countdown -= 1 + if exiting or countdown == 0: + exit() + diff --git a/tools/cpudist_example.txt b/tools/cpudist_example.txt new file mode 100644 index 000000000000..596c89b7a234 --- /dev/null +++ b/tools/cpudist_example.txt @@ -0,0 +1,259 @@ +Demonstrations of cpudist. + +This program summarizes task on-CPU time as a histogram, showing how long tasks +spent on the CPU before being descheduled. This provides valuable information +that can indicate oversubscription (too many tasks for too few processors), +overhead due to excessive context switching (e.g. a common shared lock for +multiple threads), uneven workload distribution, too-granular tasks, and more. + +# ./cpudist.py +Tracing on-CPU time... Hit Ctrl-C to end. +^C + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 1 | | + 4 -> 7 : 1 | | + 8 -> 15 : 13 |** | + 16 -> 31 : 187 |****************************************| + 32 -> 63 : 89 |******************* | + 64 -> 127 : 26 |***** | + 128 -> 255 : 0 | | + 256 -> 511 : 1 | | + +This is from a mostly idle system. Tasks wake up occasionally and run for only +a few dozen microseconds, and then get descheduled. + +Here's some output from a system that is heavily loaded by threads that perform +computation but also compete for a lock: + +# ./cpudist.py +Tracing on-CPU time... Hit Ctrl-C to end. +^C + usecs : count distribution + 0 -> 1 : 51 |* | + 2 -> 3 : 395 |*********** | + 4 -> 7 : 259 |******* | + 8 -> 15 : 61 |* | + 16 -> 31 : 75 |** | + 32 -> 63 : 31 | | + 64 -> 127 : 7 | | + 128 -> 255 : 5 | | + 256 -> 511 : 3 | | + 512 -> 1023 : 5 | | + 1024 -> 2047 : 6 | | + 2048 -> 4095 : 4 | | + 4096 -> 8191 : 1361 |****************************************| + 8192 -> 16383 : 523 |*************** | + 16384 -> 32767 : 3 | | + +A bimodal distribution is now clearly visible. Most of the time, tasks were +able to run for 4-16ms before being descheduled (this is likely the quantum +length). Occasionally, tasks had to be descheduled a lot earlier -- possibly +because they competed for a shared lock. + +If necessary, you can restrict the output to include only threads from a +particular process -- this helps reduce noise: + +# ./cpudist.py -p $(pidof parprimes) +Tracing on-CPU time... Hit Ctrl-C to end. +^C + usecs : count distribution + 0 -> 1 : 3 | | + 2 -> 3 : 17 | | + 4 -> 7 : 39 | | + 8 -> 15 : 52 |* | + 16 -> 31 : 43 | | + 32 -> 63 : 12 | | + 64 -> 127 : 13 | | + 128 -> 255 : 0 | | + 256 -> 511 : 1 | | + 512 -> 1023 : 11 | | + 1024 -> 2047 : 15 | | + 2048 -> 4095 : 41 | | + 4096 -> 8191 : 1134 |************************ | + 8192 -> 16383 : 1883 |****************************************| + 16384 -> 32767 : 65 |* | + +You can also ask for output at predefined intervals, and include timestamps for +easier interpretation. While we're at it, the -P switch will print a histogram +separately for each process: + +# ./cpudist.py -TP 5 3 +Tracing on-CPU time... Hit Ctrl-C to end. + +03:46:51 + +pid = 0 + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 1 |** | + 4 -> 7 : 17 |********************************** | + 8 -> 15 : 11 |********************** | + 16 -> 31 : 20 |****************************************| + 32 -> 63 : 15 |****************************** | + 64 -> 127 : 9 |****************** | + 128 -> 255 : 6 |************ | + 256 -> 511 : 0 | | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 1 |** | + +pid = 5068 + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 1 |************* | + 4 -> 7 : 3 |****************************************| + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 1 |************* | + +03:46:56 + +pid = 0 + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 1 |** | + 4 -> 7 : 19 |****************************************| + 8 -> 15 : 11 |*********************** | + 16 -> 31 : 9 |****************** | + 32 -> 63 : 3 |****** | + 64 -> 127 : 1 |** | + 128 -> 255 : 3 |****** | + 256 -> 511 : 0 | | + 512 -> 1023 : 1 |** | + +pid = 5068 + usecs : count distribution + 0 -> 1 : 1 |******************** | + 2 -> 3 : 0 | | + 4 -> 7 : 2 |****************************************| + +03:47:01 + +pid = 0 + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 12 |******************************** | + 8 -> 15 : 15 |****************************************| + 16 -> 31 : 15 |****************************************| + 32 -> 63 : 0 | | + 64 -> 127 : 3 |******** | + 128 -> 255 : 1 |** | + 256 -> 511 : 0 | | + 512 -> 1023 : 1 |** | + +pid = 5068 + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 1 |****** | + 4 -> 7 : 6 |****************************************| + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 2 |************* | + +This histogram was obtained while executing `dd if=/dev/zero of=/dev/null` with +fairly large block sizes. + +Finally, let's ask for a per-thread report and values in milliseconds instead +of microseconds: + +# ./cpudist.py -p $(pidof parprimes) -mL +Tracing on-CPU time... Hit Ctrl-C to end. + + +tid = 5092 + msecs : count distribution + 0 -> 1 : 3 | | + 2 -> 3 : 4 | | + 4 -> 7 : 4 | | + 8 -> 15 : 535 |****************************************| + 16 -> 31 : 14 |* | + +tid = 5093 + msecs : count distribution + 0 -> 1 : 8 | | + 2 -> 3 : 6 | | + 4 -> 7 : 4 | | + 8 -> 15 : 534 |****************************************| + 16 -> 31 : 12 | | + +tid = 5094 + msecs : count distribution + 0 -> 1 : 38 |*** | + 2 -> 3 : 5 | | + 4 -> 7 : 5 | | + 8 -> 15 : 476 |****************************************| + 16 -> 31 : 25 |** | + +tid = 5095 + msecs : count distribution + 0 -> 1 : 31 |** | + 2 -> 3 : 6 | | + 4 -> 7 : 10 | | + 8 -> 15 : 478 |****************************************| + 16 -> 31 : 20 |* | + +tid = 5096 + msecs : count distribution + 0 -> 1 : 21 |* | + 2 -> 3 : 5 | | + 4 -> 7 : 4 | | + 8 -> 15 : 523 |****************************************| + 16 -> 31 : 16 |* | + +tid = 5097 + msecs : count distribution + 0 -> 1 : 11 | | + 2 -> 3 : 7 | | + 4 -> 7 : 7 | | + 8 -> 15 : 502 |****************************************| + 16 -> 31 : 23 |* | + +tid = 5098 + msecs : count distribution + 0 -> 1 : 21 |* | + 2 -> 3 : 5 | | + 4 -> 7 : 3 | | + 8 -> 15 : 494 |****************************************| + 16 -> 31 : 28 |** | + +tid = 5099 + msecs : count distribution + 0 -> 1 : 15 |* | + 2 -> 3 : 4 | | + 4 -> 7 : 6 | | + 8 -> 15 : 521 |****************************************| + 16 -> 31 : 12 | | + +It looks like all threads are more-or-less equally busy, and are typically +switched out after running for 8-15 milliseconds (again, this is the typical +quantum length). + + +USAGE message: + +# ./cpudist.py -h + +usage: cpudist.py [-h] [-T] [-m] [-P] [-L] [-p PID] [interval] [count] + +Summarize on-CPU time per task as a histogram. + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --timestamp include timestamp on output + -m, --milliseconds millisecond histogram + -P, --pids print a histogram per process ID + -L, --tids print a histogram per thread ID + -p PID, --pid PID trace this PID only + +examples: + cpudist # summarize on-CPU time as a histogram + cpudist 1 10 # print 1 second summaries, 10 times + cpudist -mT 1 # 1s summaries, milliseconds, and timestamps + cpudist -P # show each PID separately + cpudist -p 185 # trace PID 185 only