diff --git a/man/man8/offcputime.8 b/man/man8/offcputime.8 index 5fd6261d5d4b..40f787acd9af 100644 --- a/man/man8/offcputime.8 +++ b/man/man8/offcputime.8 @@ -4,17 +4,17 @@ offcputime \- Summarize off-CPU time by kernel stack trace. Uses Linux eBPF/bcc. .SH SYNOPSIS .B offcputime [\-h] [\-u] [\-p PID] [\-v] [\-f] [duration] .SH DESCRIPTION -This program shows kernel stack traces and task names that were blocked and -"off-CPU", and the total duration they were not running: their "off-CPU time". +This program shows stack traces and task names that were blocked and "off-CPU", +and the total duration they were not running: their "off-CPU time". It works by tracing when threads block and when they return to CPU, measuring -both the time they were off-CPU and the blocked kernel stack trace and the -task name. This data is summarized in the kernel using an eBPF map, and by -summing the off-CPU time by unique stack trace and task name. +both the time they were off-CPU and the blocked stack trace and the task name. +This data is summarized in the kernel using an eBPF map, and by summing the +off-CPU time by unique stack trace and task name. -The output summary will help you identify reasons why threads -were blocking, and quantify the time they were off-CPU. This spans all types -of blocking activity: disk I/O, network I/O, locks, page faults, involuntary -context switches, etc. +The output summary will help you identify reasons why threads were blocking, +and quantify the time they were off-CPU. This spans all types of blocking +activity: disk I/O, network I/O, locks, page faults, involuntary context +switches, etc. This is complementary to CPU profiling (e.g., CPU flame graphs) which shows the time spent on-CPU. This shows the time spent off-CPU, and the output, @@ -35,14 +35,20 @@ Print usage message. \-f Print output in folded stack format. .TP +\-p PID +Trace this process ID only (filtered in-kernel). +.TP \-u -Only trace user threads (not kernel threads). +Only trace user threads (no kernel threads). .TP -\-v -Show raw addresses (for non-folded output). +\-k +Only trace kernel threads (no user threads). .TP -\-p PID -Trace this process ID only (filtered in-kernel). +\-U +Show stacks from user space only (no kernel space stacks). +.TP +\-K +Show stacks from kernel space only (no user space stacks). .TP duration Duration to trace, in seconds. diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 25708494fa16..dbc797b388fb 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -180,6 +180,12 @@ static int (*bpf_skb_load_bytes)(void *ctx, int offset, void *to, u32 len) = * * BPF_STACK_TRACE(_name, _size) will allocate space for _size stack traces. * -ENOMEM will be returned when this limit is reached. + * + * -EFAULT is typically returned when requesting user-space stack straces (using + * BPF_F_USER_STACK) for kernel threads. However, a valid stackid may be + * returned in some cases; consider a tracepoint or kprobe executing in the + * kernel context. Given this you can typically ignore -EFAULT errors when + * retrieving user-space stack traces. */ static int (*bpf_get_stackid_)(void *ctx, void *map, u64 flags) = (void *) BPF_FUNC_get_stackid; diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 3a7755aa49fc..804c48102ec4 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -63,9 +63,9 @@ def _check_probe_quota(num_new_probes): if len(open_kprobes) + len(open_uprobes) + num_new_probes > _kprobe_limit: raise Exception("Number of open probes would exceed quota") -class KernelSymbolCache(object): - def __init__(self): - self.cache = lib.bcc_symcache_new(-1) +class SymbolCache(object): + def __init__(self, pid): + self.cache = lib.bcc_symcache_new(pid) def resolve(self, addr): sym = bcc_symbol() @@ -87,14 +87,14 @@ class BPF(object): SCHED_ACT = 4 _probe_repl = re.compile("[^a-zA-Z0-9_]") - _ksym_cache = KernelSymbolCache() + _sym_caches = {} _auto_includes = { - "linux/time.h" : ["time"], - "linux/fs.h" : ["fs", "file"], - "linux/blkdev.h" : ["bio", "request"], - "linux/slab.h" : ["alloc"], - "linux/netdevice.h" : ["sk_buff", "net_device"] + "linux/time.h": ["time"], + "linux/fs.h": ["fs", "file"], + "linux/blkdev.h": ["bio", "request"], + "linux/slab.h": ["alloc"], + "linux/netdevice.h": ["sk_buff", "net_device"] } @classmethod @@ -634,6 +634,30 @@ def trace_print(self, fmt=None): except KeyboardInterrupt: exit() + @staticmethod + def _sym_cache(pid): + """_sym_cache(pid) + + Returns a symbol cache for the specified PID. + The kernel symbol cache is accessed by providing any PID less than zero. + """ + if pid < 0 and pid != -1: + pid = -1 + if not pid in BPF._sym_caches: + BPF._sym_caches[pid] = SymbolCache(pid) + return BPF._sym_caches[pid] + + @staticmethod + def sym(addr, pid): + """sym(addr, pid) + + Translate a memory address into a function name for a pid, which is + returned. + A pid of less than zero will access the kernel symbol cache. + """ + name, _ = BPF._sym_cache(pid).resolve(addr) + return name + @staticmethod def ksym(addr): """ksym(addr) @@ -641,8 +665,7 @@ def ksym(addr): Translate a kernel memory address into a kernel function name, which is returned. """ - name, _ = BPF._ksym_cache.resolve(addr) - return name + return BPF.sym(addr, -1) @staticmethod def ksymaddr(addr): @@ -652,7 +675,7 @@ def ksymaddr(addr): instruction offset as a hexidecimal number, which is returned as a string. """ - name, offset = BPF._ksym_cache.resolve(addr) + name, offset = BPF._sym_cache(-1).resolve(addr) return "%s+0x%x" % (name, offset) @staticmethod @@ -661,7 +684,7 @@ def ksymname(name): Translate a kernel name into an address. This is the reverse of ksymaddr. Returns -1 when the function name is unknown.""" - return BPF._ksym_cache.resolve_name(name) + return BPF._sym_cache(-1).resolve_name(name) @staticmethod def num_open_kprobes(): diff --git a/tools/offcputime.py b/tools/offcputime.py index 751349fd9388..185f2cf35ccb 100755 --- a/tools/offcputime.py +++ b/tools/offcputime.py @@ -1,9 +1,9 @@ #!/usr/bin/python # -# offcputime Summarize off-CPU time by kernel stack trace +# offcputime Summarize off-CPU time by stack trace # For Linux, uses BCC, eBPF. # -# USAGE: offcputime [-h] [-p PID | -u | -k] [-f] [duration] +# USAGE: offcputime [-h] [-p PID | -u | -k] [-U | -K] [-f] [duration] # # Copyright 2016 Netflix, Inc. # Licensed under the Apache License, Version 2.0 (the "License") @@ -42,18 +42,25 @@ def positive_nonzero_int(val): ./offcputime -p 185 # only trace threads for PID 185 ./offcputime -u # only trace user threads (no kernel) ./offcputime -k # only trace kernel threads (no user) + ./offcputime -U # only show user space stacks (no kernel) + ./offcputime -K # only show kernel space stacks (no user) """ parser = argparse.ArgumentParser( - description="Summarize off-CPU time by kernel stack trace", + description="Summarize off-CPU time by stack trace", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) thread_group = parser.add_mutually_exclusive_group() thread_group.add_argument("-p", "--pid", type=positive_int, help="trace this PID only") -thread_group.add_argument("-k", "--kernel-threads-only", action="store_true", - help="kernel threads only (no user threads)") thread_group.add_argument("-u", "--user-threads-only", action="store_true", help="user threads only (no kernel threads)") +thread_group.add_argument("-k", "--kernel-threads-only", action="store_true", + help="kernel threads only (no user threads)") +stack_group = parser.add_mutually_exclusive_group() +stack_group.add_argument("-U", "--user-stacks-only", action="store_true", + help="show stacks from user space only (no kernel space stacks)") +stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true", + help="show stacks from kernel space only (no user space stacks)") parser.add_argument("-f", "--folded", action="store_true", help="output folded format") parser.add_argument("--stack-storage-size", default=1024, @@ -79,8 +86,10 @@ def signal_ignore(signal, frame): #define MINBLOCK_US 1 struct key_t { + u32 pid; + int user_stack_id; + int kernel_stack_id; char name[TASK_COMM_LEN]; - int stack_id; }; BPF_HASH(counts, struct key_t); BPF_HASH(start, u32); @@ -97,23 +106,29 @@ def signal_ignore(signal, frame): start.update(&pid, &ts); } - // calculate current thread's delta time + // get the current thread's start time pid = bpf_get_current_pid_tgid(); tsp = start.lookup(&pid); - if (tsp == 0) + if (tsp == 0) { return 0; // missed start or filtered + } + + // calculate current thread's delta time u64 delta = bpf_ktime_get_ns() - *tsp; start.delete(&pid); delta = delta / 1000; - if (delta < MINBLOCK_US) + if (delta < MINBLOCK_US) { return 0; + } // create map key u64 zero = 0, *val; struct key_t key = {}; + key.pid = pid; + key.user_stack_id = USER_STACK_GET; + key.kernel_stack_id = KERNEL_STACK_GET; bpf_get_current_comm(&key.name, sizeof(key.name)); - key.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); val = counts.lookup_or_init(&key, &zero); (*val) += delta; @@ -140,6 +155,29 @@ def signal_ignore(signal, frame): # set stack storage size bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) +# handle stack args +kernel_stack_get = "stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID)" +user_stack_get = \ + "stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID | BPF_F_USER_STACK)" +stack_context = "" +if args.user_stacks_only: + stack_context = "user" + kernel_stack_get = "-1" +elif args.kernel_stacks_only: + stack_context = "kernel" + user_stack_get = "-1" +else: + stack_context = "user + kernel" +bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get) +bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get) + +# check for an edge case; the code below will handle this case correctly +# but ultimately nothing will be displayed +if args.kernel_threads_only and args.user_stacks_only: + print("ERROR: Displaying user stacks for kernel threads " \ + "doesn't make sense.", file=stderr) + exit(1) + # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="finish_task_switch", fn_name="oncpu") @@ -150,8 +188,8 @@ def signal_ignore(signal, frame): # header if not folded: - print("Tracing off-CPU time (us) of %s by kernel stack" % - thread_context, end="") + print("Tracing off-CPU time (us) of %s by %s stack" % + (thread_context, stack_context), end="") if duration < 99999999: print(" for %d secs." % duration) else: @@ -172,25 +210,35 @@ def signal_ignore(signal, frame): stack_traces = b.get_table("stack_traces") for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): # handle get_stackid erorrs - if k.stack_id < 0: + if (not args.user_stacks_only and k.kernel_stack_id < 0) or \ + (not args.kernel_stacks_only and k.user_stack_id < 0 and \ + k.user_stack_id != -14): missing_stacks += 1 # check for an ENOMEM error - if k.stack_id == -12: + if k.kernel_stack_id == -12 or k.user_stack_id == -12: has_enomem = True continue - stack = stack_traces.walk(k.stack_id) + user_stack = [] if k.user_stack_id < 0 else \ + stack_traces.walk(k.user_stack_id) + kernel_stack = [] if k.kernel_stack_id < 0 else \ + stack_traces.walk(k.kernel_stack_id) if folded: # print folded stack output - stack = list(stack)[1:] - line = [k.name.decode()] + [b.ksym(addr) for addr in reversed(stack)] + user_stack = list(user_stack)[1:] + kernel_stack = list(kernel_stack)[1:] + line = [k.name.decode()] + \ + [b.ksym(addr) for addr in reversed(kernel_stack)] + \ + [b.sym(addr, k.pid) for addr in reversed(user_stack)] print("%s %d" % (";".join(line), v.value)) else: # print default multi-line stack output - for addr in stack: - print(" %-16x %s" % (addr, b.ksym(addr))) - print(" %-16s %s" % ("-", k.name)) + for addr in user_stack: + print(" %016x %s" % (addr, b.sym(addr, k.pid))) + for addr in kernel_stack: + print(" %016x %s" % (addr, b.ksym(addr))) + print(" %-16s %s (%d)" % ("-", k.name, k.pid)) print(" %d\n" % v.value) if missing_stacks > 0: diff --git a/tools/offcputime_example.txt b/tools/offcputime_example.txt index fb984c65707c..ee0c6edb75d7 100644 --- a/tools/offcputime_example.txt +++ b/tools/offcputime_example.txt @@ -4,15 +4,16 @@ Demonstrations of offcputime, the Linux eBPF/bcc version. This program shows stack traces that were blocked, and the total duration they were blocked. It works by tracing when threads block and when they return to CPU, measuring both the time they were blocked (aka the "off-CPU time") and the -blocked kernel stack trace and the task name. This data is summarized in kernel -by summing the blocked time by unique stack trace and task name. +blocked stack trace and the task name. This data is summarized in kernel by +summing the blocked time by unique stack trace and task name. -Here is some example output. To explain what we are seeing: the very first -stack trace looks like a page fault (do_page_fault() etc) from the "chmod" -command, and in total was off-CPU for 13 microseconds. +Here is some example output. The -K option was used to only match kernel stacks. +To explain what we are seeing: the very first stack trace looks like a page +fault (do_page_fault() etc) from the "chmod" command, and in total was off-CPU +for 13 microseconds. -# ./offcputime -Tracing off-CPU time (us) by kernel stack... Hit Ctrl-C to end. +# ./offcputime -K +Tracing off-CPU time (us) of all threads by kernel stack... Hit Ctrl-C to end. ^C schedule schedule_timeout @@ -587,7 +588,7 @@ Tracing off-CPU time (us) by kernel stack... Hit Ctrl-C to end. 81670888 The last few stack traces aren't very interesting, since they are threads that -are ofter blocked off-CPU waiting for work. +are often blocked off-CPU waiting for work. Do be somewhat careful with overhead: this is tracing scheduler functions, which can be called very frequently. While this uses in-kernel summaries for @@ -599,8 +600,8 @@ the overhead will be measurable. A -p option can be used to filter (in-kernel) on a single process ID. For example, only matching PID 26651, which is a running "dd" command: -# ./offcputime -p 26651 -Tracing off-CPU time (us) by kernel stack... Hit Ctrl-C to end. +# ./offcputime -K -p 26651 +Tracing off-CPU time (us) of all threads by kernel stack... Hit Ctrl-C to end. ^C schedule schedule_timeout @@ -623,8 +624,8 @@ total of 2.4 seconds during tracing. A duration can be added, for example, tracing for 5 seconds only: -# ./offcputime -p 26651 5 -Tracing off-CPU time (us) by kernel stack for 5 secs. +# ./offcputime -K -p 26651 5 +Tracing off-CPU time (us) of all threads by kernel stack for 5 secs. schedule schedule_timeout @@ -658,7 +659,7 @@ A -f option will emit output using the "folded stacks" format, which can be read directly by flamegraph.pl from the FlameGraph open source software (https://github.com/brendangregg/FlameGraph). Eg: -# ./offcputime -f 5 +# ./offcputime -K -f 5 bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;tty_read;n_tty_read;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 8 yes;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 14 run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;__lock_page_or_retry;wait_on_page_bit_killable;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 33 @@ -718,11 +719,11 @@ creating your "off-CPU time flame graphs". USAGE message: # ./offcputime -h -usage: offcputime.py [-h] [-p PID | -k | -u] [-f] +usage: offcputime.py [-h] [-p PID | -k | -u] [-K | -U] [-f] [--stack-storage-size STACK_STORAGE_SIZE] [duration] -Summarize off-CPU time by kernel stack trace +Summarize off-CPU time by stack trace positional arguments: duration duration of trace, in seconds @@ -730,10 +731,16 @@ positional arguments: optional arguments: -h, --help show this help message and exit -p PID, --pid PID trace this PID only - -k, --kernel-threads-only - kernel threads only (no user threads) -u, --user-threads-only user threads only (no kernel threads) + -k, --kernel-threads-only + kernel threads only (no user threads) + -U, --user-stacks-only + show stacks from user space only (no kernel space + stacks) + -K, --kernel-stacks-only + show stacks from kernel space only (no user space + stacks) -f, --folded output folded format --stack-storage-size STACK_STORAGE_SIZE the number of unique stack traces that can be stored @@ -746,3 +753,5 @@ examples: ./offcputime -p 185 # only trace threads for PID 185 ./offcputime -u # only trace user threads (no kernel) ./offcputime -k # only trace kernel threads (no user) + ./offcputime -U # only show user space stacks (no kernel) + ./offcputime -K # only show kernel space stacks (no user)