diff --git a/man/man8/stackcount.8 b/man/man8/stackcount.8 index 10999e237668..d6ab993af976 100644 --- a/man/man8/stackcount.8 +++ b/man/man8/stackcount.8 @@ -2,14 +2,14 @@ .SH NAME stackcount \- Count function calls and their stack traces. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-r] [\-s] - [\-P] [\-v] [\-d] pattern +.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-D DURATION] [\-T] [\-r] [\-s] + [\-P] [\-K] [\-U] [\-v] [\-d] [\-f] pattern .SH DESCRIPTION stackcount traces functions and frequency counts them with their entire -stack trace, summarized in-kernel for efficiency. This allows higher -frequency events to be studied. The output consists of unique stack traces, -and their occurrence counts. In addition to kernel and user functions, kernel -tracepoints and USDT tracepoint are also supported. +stack trace, kernel stack and user stack, summarized in-kernel for efficiency. +This allows higher frequency events to be studied. The output consists of +unique stack traces, and their occurrence counts. In addition to kernel and +user functions, kernel tracepoints and USDT tracepoint are also supported. The pattern is a string with optional '*' wildcards, similar to file globbing. If you'd prefer to use regular expressions, use the \-r option. @@ -38,11 +38,19 @@ Include a timestamp with interval output. Show raw addresses. .TP \-d +Print a delimiter ("--") in-between the kernel and user stacks. +.TP +\-\-debug Print the source of the BPF program when loading it (for debugging purposes). .TP \-i interval Summary interval, in seconds. .TP +\-D duration +Total duration of trace, in seconds. +\-f +Folded output format. +.TP \-p PID Trace this process ID only (filtered in-kernel). .TP @@ -52,47 +60,59 @@ A function name, or a search pattern. Can include wildcards ("*"). If the \-r option is used, can include regular expressions. .SH EXAMPLES .TP -Count kernel stack traces for submit_bio(): +Count kernel and user stack traces for submit_bio(): # .B stackcount submit_bio .TP -Count kernel stack traces for ip_output(): +Count stacks with a delimiter for submit_bio(): +# +.B stackcount \-d submit_bio +.TP +Count kernel stack trace only for submit_bio(): +# +.B stackcount \-K submit_bio +.TP +Count user stack trace only for submit_bio(): +# +.B stackcount \-U submit_bio +.TP +Count stack traces for ip_output(): # .B stackcount ip_output .TP Show symbol offsets: # -.B stackcount -s ip_output +.B stackcount \-s ip_output .TP Show offsets and raw addresses (verbose): # -.B stackcount -sv ip_output +.B stackcount \-sv ip_output .TP -Count kernel stacks for kernel functions matching tcp_send*: +Count stacks for kernel functions matching tcp_send*: # .B stackcount 'tcp_send*' .TP Same as previous, but using regular expressions: # -.B stackcount -r '^tcp_send.*' +.B stackcount \-r '^tcp_send.*' .TP Output every 5 seconds, with timestamps: # -.B stackcount -Ti 5 ip_output +.B stackcount \-Ti 5 ip_output .TP Only count stacks when PID 185 is on-CPU: # -.B stackcount -p 185 ip_output +.B stackcount \-p 185 ip_output .TP Count user stacks for dynamic heap allocations with malloc in PID 185: # -.B stackcount -p 185 c:malloc +.B stackcount \-p 185 c:malloc .TP Count user stacks for thread creation (USDT tracepoint) in PID 185: # -.B stackcount -p 185 u:pthread:pthread_create +.B stackcount \-p 185 u:pthread:pthread_create .TP -Count kernel stacks for context switch events using a kernel tracepoint: +Count stacks for context switch events using a kernel tracepoint: # .B stackcount t:sched:sched_switch .SH OVERHEAD diff --git a/tools/stackcount.py b/tools/stackcount.py index 914e7ee18a94..536deed80c37 100755 --- a/tools/stackcount.py +++ b/tools/stackcount.py @@ -3,8 +3,8 @@ # stackcount Count events and their stack traces. # For Linux, uses BCC, eBPF. # -# USAGE: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s] -# [-P] [-v] pattern +# USAGE: stackcount.py [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s] +# [-P] [-K] [-U] [-v] [-d] [-f] [--debug] # # The pattern is a string with optional '*' wildcards, similar to file # globbing. If you'd prefer to use regular expressions, use the -r option. @@ -27,7 +27,8 @@ debug = False class Probe(object): - def __init__(self, pattern, use_regex=False, pid=None, per_pid=False): + def __init__(self, pattern, kernel_stack, user_stack, use_regex=False, + pid=None, per_pid=False): """Init a new probe. Init the probe from the pattern provided by the user. The supported @@ -41,6 +42,8 @@ def __init__(self, pattern, use_regex=False, pid=None, per_pid=False): t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ + self.kernel_stack = kernel_stack + self.user_stack = user_stack parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] @@ -103,28 +106,51 @@ def attach(self): self.pattern) def load(self): + ctx_name = "ctx" + stack_trace = "" + if self.user_stack: + stack_trace += """ + key.user_stack_id = stack_traces.get_stackid( + %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK + );""" % (ctx_name) + else: + stack_trace += "key.user_stack_id = -1;" + if self.kernel_stack: + stack_trace += """ + key.kernel_stack_id = stack_traces.get_stackid( + %s, BPF_F_REUSE_STACKID + );""" % (ctx_name) + else: + stack_trace += "key.kernel_stack_id = -1;" + trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; - key.pid = GET_PID; - key.stackid = stack_traces.get_stackid(ctx, STACK_FLAGS); + key.tgid = GET_TGID; + STORE_COMM + %s u64 zero = 0; u64 *val = counts.lookup_or_init(&key, &zero); (*val)++; return 0; } """ + trace_count_text = trace_count_text % (stack_trace) + bpf_text = """#include +#include struct key_t { - u32 pid; - int stackid; + // no pid (thread ID) so that we do not needlessly split this key + u32 tgid; + int kernel_stack_id; + int user_stack_id; + char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); - """ # We really mean the tgid from the kernel's perspective, which is in @@ -138,18 +164,18 @@ def load(self): # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if - # per-pid statistics were requested with -P. - if self.per_pid or not self.is_kernel_probe(): - trace_count_text = trace_count_text.replace('GET_PID', + # per-pid statistics were requested with -P, or for user stacks. + if self.per_pid or not self.is_kernel_probe() or self.user_stack: + trace_count_text = trace_count_text.replace('GET_TGID', 'bpf_get_current_pid_tgid() >> 32') + trace_count_text = trace_count_text.replace('STORE_COMM', + 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: + # kernel stacks only. skip splitting on PID so these aggregate + # together, and don't store the process name. trace_count_text = trace_count_text.replace( - 'GET_PID', '0xffffffff') - - stack_flags = 'BPF_F_REUSE_STACKID' - if not self.is_kernel_probe(): - stack_flags += '| BPF_F_USER_STACK' # can't do both U *and* K - trace_count_text = trace_count_text.replace('STACK_FLAGS', stack_flags) + 'GET_TGID', '0xffffffff') + trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None if self.type == "u": @@ -182,6 +208,7 @@ class Tool(object): def __init__(self): examples = """examples: ./stackcount submit_bio # count kernel stack traces for submit_bio + ./stackcount -d ip_output # include a user/kernel stack delimiter ./stackcount -s ip_output # show symbol offsets ./stackcount -sv ip_output # show offsets and raw addresses (verbose) ./stackcount 'tcp_send*' # count stacks for funcs matching tcp_send* @@ -191,6 +218,8 @@ def __init__(self): ./stackcount -p 185 c:malloc # count stacks for malloc in PID 185 ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint ./stackcount -p 185 u:node:* # count stacks for all USDT probes in node + ./stackcount -K t:sched:sched_switch # kernel stacks only + ./stackcount -U t:sched:sched_switch # user stacks only """ parser = argparse.ArgumentParser( description="Count events and their stack traces", @@ -198,8 +227,10 @@ def __init__(self): epilog=examples) parser.add_argument("-p", "--pid", type=int, help="trace this PID only") - parser.add_argument("-i", "--interval", default=99999999, + parser.add_argument("-i", "--interval", help="summary interval, seconds") + parser.add_argument("-D", "--duration", + help="total duration of trace, seconds") parser.add_argument("-T", "--timestamp", action="store_true", help="include timestamp on output") parser.add_argument("-r", "--regexp", action="store_true", @@ -208,19 +239,56 @@ def __init__(self): help="show address offsets") parser.add_argument("-P", "--perpid", action="store_true", help="display stacks separately for each process") + parser.add_argument("-K", "--kernel-stacks-only", + action="store_true", help="kernel stack only", default=False) + parser.add_argument("-U", "--user-stacks-only", + action="store_true", help="user stack only", default=False) parser.add_argument("-v", "--verbose", action="store_true", help="show raw addresses") - parser.add_argument("-d", "--debug", action="store_true", + parser.add_argument("-d", "--delimited", action="store_true", + help="insert delimiter between kernel/user stacks") + parser.add_argument("-f", "--folded", action="store_true", + help="output folded format") + parser.add_argument("--debug", action="store_true", help="print BPF program before starting (for debugging purposes)") parser.add_argument("pattern", help="search expression for events") self.args = parser.parse_args() global debug debug = self.args.debug - self.probe = Probe(self.args.pattern, self.args.regexp, - self.args.pid, self.args.perpid) - def _print_frame(self, addr, pid): + if self.args.duration and not self.args.interval: + self.args.interval = self.args.duration + if not self.args.interval: + self.args.interval = 99999999 + + if self.args.kernel_stacks_only and self.args.user_stacks_only: + print("ERROR: -K and -U are mutually exclusive. If you want " + + "both stacks, that is the default.") + exit() + if not self.args.kernel_stacks_only and not self.args.user_stacks_only: + self.kernel_stack = True + self.user_stack = True + else: + self.kernel_stack = self.args.kernel_stacks_only + self.user_stack = self.args.user_stacks_only + + self.probe = Probe(self.args.pattern, + self.kernel_stack, self.user_stack, + self.args.regexp, self.args.pid, self.args.perpid) + self.need_delimiter = self.args.delimited and not ( + self.args.kernel_stacks_only or self.args.user_stacks_only) + + def _print_kframe(self, addr): + print(" ", end="") + if self.args.verbose: + print("%-16x " % addr, end="") + if self.args.offset: + print("%s" % self.probe.bpf.ksym(addr, show_offset=True)) + else: + print("%s" % self.probe.bpf.ksym(addr)) + + def _print_uframe(self, addr, pid): print(" ", end="") if self.args.verbose: print("%-16x " % addr, end="") @@ -233,34 +301,31 @@ def _print_frame(self, addr, pid): def _signal_ignore(signal, frame): print() - def _comm_for_pid(self, pid): - if pid in self.comm_cache: - return self.comm_cache[pid] - - try: - comm = " %s [%d]" % ( - open("/proc/%d/comm" % pid).read().strip(), - pid) - self.comm_cache[pid] = comm - return comm - except: - return " unknown process [%d]" % pid + def _print_comm(self, comm, pid): + print(" %s [%d]" % (comm, pid)) def run(self): self.probe.load() self.probe.attach() - print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." % - (self.probe.matched, self.args.pattern)) + if not self.args.folded: + print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." % + (self.probe.matched, self.args.pattern)) + b = self.probe.bpf exiting = 0 if self.args.interval else 1 + seconds = 0 while True: try: sleep(int(self.args.interval)) + seconds += int(self.args.interval) except KeyboardInterrupt: exiting = 1 # as cleanup can take many seconds, trap Ctrl-C: signal.signal(signal.SIGINT, Tool._signal_ignore) + if self.args.duration and seconds >= int(self.args.duration): + exiting = 1 - print() + if not self.args.folded: + print() if self.args.timestamp: print("%-8s\n" % strftime("%H:%M:%S"), end="") @@ -269,16 +334,37 @@ def run(self): self.comm_cache = {} for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): - for addr in stack_traces.walk(k.stackid): - pid = -1 if self.probe.is_kernel_probe() else k.pid - self._print_frame(addr, pid) - if not self.args.pid and k.pid != 0xffffffff: - print(self._comm_for_pid(k.pid)) - print(" %d\n" % v.value) + user_stack = [] if k.user_stack_id < 0 else \ + stack_traces.walk(k.user_stack_id) + kernel_stack = [] if k.kernel_stack_id < 0 else \ + stack_traces.walk(k.kernel_stack_id) + + if self.args.folded: + # print folded stack output + user_stack = list(user_stack) + kernel_stack = list(kernel_stack) + line = [k.name.decode()] + \ + [b.sym(addr, k.tgid) for addr in + reversed(user_stack)] + \ + (self.need_delimiter and ["-"] or []) + \ + [b.ksym(addr) for addr in reversed(kernel_stack)] + print("%s %d" % (";".join(line), v.value)) + else: + # print multi-line stack output + for addr in kernel_stack: + self._print_kframe(addr) + if self.need_delimiter: + print(" --") + for addr in user_stack: + self._print_uframe(addr, k.tgid) + if not self.args.pid and k.tgid != 0xffffffff: + self._print_comm(k.name, k.tgid) + print(" %d\n" % v.value) counts.clear() if exiting: - print("Detaching...") + if not self.args.folded: + print("Detaching...") exit() if __name__ == "__main__": diff --git a/tools/stackcount_example.txt b/tools/stackcount_example.txt index d2d6448500fc..92a77a89c3de 100644 --- a/tools/stackcount_example.txt +++ b/tools/stackcount_example.txt @@ -3,55 +3,118 @@ Demonstrations of stackcount, the Linux eBPF/bcc version. This program traces functions and frequency counts them with their entire stack trace, summarized in-kernel for efficiency. For example, counting -stack traces that led to submit_bio(), which creates block device I/O: +stack traces that led to the submit_bio() kernel function, which creates +block device I/O: # ./stackcount submit_bio Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. +^C + submit_bio + ext4_writepages + do_writepages + __filemap_fdatawrite_range + filemap_flush + ext4_alloc_da_blocks + ext4_release_file + __fput + ____fput + task_work_run + exit_to_usermode_loop + syscall_return_slowpath + entry_SYSCALL_64_fastpath + [unknown] + [unknown] + tar [15069] + 5 + + submit_bio + ext4_bio_write_page + mpage_submit_page + mpage_map_and_submit_buffers + ext4_writepages + do_writepages + __filemap_fdatawrite_range + filemap_flush + ext4_alloc_da_blocks + ext4_release_file + __fput + ____fput + task_work_run + exit_to_usermode_loop + syscall_return_slowpath + entry_SYSCALL_64_fastpath + [unknown] + [unknown] + tar [15069] + 15 + + submit_bio + ext4_readpages + __do_page_cache_readahead + ondemand_readahead + page_cache_async_readahead + generic_file_read_iter + __vfs_read + vfs_read + sys_read + entry_SYSCALL_64_fastpath + [unknown] + tar [15069] + 113 + +Detaching... + +The output shows unique stack traces, in order from leaf (on-CPU) to root, +followed by their occurrence count. The last stack trace in the above output +shows syscall handling, sys_read(), vfs_read(), and then "readahead" functions: +looks like an application issued file read has triggered read ahead. The +application can be seen after the stack trace, in this case, "tar [15069]" +for the "tar" command, PID 15069. + +The order of printed stack traces is from least to most frequent. The most +frequent in this case, the ext4_rename() stack, was taken 113 times during +tracing. + +The "[unknown]" frames are from user-level, since this simple workload is +the tar command, which apparently has been compiled without frame pointers. +It's a common compiler optimization, but it breaks frame pointer-based stack +walkers. Similar broken stacks will be seen by other profilers and debuggers +that use frame pointers. Hopefully your application preserves them so that +the user-level stack trace is visible. So how does one get frame pointers, if +your application doesn't have them to start with? For the current bcc (until +it supports other stack walkers), you need to be running a application binaries +that preserves frame pointers, eg, using gcc's -fno-omit-frame-pointer. That's +about all I'll say here: this is a big topic that is not bcc/BPF specific. + +It can be useful to trace the path to submit_bio to explain unusual rates of +disk IOPS. These could have in-kernel origins (eg, background scrub). + + +Now adding the -d option to delimit kernel and user stacks: + +# ./stackcount -d submit_bio +Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. ^C submit_bio submit_bh - journal_submit_commit_record.isra.13 + journal_submit_commit_record jbd2_journal_commit_transaction kjournald2 kthread ret_from_fork - mb_cache_list + -- + jbd2/xvda1-8 [405] 1 - submit_bio - __block_write_full_page.constprop.39 - block_write_full_page - blkdev_writepage - __writepage - write_cache_pages - generic_writepages - do_writepages - __writeback_single_inode - writeback_sb_inodes - __writeback_inodes_wb - 2 - - submit_bio - __block_write_full_page.constprop.39 - block_write_full_page - blkdev_writepage - __writepage - write_cache_pages - generic_writepages - do_writepages - __filemap_fdatawrite_range - filemap_fdatawrite - fdatawrite_one_bdev - 36 - submit_bio submit_bh jbd2_journal_commit_transaction kjournald2 kthread ret_from_fork - mb_cache_list - 38 + -- + jbd2/xvda1-8 [405] + 2 submit_bio ext4_writepages @@ -59,138 +122,220 @@ Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. __filemap_fdatawrite_range filemap_flush ext4_alloc_da_blocks - ext4_rename - ext4_rename2 - vfs_rename - sys_rename + ext4_release_file + __fput + ____fput + task_work_run + exit_to_usermode_loop + syscall_return_slowpath entry_SYSCALL_64_fastpath - 79 + -- + [unknown] + [unknown] + tar [15187] + 5 -Detaching... + submit_bio + ext4_bio_write_page + mpage_submit_page + mpage_map_and_submit_buffers + ext4_writepages + do_writepages + __filemap_fdatawrite_range + filemap_flush + ext4_alloc_da_blocks + ext4_release_file + __fput + ____fput + task_work_run + exit_to_usermode_loop + syscall_return_slowpath + entry_SYSCALL_64_fastpath + -- + [unknown] + [unknown] + tar [15187] + 15 -The output shows unique stack traces, in order from leaf (on-CPU) to root, -followed by their occurrence count. The last stack trace in the above output -shows syscall handling, ext4_rename(), and filemap_flush(): looks like an -application issued file rename has caused back end disk I/O due to ext4 -block allocation and a filemap_flush(). I'd have to browse the code to those -functions to confirm! + submit_bio + ext4_readpages + __do_page_cache_readahead + ondemand_readahead + page_cache_async_readahead + generic_file_read_iter + __vfs_read + vfs_read + sys_read + entry_SYSCALL_64_fastpath + -- + [unknown] + [unknown] + [unknown] + tar [15187] + 171 -The order of printed stack traces is from least to most frequent. The most -frequent in this case, the ext4_rename() stack, was taken 79 times during -tracing. +Detaching... -It can be useful to trace the path to submit_bio to explain unusual rates of -disk IOPS. These could have in-kernel origins (eg, background scrub). +A "--" is printed between the kernel and user stacks. -As another example, here are the code paths that led to ip_output(), which -sends a packet at the IP level: +As a different example, here is the kernel function hrtimer_init_sleeper(): -# ./stackcount ip_output -Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. +# ./stackcount.py -d hrtimer_init_sleeper +Tracing 1 functions for "hrtimer_init_sleeper"... Hit Ctrl-C to end. ^C - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_write_xmit - tcp_tsq_handler.part.32 - tcp_tasklet_func - tasklet_action - __do_softirq - do_softirq_own_stack - do_softirq - __local_bh_enable_ip + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + [unknown] + containerd [16020] 1 - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_send_ack - tcp_send_delayed_ack - __tcp_ack_snd_check - tcp_rcv_established - tcp_v4_do_rcv - tcp_v4_rcv - ip_local_deliver_finish - ip_local_deliver + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + __pthread_cond_timedwait + Monitor::IWait(Thread*, long) + Monitor::wait(bool, long, bool) + CompileQueue::get() + CompileBroker::compiler_thread_loop() + JavaThread::thread_main_inner() + JavaThread::run() + java_start(Thread*) + start_thread + java [4996] 1 - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_send_ack - tcp_send_delayed_ack - __tcp_ack_snd_check - tcp_rcv_established - tcp_v4_do_rcv - tcp_v4_rcv - ip_local_deliver_finish - ip_local_deliver + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + [unknown] + [unknown] + containerd [16020] 1 - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_send_ack - tcp_delack_timer_handler - tcp_delack_timer - call_timer_fn - run_timer_softirq - __do_softirq - irq_exit - xen_evtchn_do_upcall - 1 + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + __pthread_cond_timedwait + VMThread::loop() + VMThread::run() + java_start(Thread*) + start_thread + java [4996] + 3 + + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + [unknown] + dockerd [16008] + 4 + + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + [unknown] + [unknown] + dockerd [16008] + 4 + + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + __pthread_cond_timedwait + Lio/netty/util/ThreadDeathWatcher$Watcher;::run + Interpreter + Interpreter + call_stub + JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*) + JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*) + JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*) + thread_entry(JavaThread*, Thread*) + JavaThread::thread_main_inner() + JavaThread::run() + java_start(Thread*) + start_thread + java [4996] + 4 + + hrtimer_init_sleeper + do_futex + SyS_futex + entry_SYSCALL_64_fastpath + -- + __pthread_cond_timedwait + clock_gettime + [unknown] + java [4996] + 79 + +Detaching... + +I was just trying to find a more interesting example. This output includes +some Java stacks where user-level has been walked correctly (even includes a +JIT symbol translation). dockerd and containerd don't have frame pointers +(grumble), but Java does (which is running with -XX:+PreserveFramePointer). + +Here's another kernel function, ip_output(): + +# ./stackcount.py -d ip_output +Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. +^C ip_output ip_queue_xmit tcp_transmit_skb tcp_write_xmit __tcp_push_pending_frames - tcp_rcv_established - tcp_v4_do_rcv - release_sock + tcp_push tcp_sendmsg inet_sendmsg sock_sendmsg - 3 - - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_write_xmit - tcp_tsq_handler.part.32 - tcp_tasklet_func - tasklet_action - __do_softirq - run_ksoftirqd - smpboot_thread_fn - kthread - 3 + sock_write_iter + __vfs_write + vfs_write + SyS_write + entry_SYSCALL_64_fastpath + -- + __write_nocancel + [unknown] + sshd [15015] + 5 ip_output ip_queue_xmit tcp_transmit_skb tcp_write_xmit __tcp_push_pending_frames - tcp_rcv_established - tcp_v4_do_rcv - tcp_v4_rcv - ip_local_deliver_finish - ip_local_deliver - ip_rcv_finish - 4 - - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_send_ack - tcp_cleanup_rbuf - tcp_recvmsg - inet_recvmsg - sock_recvmsg - sock_read_iter - __vfs_read - vfs_read + tcp_push + tcp_sendmsg + inet_sendmsg + sock_sendmsg + sock_write_iter + __vfs_write + vfs_write + SyS_write + entry_SYSCALL_64_fastpath + -- + __write_nocancel + [unknown] + [unknown] + sshd [8234] 5 ip_output @@ -198,14 +343,30 @@ Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. tcp_transmit_skb tcp_write_xmit __tcp_push_pending_frames - tcp_rcv_established - tcp_v4_do_rcv - release_sock + tcp_push tcp_sendmsg inet_sendmsg sock_sendmsg - 9 + sock_write_iter + __vfs_write + vfs_write + SyS_write + entry_SYSCALL_64_fastpath + -- + __write_nocancel + sshd [15015] + 7 + +Detaching... + +This time just sshd is triggering ip_output() calls. + +Watch what happens if I filter on kernel stacks only (-K) for ip_output(): + +# ./stackcount.py -K ip_output +Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. +^C ip_output ip_queue_xmit tcp_transmit_skb @@ -217,61 +378,45 @@ Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. sock_sendmsg sock_write_iter __vfs_write - 51 + vfs_write + SyS_write + entry_SYSCALL_64_fastpath + 13 - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_write_xmit - __tcp_push_pending_frames - tcp_rcv_established - tcp_v4_do_rcv - tcp_v4_rcv - ip_local_deliver_finish - ip_local_deliver - ip_rcv_finish - 171 +Detaching... - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_write_xmit - __tcp_push_pending_frames - tcp_rcv_established - tcp_v4_do_rcv - tcp_v4_rcv - ip_local_deliver_finish - ip_local_deliver - ip_rcv_finish - 994 +They have grouped together as a single unique stack, since the kernel part +was the same. - ip_output - ip_queue_xmit - tcp_transmit_skb - tcp_write_xmit - tcp_tsq_handler.part.32 - tcp_tasklet_func - tasklet_action - __do_softirq - irq_exit - xen_evtchn_do_upcall - xen_do_hypervisor_callback - 1002 -Detaching... +Here is just the user stacks, fetched during the kernel function ip_output(): -The last two most frequent stack traces are via tcp_transmit_skb(). Note the -send last begins with ip_rcv_finish(), for a local receive, which then becomes -a transmit: likely pushing more frames when processing the newly received ones. +# ./stackcount.py -U ip_output +Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. +^C + [unknown] + snmpd [1645] + 1 + + __write_nocancel + [unknown] + [unknown] + sshd [8234] + 3 + + __write_nocancel + sshd [15015] + 4 -As may be obvious, this is a great tool for quickly understanding kernel code -flow. +I should really run a custom sshd with frame pointers so we can see its +stack trace... User-space functions can also be traced if a library name is provided. For -example, to quickly identify code locations that allocate heap memory: +example, to quickly identify code locations that allocate heap memory for +PID 4902 (using -p), by tracing malloc from libc ("c:malloc"): -# ./stackcount -l c -p 4902 malloc +# ./stackcount -p 4902 c:malloc Tracing 1 functions for "malloc"... Hit Ctrl-C to end. ^C malloc @@ -290,10 +435,12 @@ Tracing 1 functions for "malloc"... Hit Ctrl-C to end. Detaching... +Kernel stacks are absent as this didn't enter kernel code. + Note that user-space uses of stackcount can be somewhat more limited because -a lot of user-space libraries and binaries are compiled without debuginfo, or -with frame-pointer omission (-fomit-frame-pointer), which makes it impossible -to reliably obtain the stack trace. +a lot of user-space libraries and binaries are compiled without frame-pointers +as discussed earlier (the -fomit-frame-pointer compiler default) or are used +without debuginfo. In addition to kernel and user-space functions, kernel tracepoints and USDT @@ -313,132 +460,275 @@ Tracing 1 functions for "u:pthread:pthread_create"... Hit Ctrl-C to end. [unknown] 7 +You can use "readelf -n file" to see if it has USDT tracepoints. + + Similarly, to determine where context switching is happening in the kernel, use the sched:sched_switch kernel tracepoint: # ./stackcount t:sched:sched_switch -... (omitted for brevity) + __schedule + schedule + worker_thread + kthread + ret_from_fork + kworker/0:2 [25482] + 1 __schedule schedule schedule_hrtimeout_range_clock schedule_hrtimeout_range - poll_schedule_timeout - do_select - core_sys_select - SyS_select + ep_poll + SyS_epoll_wait entry_SYSCALL_64_fastpath - 40 + epoll_wait + Lsun/nio/ch/SelectorImpl;::lockAndDoSelect + Lsun/nio/ch/SelectorImpl;::select + Lio/netty/channel/nio/NioEventLoop;::select + Lio/netty/channel/nio/NioEventLoop;::run + Interpreter + Interpreter + call_stub + JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*) + JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*) + JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*) + thread_entry(JavaThread*, Thread*) + JavaThread::thread_main_inner() + JavaThread::run() + java_start(Thread*) + start_thread + java [4996] + 1 + +... (omitted for brevity) __schedule schedule schedule_preempt_disabled cpu_startup_entry - start_secondary - 85 + xen_play_dead + arch_cpu_idle_dead + cpu_startup_entry + cpu_bringup_and_idle + swapper/1 [0] + 289 A -i option can be used to set an output interval, and -T to include a timestamp. For example: -# ./stackcount -Ti 1 submit_bio +# ./stackcount.py -Tdi 1 submit_bio Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. -01:11:37 +06:05:13 + +06:05:14 submit_bio - submit_bh - journal_submit_commit_record.isra.13 - jbd2_journal_commit_transaction - kjournald2 + xfs_do_writepage + write_cache_pages + xfs_vm_writepages + do_writepages + __writeback_single_inode + writeback_sb_inodes + __writeback_inodes_wb + wb_writeback + wb_workfn + process_one_work + worker_thread kthread ret_from_fork - mb_cache_list + -- + kworker/u16:1 [15686] 1 submit_bio - ext4_writepages - do_writepages - __filemap_fdatawrite_range - filemap_flush - ext4_alloc_da_blocks - ext4_rename - ext4_rename2 - vfs_rename - sys_rename + process_one_work + worker_thread + kthread + ret_from_fork + -- + kworker/u16:0 [16007] + 1 + + submit_bio + xfs_buf_submit + xlog_bdstrat + xlog_sync + xlog_state_release_iclog + _xfs_log_force + xfs_log_force + xfs_fs_sync_fs + sync_fs_one_sb + iterate_supers + sys_sync entry_SYSCALL_64_fastpath - 20 + -- + [unknown] + sync [16039] + 1 submit_bio submit_bh + journal_submit_commit_record jbd2_journal_commit_transaction kjournald2 kthread ret_from_fork - mb_cache_list - 39 + -- + jbd2/xvda1-8 [405] + 1 + submit_bio + process_one_work + worker_thread + kthread + ret_from_fork + -- + kworker/0:2 [25482] + 2 -01:11:38 submit_bio ext4_writepages do_writepages - __filemap_fdatawrite_range - filemap_flush - ext4_alloc_da_blocks - ext4_rename - ext4_rename2 - vfs_rename - sys_rename - entry_SYSCALL_64_fastpath - 20 + __writeback_single_inode + writeback_sb_inodes + __writeback_inodes_wb + wb_writeback + wb_workfn + process_one_work + worker_thread + kthread + ret_from_fork + -- + kworker/u16:0 [16007] + 4 + submit_bio + xfs_vm_writepages + do_writepages + __writeback_single_inode + writeback_sb_inodes + __writeback_inodes_wb + wb_writeback + wb_workfn + process_one_work + worker_thread + kthread + ret_from_fork + -- + kworker/u16:1 [15686] + 5 -01:11:39 submit_bio - ext4_writepages + __block_write_full_page + block_write_full_page + blkdev_writepage + __writepage + write_cache_pages + generic_writepages + blkdev_writepages do_writepages __filemap_fdatawrite_range - filemap_flush - ext4_alloc_da_blocks - ext4_rename - ext4_rename2 - vfs_rename - sys_rename + filemap_fdatawrite + fdatawrite_one_bdev + iterate_bdevs + sys_sync entry_SYSCALL_64_fastpath - 20 + -- + [unknown] + sync [16039] + 7 -^C -01:11:39 submit_bio + submit_bh + jbd2_journal_commit_transaction + kjournald2 + kthread + ret_from_fork + -- + jbd2/xvda1-8 [405] + 8 + + submit_bio + ext4_bio_write_page + mpage_submit_page + mpage_map_and_submit_buffers ext4_writepages do_writepages - __filemap_fdatawrite_range - filemap_flush - ext4_alloc_da_blocks - ext4_rename - ext4_rename2 - vfs_rename - sys_rename - entry_SYSCALL_64_fastpath - 20 + __writeback_single_inode + writeback_sb_inodes + __writeback_inodes_wb + wb_writeback + wb_workfn + process_one_work + worker_thread + kthread + ret_from_fork + -- + kworker/u16:0 [16007] + 8 + + submit_bio + __block_write_full_page + block_write_full_page + blkdev_writepage + __writepage + write_cache_pages + generic_writepages + blkdev_writepages + do_writepages + __writeback_single_inode + writeback_sb_inodes + __writeback_inodes_wb + wb_writeback + wb_workfn + process_one_work + worker_thread + kthread + ret_from_fork + -- + kworker/u16:0 [16007] + 60 + + +06:05:15 + +06:05:16 Detaching... +This only included output for the 06:05:14 interval. The other internals +did not span block device I/O. + The -s output prints the return instruction offset for each function (aka symbol offset). Eg: -# ./stackcount -s tcp_sendmsg +# ./stackcount.py -s tcp_sendmsg Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end. ^C - tcp_sendmsg0x1 - sock_sendmsg0x38 - sock_write_iter0x78 - __vfs_write0xaa - vfs_write0xa9 - sys_write0x46 - entry_SYSCALL_64_fastpath0x16 - 29 + tcp_sendmsg+0x1 + sock_sendmsg+0x38 + sock_write_iter+0x85 + __vfs_write+0xe3 + vfs_write+0xb8 + SyS_write+0x55 + entry_SYSCALL_64_fastpath+0x1e + __write_nocancel+0x7 + sshd [15015] + 3 + + tcp_sendmsg+0x1 + sock_sendmsg+0x38 + sock_write_iter+0x85 + __vfs_write+0xe3 + vfs_write+0xb8 + SyS_write+0x55 + entry_SYSCALL_64_fastpath+0x1e + __write_nocancel+0x7 + sshd [8234] + 3 Detaching... @@ -446,9 +736,40 @@ If it wasn't clear how one function called another, knowing the instruction offset can help you locate the lines of code from a disassembly dump. -A wildcard can also be used. Eg, all functions beginning with "tcp_send": +The -v output is verbose, and shows raw addresses: -# ./stackcount -s 'tcp_send*' +./stackcount.py -v tcp_sendmsg +Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end. +^C + ffffffff817b05c1 tcp_sendmsg + ffffffff8173ea48 sock_sendmsg + ffffffff8173eae5 sock_write_iter + ffffffff81232b33 __vfs_write + ffffffff812331b8 vfs_write + ffffffff81234625 SyS_write + ffffffff818739bb entry_SYSCALL_64_fastpath + 7f120511e6e0 __write_nocancel + sshd [8234] + 3 + + ffffffff817b05c1 tcp_sendmsg + ffffffff8173ea48 sock_sendmsg + ffffffff8173eae5 sock_write_iter + ffffffff81232b33 __vfs_write + ffffffff812331b8 vfs_write + ffffffff81234625 SyS_write + ffffffff818739bb entry_SYSCALL_64_fastpath + 7f919c5a26e0 __write_nocancel + sshd [15015] + 11 + +Detaching... + + +A wildcard can also be used. Eg, all functions beginning with "tcp_send", +kernel stacks only (-K) with offsets (-s): + +# ./stackcount -Ks 'tcp_send*' Tracing 14 functions for "tcp_send*"... Hit Ctrl-C to end. ^C tcp_send_delayed_ack0x1 @@ -501,39 +822,69 @@ Detaching... Use -r to allow regular expressions. +The -f option will emit folded output, which can be used as input to other +tools including flame graphs. For example, with delimiters as well: + +# ./stackcount.py -df t:sched:sched_switch +^Csnmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1 +kworker/7:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1 +watchdog/0;-;ret_from_fork;kthread;smpboot_thread_fn;schedule;__schedule 1 +snmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1 +svscan;[unknown];-;entry_SYSCALL_64_fastpath;SyS_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule;__schedule 1 +python;[unknown];__select_nocancel;-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1 +kworker/2:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1 +[...] + +Flame graphs visualize stack traces. For information about them and links +to open source software, see http://www.brendangregg.com/flamegraphs.html . +This folded output can be piped directly into flamegraph.pl (the Perl version). + + USAGE message: # ./stackcount -h -usage: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s] - [-l LIBRARY] [-v] [-d] pattern +usage: stackcount [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s] + [-P] [-K] [-U] [-v] [-d] [-f] [--debug] + pattern -Count function calls and their stack traces +Count events and their stack traces positional arguments: - pattern search expression for functions + pattern search expression for events optional arguments: -h, --help show this help message and exit -p PID, --pid PID trace this PID only -i INTERVAL, --interval INTERVAL summary interval, seconds + -D DURATION, --duration DURATION + total duration of trace, seconds -T, --timestamp include timestamp on output -r, --regexp use regular expressions. Default is "*" wildcards only. -s, --offset show address offsets - -l, --library trace user-space functions from this library or executable + -P, --perpid display stacks separately for each process + -K, --kernel-stacks-only + kernel stack only + -U, --user-stacks-only + user stack only -v, --verbose show raw addresses - -d, --debug print BPF program before starting (for debugging purposes) + -d, --delimited insert delimiter between kernel/user stacks + -f, --folded output folded format + --debug print BPF program before starting (for debugging + purposes) examples: - ./stackcount submit_bio # count kernel stack traces for submit_bio - ./stackcount ip_output # count kernel stack traces for ip_output - ./stackcount -s ip_output # show symbol offsets - ./stackcount -sv ip_output # show offsets and raw addresses (verbose) - ./stackcount 'tcp_send*' # count stacks for funcs matching tcp_send* - ./stackcount -r '^tcp_send.*' # same as above, using regular expressions - ./stackcount -Ti 5 ip_output # output every 5 seconds, with timestamps - ./stackcount -p 185 ip_output # count ip_output stacks for PID 185 only - ./stackcount -p 185 -l c malloc # count stacks for malloc in PID 185 - ./stackcount t:sched:sched_fork # count stacks for the sched_fork tracepoint - ./stackcount -p 185 u:node:* # count stacks for all USDT probes in node + ./stackcount submit_bio # count kernel stack traces for submit_bio + ./stackcount -d ip_output # include a user/kernel stack delimiter + ./stackcount -s ip_output # show symbol offsets + ./stackcount -sv ip_output # show offsets and raw addresses (verbose) + ./stackcount 'tcp_send*' # count stacks for funcs matching tcp_send* + ./stackcount -r '^tcp_send.*' # same as above, using regular expressions + ./stackcount -Ti 5 ip_output # output every 5 seconds, with timestamps + ./stackcount -p 185 ip_output # count ip_output stacks for PID 185 only + ./stackcount -p 185 c:malloc # count stacks for malloc in PID 185 + ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint + ./stackcount -p 185 u:node:* # count stacks for all USDT probes in node + ./stackcount -K t:sched:sched_switch # kernel stacks only + ./stackcount -U t:sched:sched_switch # user stacks only