Skip to content

Commit

Permalink
tools/trace.py: aggregate trace events by '-A' option
Browse files Browse the repository at this point in the history
Orignally we can save the backtrace of a specified function into a
text file, then try to analyze the messy trace to guess why the
function is called.

To make the work easier, aggregate amount of trace by -A with -M EVENTS.
A typical example:
1, if we find that the sys CPU utilization is higher by 'top' command
2, then find that the timer interrupt is more normal by 'irqtop' command
3, to confirm kernel timer setting frequence by 'funccount -i 1 clockevents_program_event'
4, to trace timer setting by 'trace clockevents_program_event -K -A -M 1000'

1294576 1294584 CPU 0/KVM       clockevents_program_event
        clockevents_program_event+0x1 [kernel]
        hrtimer_start_range_ns+0x209 [kernel]
        start_sw_timer+0x173 [kvm]
        restart_apic_timer+0x6c [kvm]
        kvm_set_msr_common+0x442 [kvm]
        __kvm_set_msr+0xa2 [kvm]
        kvm_emulate_wrmsr+0x36 [kvm]
        vcpu_enter_guest+0x326 [kvm]
        kvm_arch_vcpu_ioctl_run+0xcc [kvm]
        kvm_vcpu_ioctl+0x22f [kvm]
        do_vfs_ioctl+0xa1 [kernel]
        ksys_ioctl+0x60 [kernel]
        __x64_sys_ioctl+0x16 [kernel]
        do_syscall_64+0x59 [kernel]
        entry_SYSCALL_64_after_hwframe+0x44 [kernel]
-->COUNT 271

...
Then we can know that 271 timer setting in recent 1000(~27%) occurs in
KVM code path.

To avoid too large size of a dictionary in python, -A works always with
-M.

Also fix several minor changes to keep 'trace -h' aligned with trace_example.txt.

Signed-off-by: zhenwei pi <[email protected]>
  • Loading branch information
pizhenwei authored and yonghong-song committed Jan 14, 2022
1 parent aa7200b commit 047541c
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 21 deletions.
5 changes: 4 additions & 1 deletion man/man8/trace.8
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
trace \- Trace a function and print its arguments or return value, optionally evaluating a filter. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [--uid UID] [-v] [-Z STRING_SIZE] [-S] [-s SYM_FILE_LIST]
[-M MAX_EVENTS] [-t] [-u] [-T] [-C] [-K] [-U] [-a] [-I header]
[-M MAX_EVENTS] [-t] [-u] [-T] [-C] [-K] [-U] [-a] [-I header] [-A]
probe [probe ...]
.SH DESCRIPTION
trace probes functions you specify and displays trace messages if a particular
Expand Down Expand Up @@ -83,6 +83,9 @@ Additional header files to include in the BPF program. This is needed if your
filter or print expressions use types or data structures that are not available
in the standard headers. For example: 'linux/mm.h'
.TP
\-A
Print aggregated amount of each trace. This should be used with -M/--max-events together.
.TP
probe [probe ...]
One or more probes that attach to functions, filter conditions, and print
information. See PROBE SYNTAX below.
Expand Down
64 changes: 47 additions & 17 deletions tools/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#
# usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] [-c cgroup_path]
# [-M MAX_EVENTS] [-s SYMBOLFILES] [-T] [-t] [-K] [-U] [-a] [-I header]
# [-A]
# probe [probe ...]
#
# Licensed under the Apache License, Version 2.0 (the "License")
Expand Down Expand Up @@ -40,6 +41,8 @@ class Probe(object):
uid = -1
page_cnt = None
build_id_enabled = False
aggregate = False
symcount = {}

@classmethod
def configure(cls, args):
Expand All @@ -58,6 +61,10 @@ def configure(cls, args):
cls.page_cnt = args.buffer_pages
cls.bin_cmp = args.bin_cmp
cls.build_id_enabled = args.sym_file_list is not None
cls.aggregate = args.aggregate
if cls.aggregate and cls.max_events is None:
raise ValueError("-M/--max-events should be specified"
" with -A/--aggregate")

def __init__(self, probe, string_size, kernel_stack, user_stack,
cgroup_map_name, name, msg_filter):
Expand Down Expand Up @@ -584,18 +591,20 @@ def _display_function(self):
else: # self.probe_type == 't'
return self.tp_event

def print_stack(self, bpf, stack_id, tgid):
def _stack_to_string(self, bpf, stack_id, tgid):
if stack_id < 0:
print(" %d" % stack_id)
return
return (" %d" % stack_id)

stackstr = ''
stack = list(bpf.get_table(self.stacks_name).walk(stack_id))
for addr in stack:
print(" ", end="")
stackstr += ' '
if Probe.print_address:
print("%16x " % addr, end="")
print("%s" % (bpf.sym(addr, tgid,
show_module=True, show_offset=True)))
stackstr += ("%16x " % addr)
symstr = bpf.sym(addr, tgid, show_module=True, show_offset=True)
stackstr += ('%s\n' % (symstr.decode('utf-8')))

return stackstr

def _format_message(self, bpf, tgid, values):
# Replace each %K with kernel sym and %U with user sym in tgid
Expand All @@ -610,6 +619,11 @@ def _format_message(self, bpf, tgid, values):
show_module=True, show_offset=True)
return self.python_format % tuple(values)

def print_aggregate_events(self):
for k, v in sorted(self.symcount.items(), key=lambda item: \
item[1], reverse=True):
print("%s-->COUNT %d\n\n" % (k, v), end="")

def print_event(self, bpf, cpu, data, size):
# Cast as the generated structure type and display
# according to the format string in the probe.
Expand All @@ -621,32 +635,43 @@ def print_event(self, bpf, cpu, data, size):
msg = self._format_message(bpf, event.tgid, values)
if self.msg_filter and self.msg_filter not in msg:
return
eventstr = ''
if Probe.print_time:
time = strftime("%H:%M:%S") if Probe.use_localtime else \
Probe._time_off_str(event.timestamp_ns)
if Probe.print_unix_timestamp:
print("%-17s " % time[:17], end="")
eventstr += ("%-17s " % time[:17])
else:
print("%-8s " % time[:8], end="")
eventstr += ("%-8s " % time[:8])
if Probe.print_cpu:
print("%-3s " % event.cpu, end="")
print("%-7d %-7d %-15s %-16s %s" %
eventstr += ("%-3s " % event.cpu)
eventstr += ("%-7d %-7d %-15s %-16s %s\n" %
(event.tgid, event.pid,
event.comm.decode('utf-8', 'replace'),
self._display_function(), msg))

if self.kernel_stack:
self.print_stack(bpf, event.kernel_stack_id, -1)
eventstr += self._stack_to_string(bpf, event.kernel_stack_id, -1)
if self.user_stack:
self.print_stack(bpf, event.user_stack_id, event.tgid)
if self.user_stack or self.kernel_stack:
eventstr += self._stack_to_string(bpf, event.user_stack_id, event.tgid)

if self.aggregate is False:
print(eventstr, end="")
if self.kernel_stack or self.user_stack:
print("")
else:
if eventstr in self.symcount:
self.symcount[eventstr] += 1
else:
self.symcount[eventstr] = 1

Probe.event_count += 1
if Probe.max_events is not None and \
Probe.event_count >= Probe.max_events:
exit()
sys.stdout.flush()
if self.aggregate:
self.print_aggregate_events()
sys.stdout.flush()
exit()

def attach(self, bpf, verbose):
if len(self.library) == 0:
Expand Down Expand Up @@ -700,7 +725,7 @@ class Tool(object):
trace kfree_skb+0x12
Trace the kfree_skb kernel function after the instruction on the 0x12 offset
trace 'do_sys_open "%s", arg2@user'
Trace the open syscall and print the filename. being opened @user is
Trace the open syscall and print the filename being opened @user is
added to arg2 in kprobes to ensure that char * should be copied from
the userspace stack to the bpf stack. If not specified, previous
behaviour is expected.
Expand Down Expand Up @@ -752,6 +777,9 @@ class Tool(object):
to 53 (DNS; 13568 in big endian order)
trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users'
Trace the number of users accessing the file system of the current task
trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U
Trace inet_pton system call and use the specified libraries/executables for
symbol resolution.
"""

def __init__(self):
Expand Down Expand Up @@ -815,6 +843,8 @@ def __init__(self):
"as either full path, "
"or relative to current working directory, "
"or relative to default kernel header search path")
parser.add_argument("-A", "--aggregate", action="store_true",
help="aggregate amount of each trace")
parser.add_argument("--ebpf", action="store_true",
help=argparse.SUPPRESS)
self.args = parser.parse_args()
Expand Down
35 changes: 32 additions & 3 deletions tools/trace_example.txt
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,32 @@ Remember to use the -I argument include the appropriate header file. We didn't
need to do that here because `struct timespec` is used internally by the tool,
so it always includes this header file.

To aggregate amount of trace, you need specify -A with -M EVENTS. A typical
example:
1, if we find that the sys CPU utilization is higher by 'top' command
2, then find that the timer interrupt is more normal by 'irqtop' command
3, to confirm kernel timer setting frequence by 'funccount -i 1 clockevents_program_event'
4, to trace timer setting by 'trace clockevents_program_event -K -A -M 1000'

1294576 1294584 CPU 0/KVM clockevents_program_event
clockevents_program_event+0x1 [kernel]
hrtimer_start_range_ns+0x209 [kernel]
start_sw_timer+0x173 [kvm]
restart_apic_timer+0x6c [kvm]
kvm_set_msr_common+0x442 [kvm]
__kvm_set_msr+0xa2 [kvm]
kvm_emulate_wrmsr+0x36 [kvm]
vcpu_enter_guest+0x326 [kvm]
kvm_arch_vcpu_ioctl_run+0xcc [kvm]
kvm_vcpu_ioctl+0x22f [kvm]
do_vfs_ioctl+0xa1 [kernel]
ksys_ioctl+0x60 [kernel]
__x64_sys_ioctl+0x16 [kernel]
do_syscall_64+0x59 [kernel]
entry_SYSCALL_64_after_hwframe+0x44 [kernel]
-->COUNT 271
...
So we can know that 271 timer setting in recent 1000(~27%).

As a final example, let's trace open syscalls for a specific process. By
default, tracing is system-wide, but the -p switch overrides this:
Expand Down Expand Up @@ -384,6 +410,7 @@ optional arguments:
as either full path, or relative to current working
directory, or relative to default kernel header search
path
-A, --aggregate aggregate amount of each trace

EXAMPLES:

Expand All @@ -392,10 +419,11 @@ trace do_sys_open
trace kfree_skb+0x12
Trace the kfree_skb kernel function after the instruction on the 0x12 offset
trace 'do_sys_open "%s", arg2@user'
Trace the open syscall and print the filename being opened. @user is
Trace the open syscall and print the filename being opened @user is
added to arg2 in kprobes to ensure that char * should be copied from
the userspace stack to the bpf stack. If not specified, previous
behaviour is expected.

trace 'do_sys_open "%s", arg2@user' -n main
Trace the open syscall and only print event that process names containing "main"
trace 'do_sys_open "%s", arg2@user' --uid 1001
Expand All @@ -420,6 +448,8 @@ trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
Trace the block_rq_complete kernel tracepoint and print # of tx sectors
trace 'u:pthread:pthread_create (arg4 != 0)'
Trace the USDT probe pthread_create when its 4th argument is non-zero
trace 'u:pthread:libpthread:pthread_create (arg4 != 0)'
Ditto, but the provider name "libpthread" is specified.
trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
Trace the nanosleep syscall and print the sleep duration in ns
trace -c /sys/fs/cgroup/system.slice/workload.service '__x64_sys_nanosleep' '__x64_sys_clone'
Expand All @@ -435,7 +465,7 @@ trace -I 'kernel/sched/sched.h' \
in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel
package. So this command needs to run at the kernel source tree root directory
so that the added header file can be found by the compiler.
trace -I 'net/sock.h' \\
trace -I 'net/sock.h' \
'udpv6_sendmsg(struct sock *sk) (sk->sk_dport == 13568)'
Trace udpv6 sendmsg calls only if socket's destination port is equal
to 53 (DNS; 13568 in big endian order)
Expand All @@ -444,4 +474,3 @@ trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users'
trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U
Trace inet_pton system call and use the specified libraries/executables for
symbol resolution.
"

0 comments on commit 047541c

Please sign in to comment.