stackcount: Resolve stacks using a StackTrace table
The changes for this script are minimal: the inline C probe has been
_significantly_ simplified, and should now perform better since the
stack walk happens in native code inside the kernel thanks to the
BPF_STACK_TRACE table.

The output of the tool should be virtually identical, so the man page
and included examples have essentially no changes.
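
For reference, the core of the new probe (shown in full in the tools/stackcount.py diff below) now reduces to:

    BPF_HASH(counts, int);
    BPF_STACK_TRACE(stack_traces, 1024);

    int trace_count(struct pt_regs *ctx) {
        FILTER
        int key = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
        u64 zero = 0;
        u64 *val = counts.lookup_or_init(&key, &zero);
        (*val)++;
        return 0;
    }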
vmg committed Mar 27, 2016
1 parent 592414e commit b4ebed0
Showing 4 changed files with 195 additions and 62 deletions.
17 changes: 7 additions & 10 deletions man/man8/stackcount.8
@@ -12,10 +12,8 @@ and their occurrence counts.
The pattern is a string with optional '*' wildcards, similar to file globbing.
If you'd prefer to use regular expressions, use the \-r option.

The stack depth is currently limited to 10 (+1 for the current instruction
pointer).
This tool only works on Linux 4.6+. Stack traces are obtained using the new `BPF_STACK_TRACE` APIs.

This currently only works on x86_64. Check for future versions.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
@@ -82,13 +80,12 @@ Only count stacks when PID 185 is on-CPU:
This summarizes unique stack traces in-kernel for efficiency, allowing it to
trace a higher rate of function calls than methods that post-process in user
space. The stack trace data is only copied to user space when the output is
printed, which usually only happens once. Given these techniques, I'd suspect
that call rates of < 10,000/sec would incur negligible overhead (for this
current version; future versions may improve this). Beyond that,
there will be a point where the overhead is measurable, as this does add
a number of instructions to each function call to walk and save stacks.
Test before production use. You can also use funccount to get a handle on
function call rates first.
printed, which usually only happens once. The stack walking also happens in an
optimized code path inside the kernel, thanks to the new BPF_STACK_TRACE table
APIs, and should be more efficient than the manual eBPF stack walker that
older versions of this script used. With this in mind, call rates of under
10,000/sec should incur negligible overhead. Test before production use. You
can also use funccount to get a handle on function call rates first.
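For example:
.IP
funccount 'tcp_send*'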
.SH SOURCE
This is from bcc.
.IP
179 changes: 179 additions & 0 deletions tools/old/stackcount.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#!/usr/bin/python
#
# stackcount Count kernel function calls and their stack traces.
# For Linux, uses BCC, eBPF.
#
# USAGE: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] pattern
#
# The pattern is a string with optional '*' wildcards, similar to file
# globbing. If you'd prefer to use regular expressions, use the -r option.
#
# The current implementation uses an unrolled loop for x86_64, and was written
# as a proof of concept. This implementation should be replaced in the future
# with an appropriate bpf_ call, when available.
#
# Currently limited to a stack trace depth of 11 (maxdepth + 1).
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 12-Jan-2016 Brendan Gregg Created this.

from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
import signal

# arguments
examples = """examples:
    ./stackcount submit_bio        # count kernel stack traces for submit_bio
    ./stackcount ip_output         # count kernel stack traces for ip_output
    ./stackcount -s ip_output      # show symbol offsets
    ./stackcount -sv ip_output     # show offsets and raw addresses (verbose)
    ./stackcount 'tcp_send*'       # count stacks for funcs matching tcp_send*
    ./stackcount -r '^tcp_send.*'  # same as above, using regular expressions
    ./stackcount -Ti 5 ip_output   # output every 5 seconds, with timestamps
    ./stackcount -p 185 ip_output  # count ip_output stacks for PID 185 only
"""
parser = argparse.ArgumentParser(
    description="Count kernel function calls and their stack traces",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-p", "--pid",
    help="trace this PID only")
parser.add_argument("-i", "--interval", default=99999999,
    help="summary interval, seconds")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("-r", "--regexp", action="store_true",
    help="use regular expressions. Default is \"*\" wildcards only.")
parser.add_argument("-s", "--offset", action="store_true",
    help="show address offsets")
parser.add_argument("-v", "--verbose", action="store_true",
    help="show raw addresses")
parser.add_argument("pattern",
    help="search expression for kernel functions")
args = parser.parse_args()
pattern = args.pattern
if not args.regexp:
    pattern = pattern.replace('*', '.*')
    pattern = '^' + pattern + '$'
offset = args.offset
verbose = args.verbose
debug = 0
maxdepth = 10 # and MAXDEPTH

# signal handler
def signal_ignore(signal, frame):
    print()

# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>

#define MAXDEPTH 10

struct key_t {
    u64 ip;
    u64 ret[MAXDEPTH];
};
BPF_HASH(counts, struct key_t);

static u64 get_frame(u64 *bp) {
    if (*bp) {
        // The following stack walker is x86_64 specific
        u64 ret = 0;
        if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
            return 0;
        if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
            *bp = 0;
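        // a return address below the start of the kernel mapping cannot be
        // kernel text, so stop the walk: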
        if (ret < __START_KERNEL_map)
            return 0;
        return ret;
    }
    return 0;
}

int trace_count(struct pt_regs *ctx) {
    FILTER
    struct key_t key = {};
    u64 zero = 0, *val, bp = 0;
    int depth = 0;

    key.ip = ctx->ip;
    bp = ctx->bp;

    // unrolled loop, 10 (MAXDEPTH) frames deep:
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;

out:
    val = counts.lookup_or_init(&key, &zero);
    (*val)++;
    return 0;
}
"""
if args.pid:
    bpf_text = bpf_text.replace('FILTER',
        ('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
        'if (pid != %s) { return 0; }') % (args.pid))
else:
    bpf_text = bpf_text.replace('FILTER', '')
if debug:
    print(bpf_text)
b = BPF(text=bpf_text)
b.attach_kprobe(event_re=pattern, fn_name="trace_count")
matched = b.num_open_kprobes()
if matched == 0:
    print("0 functions matched by \"%s\". Exiting." % args.pattern)
    exit()

# header
print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
(matched, args.pattern))

def print_frame(addr):
    print(" ", end="")
    if verbose:
        print("%-16x " % addr, end="")
    if offset:
        print("%s" % b.ksymaddr(addr))
    else:
        print("%s" % b.ksym(addr))

# output
exiting = 0 if args.interval else 1
while (1):
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        exiting = 1
        # as cleanup can take many seconds, trap Ctrl-C:
        signal.signal(signal.SIGINT, signal_ignore)

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    counts = b.get_table("counts")
    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
        print_frame(k.ip)
        for i in range(0, maxdepth):
            if k.ret[i] == 0:
                break
            print_frame(k.ret[i])
        print(" %d\n" % v.value)
    counts.clear()

    if exiting:
        print("Detaching...")
        exit()
58 changes: 9 additions & 49 deletions tools/stackcount.py
@@ -72,52 +72,14 @@ def signal_ignore(signal, frame):
bpf_text = """
#include <uapi/linux/ptrace.h>

#define MAXDEPTH 10

struct key_t {
    u64 ip;
    u64 ret[MAXDEPTH];
};
BPF_HASH(counts, struct key_t);

static u64 get_frame(u64 *bp) {
    if (*bp) {
        // The following stack walker is x86_64 specific
        u64 ret = 0;
        if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
            return 0;
        if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
            *bp = 0;
        if (ret < __START_KERNEL_map)
            return 0;
        return ret;
    }
    return 0;
}
BPF_HASH(counts, int);
BPF_STACK_TRACE(stack_traces, 1024);
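// stack_traces stores up to 1024 unique stacks; get_stackid() below returns
// an index into this table, and BPF_F_REUSE_STACKID reuses an existing slot
// when an identical stack has already been recorded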
int trace_count(struct pt_regs *ctx) {
    FILTER
    struct key_t key = {};
    u64 zero = 0, *val, bp = 0;
    int depth = 0;

    key.ip = ctx->ip;
    bp = ctx->bp;

    // unrolled loop, 10 (MAXDEPTH) frames deep:
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
    if (!(key.ret[depth++] = get_frame(&bp))) goto out;

out:
    val = counts.lookup_or_init(&key, &zero);
    int key = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
    u64 zero = 0;
    u64 *val = counts.lookup_or_init(&key, &zero);
    (*val)++;
    return 0;
}
Expand Down Expand Up @@ -164,13 +126,11 @@ def print_frame(addr):
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    counts = b.get_table("counts")
    counts = b["counts"]
    stack_traces = b["stack_traces"]
    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
        print_frame(k.ip)
        for i in range(0, maxdepth):
            if k.ret[i] == 0:
                break
            print_frame(k.ret[i])
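        # k is the integer stack id; walk() resolves it to the sequence of
        # return addresses saved in the stack_traces table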
        for addr in stack_traces.walk(k.value):
            print_frame(addr)
        print(" %d\n" % v.value)
    counts.clear()

3 changes: 0 additions & 3 deletions tools/stackcount_example.txt
@@ -82,9 +82,6 @@ tracing.
It can be useful to trace the path to submit_bio to explain unusual rates of
disk IOPS. These could have in-kernel origins (eg, background scrub).

This version of stackcount truncates stacks to 10 levels deep (plus 1 for
the traced function, so 11).


As another example, here are the code paths that led to ip_output(), which
sends a packet at the IP level:
