Skip to content

Commit

Permalink
ext4dist
Browse files Browse the repository at this point in the history
  • Loading branch information
brendangregg committed Feb 12, 2016
1 parent cd1cad1 commit 1dcedc4
Show file tree
Hide file tree
Showing 4 changed files with 477 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ Tools:
- tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt).
- tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt).
- tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt).
- tools/[ext4dist](tools/ext4dist.py): Summarize ext4 operation latency. [Examples](tools/ext4dist_example.txt).
- tools/[ext4slower](tools/ext4slower.py): Trace slow ext4 operations. [Examples](tools/ext4slower_example.txt).
- tools/[filelife](tools/filelife.py): Trace the lifespan of short-lived files. [Examples](tools/filelife_example.txt).
- tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt).
Expand Down
80 changes: 80 additions & 0 deletions man/man8/ext4dist.8
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
.TH ext4dist 8 "2016-02-12" "USER COMMANDS"
.SH NAME
ext4dist \- Summarize ext4 operation latency. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B ext4dist [\-h] [\-T] [\-N] [\-d] [interval] [count]
.SH DESCRIPTION
This tool summarizes time (latency) spent in common ext4 file operations: reads,
writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an
in-kernel eBPF map to store the histogram for efficiency.

Since this works by tracing the ext4_file_operations interface functions, it
will need updating to match any changes to these functions.

Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
.TP
\-h
Print usage message.
.TP
\-T
Don't include timestamps on interval output.
.TP
\-m
Output in milliseconds.
.TP
\-p PID
Trace this PID only.
.SH EXAMPLES
.TP
Trace ext4 operation time, and print a summary on Ctrl-C:
#
.B ext4dist
.TP
Trace PID 181 only:
#
.B ext4dist -p 181
.TP
Print 1 second summaries, 10 times:
#
.B ext4dist 1 10
.TP
1 second summaries, printed in milliseconds
#
.B ext4dist \-m 1
.SH FIELDS
.TP
msecs
Range of milliseconds for this bucket.
.TP
usecs
Range of microseconds for this bucket.
.TP
count
Number of operations in this time range.
.TP
distribution
ASCII representation of the distribution (the count column).
.SH OVERHEAD
This adds low-overhead instrumentation to these ext4 operations,
including reads and writes from the file system cache. Such reads and writes
can be very frequent (depending on the workload; eg, 1M/sec), at which
point the overhead of this tool may become noticeable.
Measure and quantify before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
ext4snoop(8)
203 changes: 203 additions & 0 deletions tools/ext4dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# ext4dist Summarize ext4 operation latency.
# For Linux, uses BCC, eBPF.
#
# USAGE: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count]
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 12-Feb-2016 Brendan Gregg Created this.

from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse

# symbols
kallsyms = "/proc/kallsyms"

# arguments
examples = """examples:
./ext4dist # show operation latency as a histogram
./ext4dist -p 181 # trace PID 181 only
./ext4dist 1 10 # print 1 second summaries, 10 times
./ext4dist -m 5 # 5s summaries, milliseconds
"""
parser = argparse.ArgumentParser(
description="Summarize ext4 operation latency",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--notimestamp", action="store_true",
help="don't include timestamp on interval output")
parser.add_argument("-m", "--milliseconds", action="store_true",
help="output in milliseconds")
parser.add_argument("-p", "--pid",
help="trace this PID only")
parser.add_argument("interval", nargs="?",
help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
help="number of outputs")
args = parser.parse_args()
pid = args.pid
countdown = int(args.count)
if args.milliseconds:
factor = 1000000
label = "msecs"
else:
factor = 1000
label = "usecs"
if args.interval and int(args.interval) == 0:
print("ERROR: interval 0. Exiting.")
exit()
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>
#define OP_NAME_LEN 8
typedef struct dist_key {
char op[OP_NAME_LEN];
u64 slot;
} dist_key_t;
BPF_HASH(start, u32);
BPF_HISTOGRAM(dist, dist_key_t);
// time operation
int trace_entry(struct pt_regs *ctx)
{
u32 pid = bpf_get_current_pid_tgid();
if (FILTER_PID)
return 0;
u64 ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
return 0;
}
// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's
// own function, for reads. So we need to trace that and then filter on ext4,
// which I do by checking file->f_op.
int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
{
u32 pid = bpf_get_current_pid_tgid();
if (FILTER_PID)
return 0;
// ext4 filter on file->f_op == ext4_file_operations
struct file *fp = iocb->ki_filp;
if ((u64)fp->f_op != EXT4_FILE_OPERATIONS)
return 0;
u64 ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
return 0;
}
static int trace_return(struct pt_regs *ctx, const char *op)
{
u64 *tsp;
u32 pid = bpf_get_current_pid_tgid();
// fetch timestamp and calculate delta
tsp = start.lookup(&pid);
if (tsp == 0) {
return 0; // missed start or filtered
}
u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
// store as histogram
dist_key_t key = {.slot = bpf_log2l(delta)};
__builtin_memcpy(&key.op, op, sizeof(key.op));
dist.increment(key);
start.delete(&pid);
return 0;
}
int trace_read_return(struct pt_regs *ctx)
{
char *op = "read";
return trace_return(ctx, op);
}
int trace_write_return(struct pt_regs *ctx)
{
char *op = "write";
return trace_return(ctx, op);
}
int trace_open_return(struct pt_regs *ctx)
{
char *op = "open";
return trace_return(ctx, op);
}
int trace_fsync_return(struct pt_regs *ctx)
{
char *op = "fsync";
return trace_return(ctx, op);
}
"""

# code replacements
with open(kallsyms) as syms:
ops = ''
for line in syms:
(addr, size, name) = line.rstrip().split(" ", 2)
if name == "ext4_file_operations":
ops = "0x" + addr
break
if ops == '':
print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.")
exit()
bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops)
bpf_text = bpf_text.replace('FACTOR', str(factor))
if args.pid:
bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
else:
bpf_text = bpf_text.replace('FILTER_PID', '0')
if debug:
print(bpf_text)

# load BPF program
b = BPF(text=bpf_text)

# Common file functions. See earlier comment about generic_file_read_iter().
b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry")
b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry")
b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry")
b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry")
b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return")
b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return")
b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return")
b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return")

print("Tracing ext4 operation latency... Hit Ctrl-C to end.")

# output
exiting = 0
dist = b.get_table("dist")
while (1):
try:
if args.interval:
sleep(int(args.interval))
else:
sleep(99999999)
except KeyboardInterrupt:
exiting = 1

print()
if args.interval and (not args.notimestamp):
print(strftime("%H:%M:%S:"))

dist.print_log2_hist(label, "operation")
dist.clear()

countdown -= 1
if exiting or countdown == 0:
exit()
Loading

0 comments on commit 1dcedc4

Please sign in to comment.