Skip to content

Commit

Permalink
fileslower/filetop: use de->d_name.name, add filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark Drayton committed Aug 29, 2016
1 parent 4ebb7cf commit 7434731
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 55 deletions.
5 changes: 4 additions & 1 deletion man/man8/fileslower.8
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
.SH NAME
fileslower \- Trace slow synchronous file reads and writes.
.SH SYNOPSIS
.B fileslower [\-h] [\-p PID] [min_ms]
.B fileslower [\-h] [\-p PID] [-a] [min_ms]
.SH DESCRIPTION
This script uses kernel dynamic tracing of synchronous reads and writes
at the VFS interface, to identify slow file reads and writes for any file
Expand Down Expand Up @@ -30,6 +30,9 @@ CONFIG_BPF and bcc.
\-p PID
Trace this PID only.
.TP
\-a
Include non-regular file types in output (sockets, FIFOs, etc).
.TP
min_ms
Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
.SH EXAMPLES
Expand Down
8 changes: 6 additions & 2 deletions man/man8/filetop.8
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@ filetop \- File reads and writes by filename and process. Top for files.
.SH SYNOPSIS
.B filetop [\-h] [\-C] [\-r MAXROWS] [\-p PID] [interval] [count]
.SH DESCRIPTION
This is top for files.
This is top for files.

This traces file reads and writes, and prints a per-file summary every
interval (by default, 1 second). The summary is sorted on the highest read
throughput (Kbytes).
throughput (Kbytes). By default only IO on regular files is shown. The -a
option will list all file types (sokets, FIFOs, etc).

This uses in-kernel eBPF maps to store per process summaries for efficiency.

Expand All @@ -29,6 +30,9 @@ Since this uses BPF, only the root user can use this tool.
CONFIG_BPF and bcc.
.SH OPTIONS
.TP
\-a
Include non-regular file types (sockets, FIFOs, etc).
.TP
\-C
Don't clear the screen.
.TP
Expand Down
53 changes: 33 additions & 20 deletions tools/fileslower.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# fileslower Trace slow synchronous file reads and writes.
# For Linux, uses BCC, eBPF.
#
# USAGE: fileslower [-h] [-p PID] [min_ms]
# USAGE: fileslower [-h] [-p PID] [-a] [min_ms]
#
# This script uses kernel dynamic tracing of synchronous reads and writes
# at the VFS interface, to identify slow file reads and writes for any file
Expand Down Expand Up @@ -32,7 +32,6 @@
from bcc import BPF
import argparse
import ctypes as ct
import signal
import time

# arguments
Expand All @@ -45,19 +44,17 @@
description="Trace slow synchronous file reads and writes",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-p", "--pid",
parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
help="trace this PID only")
parser.add_argument("-a", "--all-files", action="store_true",
help="include non-regular file types (sockets, FIFOs, etc)")
parser.add_argument("min_ms", nargs="?", default='10',
help="minimum I/O duration to trace, in ms (default 10)")
args = parser.parse_args()
min_ms = int(args.min_ms)
pid = args.pid
tgid = args.tgid
debug = 0

# signal handler
def signal_ignore(signal, frame):
print()

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
Expand All @@ -72,6 +69,8 @@ def signal_ignore(signal, frame):
struct val_t {
u32 sz;
u64 ts;
u32 name_len;
// de->d_name.name may point to de->d_iname so limit len accordingly
char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
};
Expand All @@ -81,6 +80,7 @@ def signal_ignore(signal, frame):
u32 pid;
u32 sz;
u64 delta_us;
u32 name_len;
char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
};
Expand All @@ -92,22 +92,25 @@ def signal_ignore(signal, frame):
static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
char __user *buf, size_t count)
{
u32 pid;
pid = bpf_get_current_pid_tgid();
if (FILTER)
u32 tgid = bpf_get_current_pid_tgid() >> 32;
if (TGID_FILTER)
return 0;
u32 pid = bpf_get_current_pid_tgid();
// skip I/O lacking a filename
struct dentry *de = file->f_path.dentry;
if (de->d_iname[0] == 0)
int mode = file->f_inode->i_mode;
if (de->d_name.len == 0 || TYPE_FILTER)
return 0;
// store size and timestamp by pid
struct val_t val = {};
val.sz = count;
val.ts = bpf_ktime_get_ns();
__builtin_memcpy(&val.name, de->d_iname, sizeof(val.name));
val.name_len = de->d_name.len;
bpf_probe_read(&val.name, sizeof(val.name), (void *)de->d_name.name);
bpf_get_current_comm(&val.comm, sizeof(val.comm));
entryinfo.update(&pid, &val);
Expand Down Expand Up @@ -153,6 +156,7 @@ def signal_ignore(signal, frame):
data.pid = pid;
data.sz = valp->sz;
data.delta_us = delta_us;
data.name_len = valp->name_len;
bpf_probe_read(&data.name, sizeof(data.name), valp->name);
bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
events.perf_submit(ctx, &data, sizeof(data));
Expand All @@ -172,15 +176,20 @@ def signal_ignore(signal, frame):
"""
bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000))
if args.pid:
bpf_text = bpf_text.replace('FILTER', 'pid != %s' % pid)
if args.tgid:
bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid)
else:
bpf_text = bpf_text.replace('FILTER', '0')
bpf_text = bpf_text.replace('TGID_FILTER', '0')
if args.all_files:
bpf_text = bpf_text.replace('TYPE_FILTER', '0')
else:
bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')

if debug:
print(bpf_text)

# initialize BPF
b = BPF(text=bpf_text)
b = BPF(text=bpf_text,)

# I'd rather trace these via new_sync_read/new_sync_write (which used to be
# do_sync_read/do_sync_write), but those became static. So trace these from
Expand All @@ -205,6 +214,7 @@ class Data(ct.Structure):
("pid", ct.c_uint),
("sz", ct.c_uint),
("delta_us", ct.c_ulonglong),
("name_len", ct.c_uint),
("name", ct.c_char * DNAME_INLINE_LEN),
("comm", ct.c_char * TASK_COMM_LEN),
]
Expand All @@ -216,7 +226,7 @@ class Data(ct.Structure):

# header
print("Tracing sync read/writes slower than %d ms" % min_ms)
print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "PID", "D",
print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D",
"BYTES", "LAT(ms)", "FILENAME"))

start_ts = time.time()
Expand All @@ -225,10 +235,13 @@ def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents

ms = float(event.delta_us) / 1000
name = event.name
if event.name_len > DNAME_INLINE_LEN:
name = name[:-3] + "..."

print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % (
time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, event.name))
event.sz, ms, name))

b["events"].open_perf_buffer(print_event)
while 1:
Expand Down
3 changes: 2 additions & 1 deletion tools/fileslower_example.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ locks, run queue latency, etc. These can be explored using other commands.
USAGE message:

# ./fileslower -h
usage: fileslower [-h] [-p PID] [min_ms]
usage: fileslower.py [-h] [-p PID] [-a] [min_ms]

Trace slow synchronous file reads and writes

Expand All @@ -115,6 +115,7 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
-p PID, --pid PID trace this PID only
-a, --all-files include non-regular file types

examples:
./fileslower # trace sync file I/O slower than 10 ms (default)
Expand Down
59 changes: 38 additions & 21 deletions tools/filetop.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,13 @@
description="File reads and writes by process",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-a", "--all-files", action="store_true",
help="include non-regular file types (sockets, FIFOs, etc)")
parser.add_argument("-C", "--noclear", action="store_true",
help="don't clear the screen")
parser.add_argument("-r", "--maxrows", default=20,
help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid",
parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
help="trace this PID only")
parser.add_argument("interval", nargs="?", default=1,
help="output interval, in seconds")
Expand All @@ -61,13 +63,13 @@ def signal_ignore(signal, frame):
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
#define MAX_FILE_LEN 32
// the key for the output summary
struct info_t {
u32 pid;
char name[TASK_COMM_LEN];
char file[MAX_FILE_LEN];
u32 name_len;
char comm[TASK_COMM_LEN];
// de->d_name.name may point to de->d_iname so limit len accordingly
char name[DNAME_INLINE_LEN];
char type;
};
Expand All @@ -84,22 +86,23 @@ def signal_ignore(signal, frame):
static int do_entry(struct pt_regs *ctx, struct file *file,
char __user *buf, size_t count, int is_read)
{
u32 pid;
pid = bpf_get_current_pid_tgid();
if (FILTER)
u32 tgid = bpf_get_current_pid_tgid() >> 32;
if (TGID_FILTER)
return 0;
u32 pid = bpf_get_current_pid_tgid();
// skip I/O lacking a filename
struct dentry *de = file->f_path.dentry;
if (de->d_iname[0] == 0)
int mode = file->f_inode->i_mode;
if (de->d_name.len == 0 || TYPE_FILTER)
return 0;
// store counts and sizes by pid & file
struct info_t info = {.pid = pid};
bpf_get_current_comm(&info.name, sizeof(info.name));
__builtin_memcpy(&info.file, de->d_iname, sizeof(info.file));
int mode = file->f_inode->i_mode;
bpf_get_current_comm(&info.comm, sizeof(info.comm));
info.name_len = de->d_name.len;
bpf_probe_read(&info.name, sizeof(info.name), (void *)de->d_name.name);
if (S_ISREG(mode)) {
info.type = 'R';
} else if (S_ISSOCK(mode)) {
Expand Down Expand Up @@ -134,17 +137,28 @@ def signal_ignore(signal, frame):
}
"""
if args.pid:
bpf_text = bpf_text.replace('FILTER', 'pid != %s' % args.pid)
if args.tgid:
bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
else:
bpf_text = bpf_text.replace('TGID_FILTER', '0')
if args.all_files:
bpf_text = bpf_text.replace('TYPE_FILTER', '0')
else:
bpf_text = bpf_text.replace('FILTER', '0')
bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')

if debug:
print(bpf_text)

# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry")
b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
try:
b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
except:
# older kernels don't have __vfs_write so try vfs_write instead
b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry")

DNAME_INLINE_LEN = 32 # linux/dcache.h

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

Expand All @@ -163,19 +177,22 @@ def signal_ignore(signal, frame):
print()
with open(loadavg) as stats:
print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("PID", "COMM",
print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("TID", "COMM",
"READS", "WRITES", "R_Kb", "W_Kb", "T", "FILE"))

# by-PID output
# by-TID output
counts = b.get_table("counts")
line = 0
for k, v in reversed(sorted(counts.items(),
key=lambda counts: counts[1].rbytes)):
name = k.name
if k.name_len > DNAME_INLINE_LEN:
name = name[:-3] + "..."

# print line
print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.name,
print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.comm,
v.reads, v.writes, v.rbytes / 1024, v.wbytes / 1024, k.type,
k.file))
name))

line += 1
if line >= maxrows:
Expand Down
Loading

0 comments on commit 7434731

Please sign in to comment.