From bc54bb6df0ac490b199857390432b7565ac6f037 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Sun, 14 Feb 2016 23:13:13 -0800 Subject: [PATCH] zfsslower --- README.md | 1 + man/man8/zfsslower.8 | 116 ++++++++++++++ tools/zfsslower.py | 292 ++++++++++++++++++++++++++++++++++++ tools/zfsslower_example.txt | 157 +++++++++++++++++++ 4 files changed, 566 insertions(+) create mode 100644 man/man8/zfsslower.8 create mode 100755 tools/zfsslower.py create mode 100644 tools/zfsslower_example.txt diff --git a/README.md b/README.md index ef581af176f7..b813dbd67cc6 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ Tools: - tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt). - tools/[xfsdist](tools/xfsdist.py): Summarize XFS operation latency. [Examples](tools/xfsdist_example.txt). - tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations. [Examples](tools/xfsslower_example.txt). +- tools/[zfsslower](tools/zfsslower.py): Trace slow ZFS operations. [Examples](tools/zfsslower_example.txt). ### Networking diff --git a/man/man8/zfsslower.8 b/man/man8/zfsslower.8 new file mode 100644 index 000000000000..8f5c8cf33aa6 --- /dev/null +++ b/man/man8/zfsslower.8 @@ -0,0 +1,116 @@ +.TH zfsslower 8 "2016-02-11" "USER COMMANDS" +.SH NAME +zfsslower \- Trace slow zfs file operations, with per-event details. +.SH SYNOPSIS +.B zfsslower [\-h] [\-j] [\-p PID] [min_ms] +.SH DESCRIPTION +This tool traces common ZFS file operations: reads, writes, opens, and +syncs. It measures the time spent in these operations, and prints details +for each that exceeded a threshold. + +WARNING: See the OVERHEAD section. + +By default, a minimum millisecond threshold of 10 is used. If a threshold of 0 +is used, all events are printed (warning: verbose). + +This uses kernel dynamic tracing of the ZPL interface (ZFS POSIX +Layer), and will need updates to match any changes to this interface. +.TP +This is intended to work with the ZFS on Linux project: +http://zfsonlinux.org +.PP +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +\-p PID +Trace this PID only. +.TP +min_ms +Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms. +.SH EXAMPLES +.TP +Trace synchronous file reads and writes slower than 10 ms: +# +.B zfsslower +.TP +Trace slower than 1 ms: +# +.B zfsslower 1 +.TP +Trace slower than 1 ms, and output just the fields in parsable format (csv): +# +.B zfsslower \-j 1 +.TP +Trace all file reads and writes (warning: the output will be verbose): +# +.B zfsslower 0 +.TP +Trace slower than 1 ms, for PID 181 only: +# +.B zfsslower \-p 181 1 +.SH FIELDS +.TP +TIME(s) +Time of I/O completion since the first I/O seen, in seconds. +.TP +COMM +Process name. +.TP +PID +Process ID. +.TP +T +Type of operation. R == read, W == write, O == open, S == fsync. +.TP +OFF_KB +File offset for the I/O, in Kbytes. +.TP +BYTES +Size of I/O, in bytes. +.TP +LAT(ms) +Latency (duration) of I/O, measured from when it was issued by VFS to the +filesystem, to when it completed. This time is inclusive of block device I/O, +file system CPU cycles, file system locks, run queue latency, etc. It's a more +accurate measure of the latency suffered by applications performing file +system I/O, than to measure this down at the block device interface. +.TP +FILENAME +A cached kernel file name (comes from dentry->d_iname). +.TP +ENDTIME_us +Completion timestamp, microseconds (\-j only). +.TP +OFFSET_b +File offset, bytes (\-j only). +.TP +LATENCY_us +Latency (duration) of the I/O, in microseconds (\-j only). +.SH OVERHEAD +This adds low-overhead instrumentation to these ZFS operations, +including reads and writes from the file system cache. Such reads and writes +can be very frequent (depending on the workload; eg, 1M/sec), at which +point the overhead of this tool (even if it prints no "slower" events) can +begin to become significant. Measure and quantify before use. If this +continues to be a problem, consider switching to a tool that prints in-kernel +summaries only. +.PP +Note that the overhead of this tool should be less than fileslower(8), as +this tool targets zfs functions only, and not all file read/write paths +(which can include socket I/O). +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +biosnoop(8), funccount(8), fileslower(8) diff --git a/tools/zfsslower.py b/tools/zfsslower.py new file mode 100755 index 000000000000..4337ead42029 --- /dev/null +++ b/tools/zfsslower.py @@ -0,0 +1,292 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# zfsslower Trace slow ZFS operations. +# For Linux, uses BCC, eBPF. +# +# USAGE: zfsslower [-h] [-j] [-p PID] [min_ms] +# +# This script traces common ZFS file operations: reads, writes, opens, and +# syncs. It measures the time spent in these operations, and prints details +# for each that exceeded a threshold. +# +# WARNING: This adds low-overhead instrumentation to these ZFS operations, +# including reads and writes from the file system cache. Such reads and writes +# can be very frequent (depending on the workload; eg, 1M/sec), at which +# point the overhead of this tool (even if it prints no "slower" events) can +# begin to become significant. +# +# This works by using kernel dynamic tracing of the ZPL interface, and will +# need updates to match any changes to this interface. +# +# By default, a minimum millisecond threshold of 10 is used. +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 14-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +import argparse +from time import strftime +import ctypes as ct + +# arguments +examples = """examples: + ./zfsslower # trace operations slower than 10 ms (default) + ./zfsslower 1 # trace operations slower than 1 ms + ./zfsslower -j 1 # ... 1 ms, parsable output (csv) + ./zfsslower 0 # trace all operations (warning: verbose) + ./zfsslower -p 185 # trace PID 185 only +""" +parser = argparse.ArgumentParser( + description="Trace common ZFS file operations slower than a threshold", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-j", "--csv", action="store_true", + help="just print fields: comma-separated values") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("min_ms", nargs="?", default='10', + help="minimum I/O duration to trace, in ms (default 10)") +args = parser.parse_args() +min_ms = int(args.min_ms) +pid = args.pid +csv = args.csv +debug = 0 + +# define BPF program +bpf_text = """ +#include +#include +#include +#include + +// XXX: switch these to char's when supported +#define TRACE_READ 0 +#define TRACE_WRITE 1 +#define TRACE_OPEN 2 +#define TRACE_FSYNC 3 + +struct val_t { + u64 ts; + u64 offset; + struct file *fp; +}; + +struct data_t { + // XXX: switch some to u32's when supported + u64 ts_us; + u64 type; + u64 size; + u64 offset; + u64 delta_us; + u64 pid; + char task[TASK_COMM_LEN]; + char file[DNAME_INLINE_LEN]; +}; + +BPF_HASH(entryinfo, pid_t, struct val_t); +BPF_PERF_OUTPUT(events); + +// +// Store timestamp and size on entry +// + +// zpl_read(), zpl_write(): +int trace_rw_entry(struct pt_regs *ctx, struct file *filp, char __user *buf, + size_t len, loff_t *ppos) +{ + u32 pid; + pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // store filep and timestamp by pid + struct val_t val = {}; + val.ts = bpf_ktime_get_ns(); + val.fp = filp; + val.offset = *ppos; + if (val.fp) + entryinfo.update(&pid, &val); + + return 0; +} + +// zpl_open(): +int trace_open_entry(struct pt_regs *ctx, struct inode *inode, + struct file *filp) +{ + u32 pid; + pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // store filep and timestamp by pid + struct val_t val = {}; + val.ts = bpf_ktime_get_ns(); + val.fp = filp; + val.offset = 0; + if (val.fp) + entryinfo.update(&pid, &val); + + return 0; +} + +// zpl_fsync(): +int trace_fsync_entry(struct pt_regs *ctx, struct file *filp) +{ + u32 pid; + pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // store filp and timestamp by pid + struct val_t val = {}; + val.ts = bpf_ktime_get_ns(); + val.fp = filp; + val.offset = 0; + if (val.fp) + entryinfo.update(&pid, &val); + + return 0; +} + +// +// Output +// + +static int trace_return(struct pt_regs *ctx, int type) +{ + struct val_t *valp; + u32 pid = bpf_get_current_pid_tgid(); + + valp = entryinfo.lookup(&pid); + if (valp == 0) { + // missed tracing issue or filtered + return 0; + } + + // calculate delta + u64 ts = bpf_ktime_get_ns(); + u64 delta_us = (ts - valp->ts) / 1000; + entryinfo.delete(&pid); + if (FILTER_US) + return 0; + + // workaround (rewriter should handle file to d_iname in one step): + struct dentry *de = NULL; + bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry); + + // populate output struct + u32 size = ctx->ax; + struct data_t data = {.type = type, .size = size, .delta_us = delta_us, + .pid = pid}; + data.ts_us = ts / 1000; + data.offset = valp->offset; + bpf_probe_read(&data.file, sizeof(data.file), de->d_iname); + bpf_get_current_comm(&data.task, sizeof(data.task)); + + events.perf_submit(ctx, &data, sizeof(data)); + + return 0; +} + +int trace_read_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_READ); +} + +int trace_write_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_WRITE); +} + +int trace_open_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_OPEN); +} + +int trace_fsync_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_FSYNC); +} + +""" +if min_ms == 0: + bpf_text = bpf_text.replace('FILTER_US', '0') +else: + bpf_text = bpf_text.replace('FILTER_US', + 'delta_us <= %s' % str(min_ms * 1000)) +if args.pid: + bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) +else: + bpf_text = bpf_text.replace('FILTER_PID', '0') +if debug: + print(bpf_text) + +# kernel->user event data: struct data_t +DNAME_INLINE_LEN = 32 # linux/dcache.h +TASK_COMM_LEN = 16 # linux/sched.h +class Data(ct.Structure): + _fields_ = [ + ("ts_us", ct.c_ulonglong), + ("type", ct.c_ulonglong), + ("size", ct.c_ulonglong), + ("offset", ct.c_ulonglong), + ("delta_us", ct.c_ulonglong), + ("pid", ct.c_ulonglong), + ("task", ct.c_char * TASK_COMM_LEN), + ("file", ct.c_char * DNAME_INLINE_LEN) + ] + +# process event +def print_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data)).contents + + type = 'R' + if event.type == 1: + type = 'W' + elif event.type == 2: + type = 'O' + elif event.type == 3: + type = 'S' + + if (csv): + print("%d,%s,%d,%s,%d,%d,%d,%s" % ( + event.ts_us, event.task, event.pid, type, event.size, + event.offset, event.delta_us, event.file)) + return + print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), + event.task, event.pid, type, event.size, event.offset / 1024, + float(event.delta_us) / 1000, event.file)) + +# initialize BPF +b = BPF(text=bpf_text) + +# common file functions +b.attach_kprobe(event="zpl_read", fn_name="trace_rw_entry") +b.attach_kprobe(event="zpl_write", fn_name="trace_rw_entry") +b.attach_kprobe(event="zpl_open", fn_name="trace_open_entry") +b.attach_kprobe(event="zpl_fsync", fn_name="trace_fsync_entry") +b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return") +b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return") +b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return") +b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return") + +# header +if (csv): + print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") +else: + if min_ms == 0: + print("Tracing ZFS operations") + else: + print("Tracing ZFS operations slower than %d ms" % min_ms) + print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", + "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) + +# read events +b["events"].open_perf_buffer(print_event) +while 1: + b.kprobe_poll() diff --git a/tools/zfsslower_example.txt b/tools/zfsslower_example.txt new file mode 100644 index 000000000000..fddae6e266b2 --- /dev/null +++ b/tools/zfsslower_example.txt @@ -0,0 +1,157 @@ +Demonstrations of zfsslower, the Linux eBPF/bcc version. + + +zfsslower shows ZFS reads, writes, opens, and fsyncs, slower than a threshold. +It has been written to work on ZFS on Linux (http://zfsonlinux.org). For +example: + +# ./zfsslower +Tracing ZFS operations slower than 10 ms +TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME +06:31:28 dd 25570 W 131072 38784 303.92 data1 +06:31:34 dd 25686 W 131072 38784 388.28 data1 +06:31:35 dd 25686 W 131072 78720 519.66 data1 +06:31:35 dd 25686 W 131072 116992 405.94 data1 +06:31:35 dd 25686 W 131072 153600 433.52 data1 +06:31:36 dd 25686 W 131072 188672 314.37 data1 +06:31:36 dd 25686 W 131072 222336 372.33 data1 +06:31:36 dd 25686 W 131072 254592 309.59 data1 +06:31:37 dd 25686 W 131072 285440 304.52 data1 +06:31:37 dd 25686 W 131072 315008 236.45 data1 +06:31:37 dd 25686 W 131072 343424 193.54 data1 +06:31:38 dd 25686 W 131072 370560 286.07 data1 +06:31:38 dd 25686 W 131072 396672 251.92 data1 +[...] + +This shows writes to a "data1" file, each taking well over the 10 ms threshold. +the slowest, on the 3rd line of output, reached 519.66 ms for a 128 Kbyte +write by the "dd" command. + +This "latency" is measured from when the operation was issued from the VFS +interface to the file system (via the ZFS POSIX layer), to when it completed. +This spans everything: block device I/O (disk I/O), file system CPU cycles, +file system locks, run queue latency, etc. This is a better measure of the +latency suffered by applications reading from the file system than measuring +this down at the block device interface. + +Note that this only traces the common file system operations previously +listed: other file system operations (eg, inode operations including +getattr()) are not traced. + + +A threshold of 0 will trace all operations. Warning: the output will be +verbose, as it will include all file system cache hits. + +# ./zfsslower 0 +Tracing ZFS operations +TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME +06:36:07 dd 32242 O 0 0 0.01 data1 +06:36:07 dd 32242 W 131072 0 0.25 data1 +06:36:07 dd 32242 W 131072 128 0.03 data1 +06:36:07 dd 32242 W 131072 256 0.04 data1 +06:36:07 dd 32242 W 131072 384 0.04 data1 +06:36:07 dd 32242 W 131072 512 0.04 data1 +06:36:07 dd 32242 W 131072 640 0.03 data1 +06:36:07 dd 32242 W 131072 768 0.03 data1 +06:36:07 dd 32242 W 131072 896 0.04 data1 +06:36:07 dd 32242 W 131072 1024 0.28 data1 +06:36:07 dd 32242 W 131072 1152 0.04 data1 +06:36:07 dd 32242 W 131072 1280 0.03 data1 +[...] +06:36:07 dd 32242 W 131072 13824 0.04 data1 +06:36:07 dd 32242 W 131072 13952 0.04 data1 +06:36:07 dd 32242 W 131072 14080 0.04 data1 +06:36:07 dd 32242 W 131072 14208 398.92 data1 +06:36:07 dd 32242 W 131072 14336 0.04 data1 +06:36:07 dd 32242 W 131072 14464 0.04 data1 +06:36:07 dd 32242 W 131072 15104 0.03 data1 +[...] + +The output now includes the open operation for this file ("O"), and then the +writes. Most of the writes are very fast, with only an occasional outlier that +is in the hundreds of milliseconds. + +Fortunately this is not a real world environment: I setup a zpool on top of a +XFS file system for testing purposes. More debugging using other tools will +explain these outliers: possibly XFS flushing. + + +Here's a random read workload, and showing operations slower than 1 ms: + +# ./zfsslower 1 +Tracing ZFS operations slower than 1 ms +TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME +06:47:30 randread.pl 15431 R 8192 97840 1.03 data1 +06:47:30 randread.pl 15431 R 8192 416744 1.12 data1 +06:47:31 randread.pl 15431 R 8192 228856 1.96 data1 +06:47:31 randread.pl 15431 R 8192 452248 1.02 data1 +06:47:31 randread.pl 15431 R 8192 315288 5.90 data1 +06:47:31 randread.pl 15431 R 8192 752696 1.20 data1 +06:47:31 randread.pl 15431 R 8192 481832 1.39 data1 +06:47:31 randread.pl 15431 R 8192 673752 1.39 data1 +06:47:31 randread.pl 15431 R 8192 691736 1.01 data1 +06:47:31 randread.pl 15431 R 8192 694776 1.78 data1 +06:47:31 randread.pl 15431 R 8192 403328 3.75 data1 +06:47:31 randread.pl 15431 R 8192 567688 1.08 data1 +06:47:31 randread.pl 15431 R 8192 694280 1.31 data1 +06:47:31 randread.pl 15431 R 8192 669280 1.06 data1 +06:47:31 randread.pl 15431 R 8192 426608 1.56 data1 +06:47:31 randread.pl 15431 R 8192 42512 1.01 data1 +06:47:31 randread.pl 15431 R 8192 22944 1.33 data1 +06:47:31 randread.pl 15431 R 8192 427432 1.48 data1 +06:47:31 randread.pl 15431 R 8192 261320 1.28 data1 +06:47:31 randread.pl 15431 R 8192 132248 1.23 data1 +06:47:31 randread.pl 15431 R 8192 96936 1.04 data1 +06:47:31 randread.pl 15431 R 8192 482800 2.63 data1 +[...] + + +A -j option will print just the fields (parsable output, csv): + +# ./zfsslower -j 1 +ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE +252305490911,randread.pl,17922,R,8192,163446784,1156,data1 +252305493852,randread.pl,17922,R,8192,321437696,1129,data1 +252305498839,randread.pl,17922,R,8192,475152384,1154,data1 +252305505515,randread.pl,17922,R,8192,49094656,1082,data1 +252305506774,randread.pl,17922,R,8192,470401024,1245,data1 +252305509265,randread.pl,17922,R,8192,553246720,2412,data1 +252305512365,randread.pl,17922,R,8192,20963328,1093,data1 +252305513755,randread.pl,17922,R,8192,304111616,1350,data1 +252305583330,randread.pl,17922,R,8192,166174720,1154,data1 +252305593913,randread.pl,17922,R,8192,175079424,1241,data1 +252305602833,randread.pl,17922,R,8192,305340416,3307,data1 +252305608663,randread.pl,17922,R,8192,655958016,2704,data1 +252305611212,randread.pl,17922,R,8192,40951808,1033,data1 +252305614609,randread.pl,17922,R,8192,318922752,2687,data1 +252305623800,randread.pl,17922,R,8192,246734848,2983,data1 +252305711125,randread.pl,17922,R,8192,581795840,1091,data1 +252305728694,randread.pl,17922,R,8192,710483968,1034,data1 +252305762046,randread.pl,17922,R,8192,329367552,1405,data1 +252305798215,randread.pl,17922,R,8192,44482560,1030,data1 +252305806748,randread.pl,17922,R,8192,660602880,1069,data1 +252305826360,randread.pl,17922,R,8192,616144896,2327,data1 +[...] + + +USAGE message: + +# ./zfsslower -h +usage: zfsslower [-h] [-j] [-p PID] [min_ms] + +Trace common ZFS file operations slower than a threshold + +positional arguments: + min_ms minimum I/O duration to trace, in ms (default 10) + +optional arguments: + -h, --help show this help message and exit + -j, --csv just print fields: comma-separated values + -p PID, --pid PID trace this PID only + +examples: + ./zfsslower # trace operations slower than 10 ms (default) + ./zfsslower 1 # trace operations slower than 1 ms + ./zfsslower -j 1 # ... 1 ms, parsable output (csv) + ./zfsslower 0 # trace all operations (warning: verbose) + ./zfsslower -p 185 # trace PID 185 only