diff --git a/README.md b/README.md index a76cfc22fb5d..e333046004ef 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ Tools: - tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt). - tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt). - tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt). +- tools/[btrfsdist](tools/btrfsdist.py): Summarize btrfs operation latency distribution as a histogram. [Examples](tools/btrfsdist_example.txt). - tools/[btrfsslower](tools/btrfsslower.py): Trace slow btrfs operations. [Examples](tools/btrfsslower_example.txt). - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt). - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt). diff --git a/man/man8/btrfsdist.8 b/man/man8/btrfsdist.8 new file mode 100644 index 000000000000..480264f990f5 --- /dev/null +++ b/man/man8/btrfsdist.8 @@ -0,0 +1,82 @@ +.TH btrfsdist 8 "2016-02-15" "USER COMMANDS" +.SH NAME +btrfsdist \- Summarize btrfs operation latency. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B btrfsdist [\-h] [\-T] [\-N] [\-d] [interval] [count] +.SH DESCRIPTION +This tool summarizes time (latency) spent in common btrfs file operations: +reads, writes, opens, and syncs, and presents it as a power-of-2 histogram. It +uses an in-kernel eBPF map to store the histogram for efficiency. + +Since this works by tracing the btrfs_file_operations interface functions, it +will need updating to match any changes to these functions. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Don't include timestamps on interval output. +.TP +\-m +Output in milliseconds. +.TP +\-p PID +Trace this PID only. +.SH EXAMPLES +.TP +Trace btrfs operation time, and print a summary on Ctrl-C: +# +.B btrfsdist +.TP +Trace PID 181 only: +# +.B btrfsdist -p 181 +.TP +Print 1 second summaries, 10 times: +# +.B btrfsdist 1 10 +.TP +1 second summaries, printed in milliseconds +# +.B btrfsdist \-m 1 +.SH FIELDS +.TP +msecs +Range of milliseconds for this bucket. +.TP +usecs +Range of microseconds for this bucket. +.TP +count +Number of operations in this time range. +.TP +distribution +ASCII representation of the distribution (the count column). +.SH OVERHEAD +This adds low-overhead instrumentation to btrfs writes and fsyncs, as well +as all system reads and opens (due to the current implementation of the +btrfs_file_operations interface). Particularly, all reads and writes from +the file system cache will incur extra overhead while tracing. Such reads and +writes can be very frequent (depending on the workload; eg, 1M/sec), at which +point the overhead of this tool may become noticeable. +Measure and quantify before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +btrfsslower(8) diff --git a/tools/btrfsdist.py b/tools/btrfsdist.py new file mode 100755 index 000000000000..2ce077cb620c --- /dev/null +++ b/tools/btrfsdist.py @@ -0,0 +1,223 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# btrfsdist Summarize btrfs operation latency. +# For Linux, uses BCC, eBPF. +# +# USAGE: btrfsdist [-h] [-T] [-m] [-p PID] [interval] [count] +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 15-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse + +# symbols +kallsyms = "/proc/kallsyms" + +# arguments +examples = """examples: + ./btrfsdist # show operation latency as a histogram + ./btrfsdist -p 181 # trace PID 181 only + ./btrfsdist 1 10 # print 1 second summaries, 10 times + ./btrfsdist -m 5 # 5s summaries, milliseconds +""" +parser = argparse.ArgumentParser( + description="Summarize btrfs operation latency", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--notimestamp", action="store_true", + help="don't include timestamp on interval output") +parser.add_argument("-m", "--milliseconds", action="store_true", + help="output in milliseconds") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("interval", nargs="?", + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +pid = args.pid +countdown = int(args.count) +if args.milliseconds: + factor = 1000000 + label = "msecs" +else: + factor = 1000 + label = "usecs" +if args.interval and int(args.interval) == 0: + print("ERROR: interval 0. Exiting.") + exit() +debug = 0 + +# define BPF program +bpf_text = """ +#include +#include +#include + +#define OP_NAME_LEN 8 +typedef struct dist_key { + char op[OP_NAME_LEN]; + u64 slot; +} dist_key_t; +BPF_HASH(start, u32); +BPF_HISTOGRAM(dist, dist_key_t); + +// time operation +int trace_entry(struct pt_regs *ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +// The current btrfs (Linux 4.5) uses generic_file_read_iter() instead of it's +// own read function. So we need to trace that and then filter on btrfs, which +// I do by checking file->f_op. +int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // btrfs filter on file->f_op == btrfs_file_operations + struct file *fp = iocb->ki_filp; + if ((u64)fp->f_op != BTRFS_FILE_OPERATIONS) + return 0; + + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +// The current btrfs (Linux 4.5) uses generic_file_open(), instead of it's own +// function. Same as with reads. Trace the generic path and filter: +int trace_open_entry(struct pt_regs *ctx, struct inode *inode, + struct file *file) +{ + u32 pid; + pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // btrfs filter on file->f_op == btrfs_file_operations + if ((u64)file->f_op != BTRFS_FILE_OPERATIONS) + return 0; + + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +static int trace_return(struct pt_regs *ctx, const char *op) +{ + u64 *tsp; + u32 pid = bpf_get_current_pid_tgid(); + + // fetch timestamp and calculate delta + tsp = start.lookup(&pid); + if (tsp == 0) { + return 0; // missed start or filtered + } + u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR; + + // store as histogram + dist_key_t key = {.slot = bpf_log2l(delta)}; + __builtin_memcpy(&key.op, op, sizeof(key.op)); + dist.increment(key); + + start.delete(&pid); + return 0; +} + +int trace_read_return(struct pt_regs *ctx) +{ + char *op = "read"; + return trace_return(ctx, op); +} + +int trace_write_return(struct pt_regs *ctx) +{ + char *op = "write"; + return trace_return(ctx, op); +} + +int trace_open_return(struct pt_regs *ctx) +{ + char *op = "open"; + return trace_return(ctx, op); +} + +int trace_fsync_return(struct pt_regs *ctx) +{ + char *op = "fsync"; + return trace_return(ctx, op); +} +""" + +# code replacements +with open(kallsyms) as syms: + ops = '' + for line in syms: + a = line.rstrip().split() + (addr, name) = (a[0], a[2]) + if name == "btrfs_file_operations": + ops = "0x" + addr + break + if ops == '': + print("ERROR: no btrfs_file_operations in /proc/kallsyms. Exiting.") + exit() + bpf_text = bpf_text.replace('BTRFS_FILE_OPERATIONS', ops) +bpf_text = bpf_text.replace('FACTOR', str(factor)) +if args.pid: + bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) +else: + bpf_text = bpf_text.replace('FILTER_PID', '0') +if debug: + print(bpf_text) + +# load BPF program +b = BPF(text=bpf_text) + +# Common file functions. See earlier comment about generic_file_read_iter(). +b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") +b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_entry") +b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry") +b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_entry") +b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") +b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return") +b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return") +b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return") + +print("Tracing btrfs operation latency... Hit Ctrl-C to end.") + +# output +exiting = 0 +dist = b.get_table("dist") +while (1): + try: + if args.interval: + sleep(int(args.interval)) + else: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + print() + if args.interval and (not args.notimestamp): + print(strftime("%H:%M:%S:")) + + dist.print_log2_hist(label, "operation") + dist.clear() + + countdown -= 1 + if exiting or countdown == 0: + exit() diff --git a/tools/btrfsdist_example.txt b/tools/btrfsdist_example.txt new file mode 100644 index 000000000000..4cadc76a97c7 --- /dev/null +++ b/tools/btrfsdist_example.txt @@ -0,0 +1,179 @@ +Demonstrations of btrfsdist, the Linux eBPF/bcc version. + + +btrfsdist traces btrfs reads, writes, opens, and fsyncs, and summarizes their +latency as a power-of-2 histogram. For example: + +# ./btrfsdist +Tracing btrfs operation latency... Hit Ctrl-C to end. +^C + +operation = 'read' + usecs : count distribution + 0 -> 1 : 15 | | + 2 -> 3 : 1308 |******* | + 4 -> 7 : 198 |* | + 8 -> 15 : 0 | | + 16 -> 31 : 11 | | + 32 -> 63 : 361 |* | + 64 -> 127 : 55 | | + 128 -> 255 : 104 | | + 256 -> 511 : 7312 |****************************************| + 512 -> 1023 : 387 |** | + 1024 -> 2047 : 10 | | + 2048 -> 4095 : 4 | | + +operation = 'write' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 4 |****************************************| + +operation = 'open' + usecs : count distribution + 0 -> 1 : 1 |********** | + 2 -> 3 : 4 |****************************************| + +This output shows a bi-modal distribution for read latency, with a faster +mode of 1,308 reads that took between 2 and 3 microseconds, and a slower +mode of over 7,312 reads that took between 256 and 511 microseconds. It's +likely that the faster mode was a hit from the in-memory file system cache, +and the slower mode is a read from a storage device (disk). + +This "latency" is measured from when the operation was issued from the VFS +interface to the file system, to when it completed. This spans everything: +block device I/O (disk I/O), file system CPU cycles, file system locks, run +queue latency, etc. This is a better measure of the latency suffered by +applications reading from the file system than measuring this down at the +block device interface. + +Note that this only traces the common file system operations previously +listed: other file system operations (eg, inode operations including +getattr()) are not traced. + + +An optional interval and a count can be provided, as well as -m to show the +distributions in milliseconds. For example, two second summaries, five times: + +# ./btrfsdist 2 5 +Tracing btrfs operation latency... Hit Ctrl-C to end. + +03:40:49: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 15 | | + 2 -> 3 : 833 |******** | + 4 -> 7 : 127 |* | + 8 -> 15 : 0 | | + 16 -> 31 : 8 | | + 32 -> 63 : 907 |******** | + 64 -> 127 : 91 | | + 128 -> 255 : 246 |** | + 256 -> 511 : 4164 |****************************************| + 512 -> 1023 : 193 |* | + 1024 -> 2047 : 4 | | + 2048 -> 4095 : 6 | | + 4096 -> 8191 : 2 | | + +03:40:51: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 25 | | + 2 -> 3 : 1491 |*************** | + 4 -> 7 : 218 |** | + 8 -> 15 : 0 | | + 16 -> 31 : 16 | | + 32 -> 63 : 1527 |*************** | + 64 -> 127 : 319 |*** | + 128 -> 255 : 429 |**** | + 256 -> 511 : 3841 |****************************************| + 512 -> 1023 : 232 |** | + 1024 -> 2047 : 3 | | + 2048 -> 4095 : 6 | | + 4096 -> 8191 : 1 | | + 8192 -> 16383 : 1 | | + +03:40:53: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 27 | | + 2 -> 3 : 2999 |********************************* | + 4 -> 7 : 407 |**** | + 8 -> 15 : 0 | | + 16 -> 31 : 46 | | + 32 -> 63 : 3538 |****************************************| + 64 -> 127 : 595 |****** | + 128 -> 255 : 621 |******* | + 256 -> 511 : 3532 |*************************************** | + 512 -> 1023 : 212 |** | + 1024 -> 2047 : 1 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 0 | | + 8192 -> 16383 : 0 | | + 16384 -> 32767 : 1 | | + +03:40:55: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 221 | | + 2 -> 3 : 12580 |****************************************| + 4 -> 7 : 1366 |**** | + 8 -> 15 : 0 | | + 16 -> 31 : 289 | | + 32 -> 63 : 10782 |********************************** | + 64 -> 127 : 1232 |*** | + 128 -> 255 : 807 |** | + 256 -> 511 : 2299 |******* | + 512 -> 1023 : 135 | | + 1024 -> 2047 : 5 | | + 2048 -> 4095 : 2 | | + +03:40:57: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 73951 |************************* | + 2 -> 3 : 117639 |****************************************| + 4 -> 7 : 7943 |** | + 8 -> 15 : 1841 | | + 16 -> 31 : 1143 | | + 32 -> 63 : 5006 |* | + 64 -> 127 : 483 | | + 128 -> 255 : 242 | | + 256 -> 511 : 253 | | + 512 -> 1023 : 84 | | + 1024 -> 2047 : 23 | | + +This shows a read workload that begins bimodal, and eventually the second +mode disappears. The reason for this is that the workload cached during +tracing. Note that the rate also increased, with over 200k reads for the +final two second sample. + + +USAGE message: + +# ./btrfsdist -h +usage: btrfsdist [-h] [-T] [-m] [-p PID] [interval] [count] + +Summarize btrfs operation latency + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --notimestamp don't include timestamp on interval output + -m, --milliseconds output in milliseconds + -p PID, --pid PID trace this PID only + +examples: + ./btrfsdist # show operation latency as a histogram + ./btrfsdist -p 181 # trace PID 181 only + ./btrfsdist 1 10 # print 1 second summaries, 10 times + ./btrfsdist -m 5 # 5s summaries, milliseconds