diff --git a/README.md b/README.md index bed0b22d54b8..ce8d8bdf756c 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ Examples: Tools: +- tools/[biolatency](tools/biolatency): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt). - tools/[biosnoop](tools/biosnoop): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt). - tools/[funccount](tools/funccount): Count kernel function calls. [Examples](tools/funccount_example.txt). - tools/[funclatency](tools/funclatency): Time kernel functions and show their latency distribution. [Examples](tools/funclatency_example.txt). diff --git a/man/man8/biolatency.8 b/man/man8/biolatency.8 new file mode 100644 index 000000000000..8f0156d9eea6 --- /dev/null +++ b/man/man8/biolatency.8 @@ -0,0 +1,89 @@ +.TH biolatency 8 "2015-08-20" "USER COMMANDS" +.SH NAME +biolatency \- Summarize block device I/O latency as a histogram. +.SH SYNOPSIS +.B biolatency [\-h] [\-T] [\-Q] [\-m] [interval [count]] +.SH DESCRIPTION +biolatency traces block device I/O (disk I/O), and records the distribution +of I/O latency (time). This is printed as a histogram either on Ctrl-C, or +after a given interval in seconds. + +The latency of the disk I/O is measured from the issue to the device to its +completion. A \-Q option can be used to include time queued in the kernel. + +This tool uses in-kernel eBPF maps for storing timestamps and the histogram, +for efficiency. + +This works by tracing various kernel blk_*() functions using dynamic tracing, +and will need updating to match any changes to these functions. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +\-h +Print usage message. +.TP +\-T +Include timestamps on output. +.TP +\-m +Output histogram in milliseconds. +.TP +interval +Output interval, in seconds. +.TP +count +Number of outputs. +.SH EXAMPLES +.TP +Summarize block device I/O latency as a histogram: +# +.B biolatency +.TP +Print 1 second summaries, 10 times: +# +.B biolatency 1 10 +.TP +Print 1 second summaries, using milliseconds as units for the histogram, and +include timestamps on output: +# +.B biolatency \-mT 1 +.TP +Include OS queued time in I/O time: +# +.B biolatency \-Q +.SH FIELDS +.TP +usecs +Microsecond range +.TP +mecs +Millisecond range +.TP +count +How many I/O fell into this range +.TP +distribution +An ASCII bar chart to visualize the distribution (count column) +.SH OVERHEAD +This traces kernel functions and maintains in-kernel timestamps and a histgroam, +which are asynchronously copied to user-space. This method is very efficient, +and the overhead for most storage I/O rates (< 10k IOPS) should be negligible. +If you have a higher IOPS storage environment, test and quantify the overhead +before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +biosnoop(8) diff --git a/tools/biolatency b/tools/biolatency new file mode 100755 index 000000000000..9a6b38a8708d --- /dev/null +++ b/tools/biolatency @@ -0,0 +1,120 @@ +#!/usr/bin/python +# +# biolatency Summarize block device I/O latency as a histogram. +# For Linux, uses BCC, eBPF. +# +# USAGE: biolatency [-h] [-T] [-Q] [-m] [interval] [count] +# +# Copyright (c) 2015 Brendan Gregg. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 20-Sep-2015 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse + +# arguments +examples = """examples: + ./biolatency # summarize block I/O latency as a histogram + ./biolatency 1 10 # print 1 second summaries, 10 times + ./biolatency -mT 1 # 1s summaries, milliseconds, and timestamps + ./biolatency -Q # include OS queued time in I/O time +""" +parser = argparse.ArgumentParser( + description="Summarize block device I/O latency as a histogram", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--timestamp", action="store_true", + help="include timestamp on output") +parser.add_argument("-Q", "--queued", action="store_true", + help="include OS queued time in I/O time") +parser.add_argument("-m", "--milliseconds", action="store_true", + help="millisecond histogram") +parser.add_argument("interval", nargs="?", default=99999999, + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +countdown = int(args.count) +debug = 0 + +# load BPF program +bpf_text = """ +#include +#include + +BPF_TABLE(\"array\", int, u64, dist, 64); +BPF_HASH(start, struct request *); + +// time block I/O +int trace_req_start(struct pt_regs *ctx, struct request *req) +{ + u64 ts = bpf_ktime_get_ns(); + start.update(&req, &ts); + return 0; +} + +// output +int trace_req_completion(struct pt_regs *ctx, struct request *req) +{ + u64 *tsp, delta; + + // fetch timestamp and calculate delta + tsp = start.lookup(&req); + if (tsp == 0) { + return 0; // missed issue + } + delta = bpf_ktime_get_ns() - *tsp; + FACTOR + + // store as histogram + int index = bpf_log2l(delta); + u64 *leaf = dist.lookup(&index); + if (leaf) (*leaf)++; + + start.delete(&req); + return 0; +} +""" +if args.milliseconds: + bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;') + label = "msecs" +else: + bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;') + label = "usecs" +if debug: + print(bpf_text) + +# load BPF program +b = BPF(text=bpf_text) +if args.queued: + b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start") +else: + b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") + b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") +b.attach_kprobe(event="blk_account_io_completion", + fn_name="trace_req_completion") + +print("Tracing block device I/O... Hit Ctrl-C to end.") + +# output +exiting = 0 if args.interval else 1 +dist = b.get_table("dist") +while (1): + try: + sleep(int(args.interval)) + except KeyboardInterrupt: + exiting=1 + + print() + if args.timestamp: + print("%-8s\n" % strftime("%H:%M:%S"), end="") + + dist.print_log2_hist(label) + dist.clear() + + countdown -= 1 + if exiting or countdown == 0: + exit() diff --git a/tools/biolatency_example.txt b/tools/biolatency_example.txt new file mode 100644 index 000000000000..9431ea92931e --- /dev/null +++ b/tools/biolatency_example.txt @@ -0,0 +1,160 @@ +Demonstrations of biolatency, the Linux eBPF/bcc version. + + +biolatency traces block device I/O (disk I/O), and records the distribution +of I/O latency (time), printing this as a histogram when Ctrl-C is hit. +For example: + +# ./biolatency +Tracing block device I/O... Hit Ctrl-C to end. +^C + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 1 | | + 128 -> 255 : 12 |******** | + 256 -> 511 : 15 |********** | + 512 -> 1023 : 43 |******************************* | + 1024 -> 2047 : 52 |**************************************| + 2048 -> 4095 : 47 |********************************** | + 4096 -> 8191 : 52 |**************************************| + 8192 -> 16383 : 36 |************************** | + 16384 -> 32767 : 15 |********** | + 32768 -> 65535 : 2 |* | + 65536 -> 131071 : 2 |* | + +The latency of the disk I/O is measured from the issue to the device to its +completion. A -Q option can be used to include time queued in the kernel. + +This example output shows a large mode of latency from about 128 microseconds +to about 32767 microseconds (33 milliseconds). The bulk of the I/O was +between 1 and 8 ms, which is the expected block device latency for +rotational storage devices. + +The highest latency seen while tracing was between 65 and 131 milliseconds: +the last row printed, for which there were 2 I/O. + +For efficiency, biolatency uses an in-kernel eBPF map to store timestamps +with requests, and another in-kernel map to store the histogram (the "count") +column, which is copied to user-space only when output is printed. These +methods lower the perormance overhead when tracing is performed. + + +In the following example, the -m option is used to print a histogram using +milliseconds as the units (which eliminates the first several rows), -T to +print timestamps with the output, and to print 1 second summaries 5 times: + +# ./biolatency -mT 1 5 +Tracing block device I/O... Hit Ctrl-C to end. + +06:20:16 + msecs : count distribution + 0 -> 1 : 36 |**************************************| + 2 -> 3 : 1 |* | + 4 -> 7 : 3 |*** | + 8 -> 15 : 17 |***************** | + 16 -> 31 : 33 |********************************** | + 32 -> 63 : 7 |******* | + 64 -> 127 : 6 |****** | + +06:20:17 + msecs : count distribution + 0 -> 1 : 96 |************************************ | + 2 -> 3 : 25 |********* | + 4 -> 7 : 29 |*********** | + 8 -> 15 : 62 |*********************** | + 16 -> 31 : 100 |**************************************| + 32 -> 63 : 62 |*********************** | + 64 -> 127 : 18 |****** | + +06:20:18 + msecs : count distribution + 0 -> 1 : 68 |************************* | + 2 -> 3 : 76 |**************************** | + 4 -> 7 : 20 |******* | + 8 -> 15 : 48 |***************** | + 16 -> 31 : 103 |**************************************| + 32 -> 63 : 49 |****************** | + 64 -> 127 : 17 |****** | + +06:20:19 + msecs : count distribution + 0 -> 1 : 522 |*************************************+| + 2 -> 3 : 225 |**************** | + 4 -> 7 : 38 |** | + 8 -> 15 : 8 | | + 16 -> 31 : 1 | | + +06:20:20 + msecs : count distribution + 0 -> 1 : 436 |**************************************| + 2 -> 3 : 106 |********* | + 4 -> 7 : 34 |** | + 8 -> 15 : 19 |* | + 16 -> 31 : 1 | | + +How the I/O latency distribution changes over time can be seen. + + + +The -Q option begins measuring I/O latency from when the request was first +queued in the kernel, and includes queuing latency: + +# ./biolatency -Q +Tracing block device I/O... Hit Ctrl-C to end. +^C + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 3 |* | + 256 -> 511 : 37 |************** | + 512 -> 1023 : 30 |*********** | + 1024 -> 2047 : 18 |******* | + 2048 -> 4095 : 22 |******** | + 4096 -> 8191 : 14 |***** | + 8192 -> 16383 : 48 |******************* | + 16384 -> 32767 : 96 |**************************************| + 32768 -> 65535 : 31 |************ | + 65536 -> 131071 : 26 |********** | + 131072 -> 262143 : 12 |**** | + +This better reflects the latency suffered by the application (if it is +synchronous I/O), whereas the default mode without kernel queueing better +reflects the performance of the device. + +Note that the storage device (and storage device controller) usually have +queues of their own, which are always included in the latency, with or +without -Q. + + +USAGE message: + +# ./biolatency -h +usage: biolatency [-h] [-T] [-Q] [-m] [interval] [count] + +Summarize block device I/O latency as a histogram + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --timestamp include timestamp on output + -Q, --queued include OS queued time in I/O time + -m, --milliseconds millisecond histogram + +examples: + ./biolatency # summarize block I/O latency as a histogram + ./biolatency 1 10 # print 1 second summaries, 10 times + ./biolatency -mT 1 # 1s summaries, milliseconds, and timestamps + ./biolatency -Q # include OS queued time in I/O time