Skip to content

Commit

Permalink
biosnoop: add -Q for queued time (iovisor#2279)
Browse files Browse the repository at this point in the history
* biosnoop: add -Q for queued time

* biotop: fix PID
  • Loading branch information
brendangregg authored and yonghong-song committed Mar 20, 2019
1 parent 2d182dc commit eb32c15
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 50 deletions.
15 changes: 13 additions & 2 deletions man/man8/biosnoop.8
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
.SH NAME
biosnoop \- Trace block device I/O and print details incl. issuing PID.
.SH SYNOPSIS
.B biosnoop
.B biosnoop [\-hQ]
.SH DESCRIPTION
This tools traces block device I/O (disk I/O), and prints a one-line summary
for each I/O showing various details. These include the latency from the time of
Expand All @@ -22,6 +22,13 @@ which uses an older mechanism
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
.TP
\-h
Print usage message.
.TP
\-Q
Include a column showing the time spent quueued in the OS.
.SH EXAMPLES
.TP
Trace all block device I/O and print a summary line per I/O:
Expand All @@ -30,7 +37,7 @@ Trace all block device I/O and print a summary line per I/O:
.SH FIELDS
.TP
TIME(s)
Time of the I/O, in seconds since the first I/O was seen.
Time of the I/O completion, in seconds since the first I/O was seen.
.TP
COMM
Cached process name, if present. This usually (but isn't guaranteed) to identify
Expand All @@ -52,6 +59,10 @@ Device sector for the I/O.
BYTES
Size of the I/O, in bytes.
.TP
QUE(ms)
Time the I/O was queued in the OS before being issued to the device,
in milliseconds.
.TP
LAT(ms)
Time for the I/O (latency) from the issue to the device, to its completion,
in milliseconds.
Expand Down
91 changes: 60 additions & 31 deletions tools/biosnoop.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# @lint-avoid-python-3-compatibility-imports
#
# biosnoop Trace block device I/O and print details including issuing PID.
# For Linux, uses BCC, eBPF.
# For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
Expand All @@ -16,13 +16,31 @@
from __future__ import print_function
from bcc import BPF
import re

# load BPF program
b = BPF(text="""
import argparse

# arguments
examples = """examples:
./biosnoop # trace all block I/O
./biosnoop -Q # include OS queued time
"""
parser = argparse.ArgumentParser(
description="Trace block I/O",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-Q", "--queue", action="store_true",
help="include OS queued time")
parser.add_argument("--ebpf", action="store_true",
help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0

# define BPF program
bpf_text="""
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
struct val_t {
u64 ts;
u32 pid;
char name[TASK_COMM_LEN];
};
Expand All @@ -31,6 +49,7 @@
u32 pid;
u64 rwflag;
u64 delta;
u64 qdelta;
u64 sector;
u64 len;
u64 ts;
Expand All @@ -46,9 +65,13 @@
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
struct val_t val = {};
u64 ts;
if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
val.pid = bpf_get_current_pid_tgid();
val.pid = bpf_get_current_pid_tgid() >> 32;
if (##QUEUE##) {
val.ts = bpf_ktime_get_ns();
}
infobyreq.update(&req, &val);
}
return 0;
Expand All @@ -58,18 +81,15 @@
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
u64 ts;
ts = bpf_ktime_get_ns();
start.update(&req, &ts);
return 0;
}
// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
u64 *tsp, delta;
u32 *pidp = 0;
u64 *tsp;
struct val_t *valp;
struct data_t data = {};
u64 ts;
Expand All @@ -83,12 +103,16 @@
ts = bpf_ktime_get_ns();
data.delta = ts - *tsp;
data.ts = ts / 1000;
data.qdelta = 0;
valp = infobyreq.lookup(&req);
if (valp == 0) {
data.len = req->__data_len;
strcpy(data.name, "?");
} else {
if (##QUEUE##) {
data.qdelta = *tsp - valp->ts;
}
data.pid = valp->pid;
data.len = req->__data_len;
data.sector = req->__sector;
Expand Down Expand Up @@ -119,7 +143,18 @@
return 0;
}
""", debug=0)
"""
if args.queue:
bpf_text = bpf_text.replace('##QUEUE##', '1')
else:
bpf_text = bpf_text.replace('##QUEUE##', '0')
if debug or args.ebpf:
print(bpf_text)
if args.ebpf:
exit()

# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
Expand All @@ -128,8 +163,11 @@
fn_name="trace_req_completion")

# header
print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
"DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
print("%-11s %-14s %-6s %-7s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
"DISK", "T", "SECTOR", "BYTES"), end="")
if args.queue:
print("%7s " % ("QUE(ms)"), end="")
print("%7s" % "LAT(ms)")

rwflg = ""
start_ts = 0
Expand All @@ -140,33 +178,24 @@
def print_event(cpu, data, size):
event = b["events"].event(data)

val = -1
global start_ts
global prev_ts
global delta
if start_ts == 0:
start_ts = event.ts

if event.rwflag == 1:
rwflg = "W"

if event.rwflag == 0:
else:
rwflg = "R"

if not re.match(b'\?', event.name):
val = event.sector

if start_ts == 0:
prev_ts = start_ts

if start_ts == 1:
delta = float(delta) + (event.ts - prev_ts)
delta = float(event.ts) - start_ts

print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
print("%-11.6f %-14.14s %-6s %-7s %-1s %-10s %-7s" % (
delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
event.disk_name.decode('utf-8', 'replace'), rwflg, val,
event.len, float(event.delta) / 1000000))

prev_ts = event.ts
start_ts = 1
event.disk_name.decode('utf-8', 'replace'), rwflg, event.sector,
event.len), end="")
if args.queue:
print("%7.2f " % (float(event.qdelta) / 1000000), end="")
print("%7.2f" % (float(event.delta) / 1000000))

# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
Expand Down
60 changes: 44 additions & 16 deletions tools/biosnoop_example.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,22 @@ biosnoop traces block device I/O (disk I/O), and prints a line of output
per I/O. Example:

# ./biosnoop
TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
0.000004001 supervise 1950 xvda1 W 13092560 4096 0.74
0.000178002 supervise 1950 xvda1 W 13092432 4096 0.61
0.001469001 supervise 1956 xvda1 W 13092440 4096 1.24
0.001588002 supervise 1956 xvda1 W 13115128 4096 1.09
1.022346001 supervise 1950 xvda1 W 13115272 4096 0.98
1.022568002 supervise 1950 xvda1 W 13188496 4096 0.93
1.023534000 supervise 1956 xvda1 W 13188520 4096 0.79
1.023585003 supervise 1956 xvda1 W 13189512 4096 0.60
2.003920000 xfsaild/md0 456 xvdc W 62901512 8192 0.23
2.003931001 xfsaild/md0 456 xvdb W 62901513 512 0.25
2.004034001 xfsaild/md0 456 xvdb W 62901520 8192 0.35
2.004042000 xfsaild/md0 456 xvdb W 63542016 4096 0.36
2.004204001 kworker/0:3 26040 xvdb W 41950344 65536 0.34
2.044352002 supervise 1950 xvda1 W 13192672 4096 0.65
2.044574000 supervise 1950 xvda1 W 13189072 4096 0.58
TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
0.000004 supervise 1950 xvda1 W 13092560 4096 0.74
0.000178 supervise 1950 xvda1 W 13092432 4096 0.61
0.001469 supervise 1956 xvda1 W 13092440 4096 1.24
0.001588 supervise 1956 xvda1 W 13115128 4096 1.09
1.022346 supervise 1950 xvda1 W 13115272 4096 0.98
1.022568 supervise 1950 xvda1 W 13188496 4096 0.93
1.023534 supervise 1956 xvda1 W 13188520 4096 0.79
1.023585 supervise 1956 xvda1 W 13189512 4096 0.60
2.003920 xfsaild/md0 456 xvdc W 62901512 8192 0.23
2.003931 xfsaild/md0 456 xvdb W 62901513 512 0.25
2.004034 xfsaild/md0 456 xvdb W 62901520 8192 0.35
2.004042 xfsaild/md0 456 xvdb W 63542016 4096 0.36
2.004204 kworker/0:3 26040 xvdb W 41950344 65536 0.34
2.044352 supervise 1950 xvda1 W 13192672 4096 0.65
2.044574 supervise 1950 xvda1 W 13189072 4096 0.58

This includes the PID and comm (process name) that were on-CPU at the time of
issue (which usually means the process responsible).
Expand All @@ -47,3 +47,31 @@ There are 4 write IOPS.
The output of biosnoop identifies the reason: multiple supervise processes are
issuing writes to the xvda1 disk. I can now drill down on supervise using other
tools to understand its file system workload.


The -Q option includes a column to show the time spent queued in the OS:

# biosnoop.py -Q
TIME(s) COMM PID DISK T SECTOR BYTES QUE(ms) LAT(ms)
0.000000 kworker/u72:1 13379 nvme1n1 W 1142400 4096 0.01 0.55
0.000771 sync 22177 nvme1n1 W 41963894 3072 0.11 0.47
5.332998 xfsaild/nvme1n 1061 nvme1n1 W 545728 16384 0.01 0.61
5.333044 xfsaild/nvme1n 1061 nvme1n1 W 2349728 16384 0.02 0.64
5.333065 xfsaild/nvme1n 1061 nvme1n1 W 20971521 512 0.02 0.65
5.333067 xfsaild/nvme1n 1061 nvme1n1 W 20971528 8192 0.00 0.65
[...]


USAGE message:

usage: biosnoop.py [-h] [-Q]

Trace block I/O

optional arguments:
-h, --help show this help message and exit
-Q, --queue include OS queued time

examples:
./biosnoop # trace all block I/O
./biosnoop -Q # include OS queued time
2 changes: 1 addition & 1 deletion tools/biotop.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def signal_ignore(signal_value, frame):
struct who_t who = {};
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
who.pid = bpf_get_current_pid_tgid();
who.pid = bpf_get_current_pid_tgid() >> 32;
whobyreq.update(&req, &who);
}
Expand Down

0 comments on commit eb32c15

Please sign in to comment.