Skip to content

Commit

Permalink
examples/tracing: add io_latencies - IO latency distribution monitor
Browse files Browse the repository at this point in the history
This adds flexible low overhead latency percentile monitoring for block
devices which can be useful in understanding and debugging IO related
issues.

Signed-off-by: Tejun Heo <[email protected]>
  • Loading branch information
htejun authored and yonghong-song committed Apr 6, 2020
1 parent 007ee49 commit 5057bbe
Showing 1 changed file with 249 additions and 0 deletions.
249 changes: 249 additions & 0 deletions examples/tracing/io_latencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
#!/usr/bin/python3
#
# io_latencies.py Monitor IO latency distribution of a block device.
#
# $ ./io_latencies.py 259:0
# 259:0 p1 p5 p10 p16 p25 p50 p75 p84 p90 p95 p99 p100
# read 95us 175us 305us 515us 895us 985us 995us 1.5ms 2.5ms 3.5ms 4.5ms 10ms
# write 5us 5us 5us 15us 25us 135us 765us 855us 885us 895us 965us 1.5ms
# discard 5us 5us 5us 5us 135us 145us 165us 205us 385us 875us 1.5ms 2.5ms
# flush 5us 5us 5us 5us 5us 5us 5us 5us 5us 1.5ms 4.5ms 5.5ms
#
# Copyright (C) 2020 Tejun Heo <[email protected]>
# Copyright (C) 2020 Facebook

from __future__ import print_function
from bcc import BPF
from time import sleep
import argparse
import json
import sys

description = """
Monitor IO latency distribution of a block device
"""

parser = argparse.ArgumentParser(description = description,
formatter_class = argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('devno', metavar='MAJ:MIN', type=str,
help='Target block device number')
parser.add_argument('-i', '--interval', type=int, default=3,
help='Report interval')
parser.add_argument('-w', '--which', choices=['from-rq-alloc', 'after-rq-alloc', 'on-device'],
default='on-device', help='Which latency to measure')
parser.add_argument('-p', '--pcts', metavar='PCT', type=float, nargs='+',
default=[1, 5, 10, 16, 25, 50, 75, 84, 90, 95, 99, 100],
help='Percentiles to calculate')
parser.add_argument('-j', '--json', action='store_true',
help='Output in json')
parser.add_argument('--verbose', '-v', action='count', default = 0)

bpf_source = """
#include <linux/blk_types.h>
#include <linux/blkdev.h>
#include <linux/time64.h>
BPF_PERCPU_ARRAY(rwdf_100ms, u64, 400);
BPF_PERCPU_ARRAY(rwdf_1ms, u64, 400);
BPF_PERCPU_ARRAY(rwdf_10us, u64, 400);
void kprobe_blk_account_io_done(struct pt_regs *ctx, struct request *rq, u64 now)
{
unsigned int cmd_flags;
u64 dur;
size_t base, slot;
if (!rq->__START_TIME_FIELD__)
return;
if (!rq->rq_disk ||
rq->rq_disk->major != __MAJOR__ ||
rq->rq_disk->first_minor != __MINOR__)
return;
cmd_flags = rq->cmd_flags;
switch (cmd_flags & REQ_OP_MASK) {
case REQ_OP_READ:
base = 0;
break;
case REQ_OP_WRITE:
base = 100;
break;
case REQ_OP_DISCARD:
base = 200;
break;
case REQ_OP_FLUSH:
base = 300;
break;
default:
return;
}
dur = now - rq->__START_TIME_FIELD__;
slot = min_t(size_t, div_u64(dur, 100 * NSEC_PER_MSEC), 99);
rwdf_100ms.increment(base + slot);
if (slot)
return;
slot = min_t(size_t, div_u64(dur, NSEC_PER_MSEC), 99);
rwdf_1ms.increment(base + slot);
if (slot)
return;
slot = min_t(size_t, div_u64(dur, 10 * NSEC_PER_USEC), 99);
rwdf_10us.increment(base + slot);
}
"""

args = parser.parse_args()
args.pcts.sort()

if args.which == 'from-rq-alloc':
start_time_field = 'alloc_time_ns'
elif args.which == 'after-rq-alloc':
start_time_field = 'start_time_ns'
elif args.which == 'on-device':
start_time_field = 'io_start_time_ns'
else:
die()

bpf_source = bpf_source.replace('__START_TIME_FIELD__', start_time_field)
bpf_source = bpf_source.replace('__MAJOR__', str(int(args.devno.split(':')[0])))
bpf_source = bpf_source.replace('__MINOR__', str(int(args.devno.split(':')[1])))

bpf = BPF(text=bpf_source)
bpf.attach_kprobe(event="blk_account_io_done", fn_name="kprobe_blk_account_io_done")

# times are in usecs
MSEC = 1000
SEC = 1000 * 1000

cur_rwdf_100ms = bpf["rwdf_100ms"]
cur_rwdf_1ms = bpf["rwdf_1ms"]
cur_rwdf_10us = bpf["rwdf_10us"]

last_rwdf_100ms = [0] * 400
last_rwdf_1ms = [0] * 400
last_rwdf_10us = [0] * 400

rwdf_100ms = [0] * 400
rwdf_1ms = [0] * 400
rwdf_10us = [0] * 400

io_type = ["read", "write", "discard", "flush"]

def find_pct(req, total, slots, idx, counted):
while idx > 0:
idx -= 1
if slots[idx] > 0:
counted += slots[idx]
if args.verbose > 1:
print('idx={} counted={} pct={:.1f} req={}'
.format(idx, counted, counted / total, req))
if (counted / total) * 100 >= 100 - req:
break
return (idx, counted)

def calc_lat_pct(req_pcts, total, lat_100ms, lat_1ms, lat_10us):
pcts = [0] * len(req_pcts)

if total == 0:
return pcts

data = [(100 * MSEC, lat_100ms), (MSEC, lat_1ms), (10, lat_10us)]
data_sel = 0
idx = 100
counted = 0

for pct_idx in reversed(range(len(req_pcts))):
req = req_pcts[pct_idx]
while True:
last_counted = counted
(gran, slots) = data[data_sel]
(idx, counted) = find_pct(req, total, slots, idx, counted)
if args.verbose > 1:
print('pct_idx={} req={} gran={} idx={} counted={} total={}'
.format(pct_idx, req, gran, idx, counted, total))
if idx > 0 or data_sel == len(data) - 1:
break
counted = last_counted
data_sel += 1
idx = 100

pcts[pct_idx] = gran * idx + gran / 2

return pcts

def format_usec(lat):
if lat > SEC:
return '{:.1f}s'.format(lat / SEC)
elif lat > 10 * MSEC:
return '{:.0f}ms'.format(lat / MSEC)
elif lat > MSEC:
return '{:.1f}ms'.format(lat / MSEC)
elif lat > 0:
return '{:.0f}us'.format(lat)
else:
return '-'

# 0 interval can be used to test whether this script would run successfully.
if args.interval == 0:
sys.exit(0)

while True:
sleep(args.interval)

rwdf_total = [0] * 4;

for i in range(400):
v = cur_rwdf_100ms.sum(i).value
rwdf_100ms[i] = max(v - last_rwdf_100ms[i], 0)
last_rwdf_100ms[i] = v

v = cur_rwdf_1ms.sum(i).value
rwdf_1ms[i] = max(v - last_rwdf_1ms[i], 0)
last_rwdf_1ms[i] = v

v = cur_rwdf_10us.sum(i).value
rwdf_10us[i] = max(v - last_rwdf_10us[i], 0)
last_rwdf_10us[i] = v

rwdf_total[int(i / 100)] += rwdf_100ms[i]

rwdf_lat = []
for i in range(4):
left = i * 100
right = left + 100
rwdf_lat.append(
calc_lat_pct(args.pcts, rwdf_total[i],
rwdf_100ms[left:right],
rwdf_1ms[left:right],
rwdf_10us[left:right]))

if args.verbose:
print('{:7} 100ms {}'.format(io_type[i], rwdf_100ms[left:right]))
print('{:7} 1ms {}'.format(io_type[i], rwdf_1ms[left:right]))
print('{:7} 10us {}'.format(io_type[i], rwdf_10us[left:right]))

if args.json:
result = {}
for iot in range(4):
lats = {}
for pi in range(len(args.pcts)):
lats[args.pcts[pi]] = rwdf_lat[iot][pi] / SEC
result[io_type[iot]] = lats
print(json.dumps(result), flush=True)
else:
print('\n{:<7}'.format(args.devno), end='')
widths = []
for pct in args.pcts:
pct_str = 'p{:g}'.format(pct)
widths.append(max(len(pct_str), 5))
print(' {:>5}'.format(pct_str), end='')
print()
for iot in range(4):
print('{:7}'.format(io_type[iot]), end='')
for pi in range(len(rwdf_lat[iot])):
print(' {:>{}}'.format(format_usec(rwdf_lat[iot][pi]), widths[pi]), end='')
print()

0 comments on commit 5057bbe

Please sign in to comment.