#!/usr/bin/env python # @lint-avoid-python-3-compatibility-imports # # biosnoop Trace block device I/O and print details including issuing PID. # For Linux, uses BCC, eBPF. # # This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O # request, as well as a starting timestamp for calculating I/O latency. # # Copyright (c) 2015 Brendan Gregg. # Licensed under the Apache License, Version 2.0 (the "License") # # 16-Sep-2015 Brendan Gregg Created this. # 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT # 21-Jun-2022 Rocky Xing Added disk filter support. # 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern. # 01-Aug-2023 Jerome Marchand Added support for block tracepoints from __future__ import print_function from bcc import BPF import argparse import os # arguments examples = """examples: ./biosnoop # trace all block I/O ./biosnoop -Q # include OS queued time ./biosnoop -d sdc # trace sdc only ./biosnoop -P # display block I/O pattern """ parser = argparse.ArgumentParser( description="Trace block I/O", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-Q", "--queue", action="store_true", help="include OS queued time") parser.add_argument("-d", "--disk", type=str, help="trace this disk only") parser.add_argument("-P", "--pattern", action="store_true", help="display block I/O pattern (sequential or random)") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() debug = 0 # define BPF program bpf_text = """ #include #include """ if args.pattern: bpf_text += "#define INCLUDE_PATTERN\n" bpf_text += """ // for saving the timestamp and __data_len of each request struct start_req_t { u64 ts; u64 data_len; }; struct val_t { u64 ts; u32 pid; char name[TASK_COMM_LEN]; }; struct tp_args { u64 __unused__; dev_t dev; sector_t sector; unsigned int nr_sector; unsigned int bytes; char rwbs[8]; char comm[16]; char cmd[]; }; struct hash_key { dev_t dev; u32 rwflag; sector_t sector; }; #ifdef INCLUDE_PATTERN struct sector_key_t { u32 dev_major; u32 dev_minor; }; enum bio_pattern { UNKNOWN, SEQUENTIAL, RANDOM, }; #endif struct data_t { u32 pid; u32 dev; u64 rwflag; u64 delta; u64 qdelta; u64 sector; u64 len; #ifdef INCLUDE_PATTERN enum bio_pattern pattern; #endif u64 ts; char name[TASK_COMM_LEN]; }; #ifdef INCLUDE_PATTERN BPF_HASH(last_sectors, struct sector_key_t, u64); #endif BPF_HASH(start, struct hash_key, struct start_req_t); BPF_HASH(infobyreq, struct hash_key, struct val_t); BPF_PERF_OUTPUT(events); static dev_t ddevt(struct gendisk *disk) { return (disk->major << 20) | disk->first_minor; } /* * The following deals with a kernel version change (in mainline 4.7, although * it may be backported to earlier kernels) with how block request write flags * are tested. We handle both pre- and post-change versions here. Please avoid * kernel version tests like this as much as possible: they inflate the code, * test, and maintenance burden. */ static int get_rwflag(u32 cmd_flags) { #ifdef REQ_WRITE return !!(cmd_flags & REQ_WRITE); #elif defined(REQ_OP_SHIFT) return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE); #else return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE); #endif } #define RWBS_LEN 8 static int get_rwflag_tp(char *rwbs) { for (int i = 0; i < RWBS_LEN; i++) { if (rwbs[i] == 'W') return 1; if (rwbs[i] == '\\0') return 0; } return 0; } // cache PID and comm by-req static int __trace_pid_start(struct hash_key key) { DISK_FILTER struct val_t val = {}; u64 ts; if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { val.pid = bpf_get_current_pid_tgid() >> 32; if (##QUEUE##) { val.ts = bpf_ktime_get_ns(); } infobyreq.update(&key, &val); } return 0; } int trace_pid_start(struct pt_regs *ctx, struct request *req) { struct hash_key key = { .dev = ddevt(req->__RQ_DISK__), .rwflag = get_rwflag(req->cmd_flags), .sector = req->__sector }; return __trace_pid_start(key); } int trace_pid_start_tp(struct tp_args *args) { struct hash_key key = { .dev = args->dev, .rwflag = get_rwflag_tp(args->rwbs), .sector = args->sector }; return __trace_pid_start(key); } // time block I/O int trace_req_start(struct pt_regs *ctx, struct request *req) { struct hash_key key = { .dev = ddevt(req->__RQ_DISK__), .rwflag = get_rwflag(req->cmd_flags), .sector = req->__sector }; DISK_FILTER struct start_req_t start_req = { .ts = bpf_ktime_get_ns(), .data_len = req->__data_len }; start.update(&key, &start_req); return 0; } // output static int __trace_req_completion(void *ctx, struct hash_key key) { struct start_req_t *startp; struct val_t *valp; struct data_t data = {}; //struct gendisk *rq_disk; u64 ts; // fetch timestamp and calculate delta startp = start.lookup(&key); if (startp == 0) { // missed tracing issue return 0; } ts = bpf_ktime_get_ns(); //rq_disk = req->__RQ_DISK__; data.delta = ts - startp->ts; data.ts = ts / 1000; data.qdelta = 0; data.len = startp->data_len; valp = infobyreq.lookup(&key); if (valp == 0) { data.name[0] = '?'; data.name[1] = 0; } else { if (##QUEUE##) { data.qdelta = startp->ts - valp->ts; } data.pid = valp->pid; data.sector = key.sector; data.dev = key.dev; bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name); } #ifdef INCLUDE_PATTERN data.pattern = UNKNOWN; u64 *sector, last_sector; struct sector_key_t sector_key = { .dev_major = key.dev >> 20, .dev_minor = key.dev & ((1 << 20) - 1) }; sector = last_sectors.lookup(§or_key); if (sector != 0) { data.pattern = req->__sector == *sector ? SEQUENTIAL : RANDOM; } last_sector = req->__sector + data.len / 512; last_sectors.update(§or_key, &last_sector); #endif data.rwflag = key.rwflag; events.perf_submit(ctx, &data, sizeof(data)); start.delete(&key); infobyreq.delete(&key); return 0; } int trace_req_completion(struct pt_regs *ctx, struct request *req) { struct hash_key key = { .dev = ddevt(req->__RQ_DISK__), .rwflag = get_rwflag(req->cmd_flags), .sector = req->__sector }; return __trace_req_completion(ctx, key); } int trace_req_completion_tp(struct tp_args *args) { struct hash_key key = { .dev = args->dev, .rwflag = get_rwflag_tp(args->rwbs), .sector = args->sector }; return __trace_req_completion(args, key); } """ if args.queue: bpf_text = bpf_text.replace('##QUEUE##', '1') else: bpf_text = bpf_text.replace('##QUEUE##', '0') if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1: bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk') else: bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk') if args.disk is not None: disk_path = os.path.join('/dev', args.disk) if not os.path.exists(disk_path): print("no such disk '%s'" % args.disk) exit(1) stat_info = os.stat(disk_path) dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev) disk_filter_str = """ if(key.dev != %s) { return 0; } """ % (dev) bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str) else: bpf_text = bpf_text.replace('DISK_FILTER', '') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) if BPF.get_kprobe_functions(b'__blk_account_io_start'): b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start") elif BPF.get_kprobe_functions(b'blk_account_io_start'): b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start") else: b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp") if BPF.get_kprobe_functions(b'blk_start_request'): b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") if BPF.get_kprobe_functions(b'__blk_account_io_done'): b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion") elif BPF.get_kprobe_functions(b'blk_account_io_done'): b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion") else: b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp") # header print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID", "DISK", "T", "SECTOR", "BYTES"), end="") if args.pattern: print("%-1s " % ("P"), end="") if args.queue: print("%7s " % ("QUE(ms)"), end="") print("%7s" % "LAT(ms)") # cache disk major,minor -> diskname diskstats = "/proc/diskstats" disklookup = {} with open(diskstats) as stats: for line in stats: a = line.split() disklookup[a[0] + "," + a[1]] = a[2] def disk_print(d): major = d >> 20 minor = d & ((1 << 20) - 1) disk = str(major) + "," + str(minor) if disk in disklookup: diskname = disklookup[disk] else: diskname = "" return diskname rwflg = "" pattern = "" start_ts = 0 prev_ts = 0 delta = 0 P_SEQUENTIAL = 1 P_RANDOM = 2 # process event def print_event(cpu, data, size): event = b["events"].event(data) global start_ts if start_ts == 0: start_ts = event.ts if event.rwflag == 1: rwflg = "W" else: rwflg = "R" delta = float(event.ts) - start_ts disk_name = disk_print(event.dev) print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % ( delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid, disk_name, rwflg, event.sector, event.len), end="") if args.pattern: if event.pattern == P_SEQUENTIAL: pattern = "S" elif event.pattern == P_RANDOM: pattern = "R" else: pattern = "?" print("%-1s " % pattern, end="") if args.queue: print("%7.2f " % (float(event.qdelta) / 1000000), end="") print("%7.2f" % (float(event.delta) / 1000000)) # loop with callback to print_event b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: try: b.perf_buffer_poll() except KeyboardInterrupt: exit()