// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) // Copyright (c) 2020 Wenbo Zhang // // Based on biosnoop(8) from BCC by Brendan Gregg. // 29-Jun-2020 Wenbo Zhang Created this. #include #include #include #include #include #include #include #include #include #include "blk_types.h" #include "biosnoop.h" #include "biosnoop.skel.h" #include "trace_helpers.h" #define PERF_BUFFER_PAGES 16 #define PERF_POLL_TIMEOUT_MS 100 static volatile sig_atomic_t exiting = 0; static struct env { char *disk; int duration; bool timestamp; bool queued; bool verbose; char *cgroupspath; bool cg; } env = {}; static volatile __u64 start_ts; const char *argp_program_version = "biosnoop 0.1"; const char *argp_program_bug_address = "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; const char argp_program_doc[] = "Trace block I/O.\n" "\n" "USAGE: biosnoop [--help] [-d DISK] [-c CG] [-Q]\n" "\n" "EXAMPLES:\n" " biosnoop # trace all block I/O\n" " biosnoop -Q # include OS queued time in I/O time\n" " biosnoop 10 # trace for 10 seconds only\n" " biosnoop -d sdc # trace sdc only\n" " biosnoop -c CG # Trace process under cgroupsPath CG\n"; static const struct argp_option opts[] = { { "queued", 'Q', NULL, 0, "Include OS queued time in I/O time" }, { "disk", 'd', "DISK", 0, "Trace this disk only" }, { "verbose", 'v', NULL, 0, "Verbose debug output" }, { "cgroup", 'c', "/sys/fs/cgroup/unified/CG", 0, "Trace process in cgroup path"}, { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, {}, }; static error_t parse_arg(int key, char *arg, struct argp_state *state) { static int pos_args; switch (key) { case 'h': argp_state_help(state, stderr, ARGP_HELP_STD_HELP); break; case 'v': env.verbose = true; break; case 'Q': env.queued = true; break; case 'c': env.cg = true; env.cgroupspath = arg; break; case 'd': env.disk = arg; if (strlen(arg) + 1 > DISK_NAME_LEN) { fprintf(stderr, "invaild disk name: too long\n"); argp_usage(state); } break; case ARGP_KEY_ARG: if (pos_args++) { fprintf(stderr, "unrecognized positional argument: %s\n", arg); argp_usage(state); } errno = 0; env.duration = strtoll(arg, NULL, 10); if (errno || env.duration <= 0) { fprintf(stderr, "invalid delay (in us): %s\n", arg); argp_usage(state); } break; default: return ARGP_ERR_UNKNOWN; } return 0; } static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) { if (level == LIBBPF_DEBUG && !env.verbose) return 0; return vfprintf(stderr, format, args); } static void sig_int(int signo) { exiting = 1; } static void blk_fill_rwbs(char *rwbs, unsigned int op) { int i = 0; if (op & REQ_PREFLUSH) rwbs[i++] = 'F'; switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; break; case REQ_OP_DISCARD: rwbs[i++] = 'D'; break; case REQ_OP_SECURE_ERASE: rwbs[i++] = 'D'; rwbs[i++] = 'E'; break; case REQ_OP_FLUSH: rwbs[i++] = 'F'; break; case REQ_OP_READ: rwbs[i++] = 'R'; break; default: rwbs[i++] = 'N'; } if (op & REQ_FUA) rwbs[i++] = 'F'; if (op & REQ_RAHEAD) rwbs[i++] = 'A'; if (op & REQ_SYNC) rwbs[i++] = 'S'; if (op & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; } static struct partitions *partitions; void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) { const struct partition *partition; const struct event *e = data; char rwbs[RWBS_LEN]; if (!start_ts) start_ts = e->ts; blk_fill_rwbs(rwbs, e->cmd_flags); partition = partitions__get_by_dev(partitions, e->dev); printf("%-11.6f %-14.14s %-7d %-7s %-4s %-10lld %-7d ", (e->ts - start_ts) / 1000000000.0, e->comm, e->pid, partition ? partition->name : "Unknown", rwbs, e->sector, e->len); if (env.queued) printf("%7.3f ", e->qdelta != -1 ? e->qdelta / 1000000.0 : -1); printf("%7.3f\n", e->delta / 1000000.0); } void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) { fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu); } int main(int argc, char **argv) { const struct partition *partition; static const struct argp argp = { .options = opts, .parser = parse_arg, .doc = argp_program_doc, }; struct perf_buffer *pb = NULL; struct ksyms *ksyms = NULL; struct biosnoop_bpf *obj; __u64 time_end = 0; int err; int idx, cg_map_fd; int cgfd = -1; err = argp_parse(&argp, argc, argv, 0, NULL, NULL); if (err) return err; libbpf_set_strict_mode(LIBBPF_STRICT_ALL); libbpf_set_print(libbpf_print_fn); obj = biosnoop_bpf__open(); if (!obj) { fprintf(stderr, "failed to open BPF object\n"); return 1; } partitions = partitions__load(); if (!partitions) { fprintf(stderr, "failed to load partitions info\n"); goto cleanup; } /* initialize global data (filtering options) */ if (env.disk) { partition = partitions__get_by_name(partitions, env.disk); if (!partition) { fprintf(stderr, "invaild partition name: not exist\n"); goto cleanup; } obj->rodata->filter_dev = true; obj->rodata->targ_dev = partition->dev; } obj->rodata->targ_queued = env.queued; obj->rodata->filter_cg = env.cg; if (fentry_can_attach("blk_account_io_start", NULL)) bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, "blk_account_io_start"); else bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, "__blk_account_io_start"); err = biosnoop_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); goto cleanup; } /* update cgroup path fd to map */ if (env.cg) { idx = 0; cg_map_fd = bpf_map__fd(obj->maps.cgroup_map); cgfd = open(env.cgroupspath, O_RDONLY); if (cgfd < 0) { fprintf(stderr, "Failed opening Cgroup path: %s\n", env.cgroupspath); goto cleanup; } if (bpf_map_update_elem(cg_map_fd, &idx, &cgfd, BPF_ANY)) { fprintf(stderr, "Failed adding target cgroup to map\n"); goto cleanup; } } obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start); if (!obj->links.blk_account_io_start) { err = -errno; fprintf(stderr, "failed to attach blk_account_io_start: %s\n", strerror(-err)); goto cleanup; } ksyms = ksyms__load(); if (!ksyms) { err = -ENOMEM; fprintf(stderr, "failed to load kallsyms\n"); goto cleanup; } if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) { obj->links.blk_account_io_merge_bio = bpf_program__attach(obj->progs.blk_account_io_merge_bio); if (!obj->links.blk_account_io_merge_bio) { err = -errno; fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n", strerror(-err)); goto cleanup; } } if (env.queued) { obj->links.block_rq_insert = bpf_program__attach(obj->progs.block_rq_insert); if (!obj->links.block_rq_insert) { err = -errno; fprintf(stderr, "failed to attach block_rq_insert: %s\n", strerror(-err)); goto cleanup; } } obj->links.block_rq_issue = bpf_program__attach(obj->progs.block_rq_issue); if (!obj->links.block_rq_issue) { err = -errno; fprintf(stderr, "failed to attach block_rq_issue: %s\n", strerror(-err)); goto cleanup; } obj->links.block_rq_complete = bpf_program__attach(obj->progs.block_rq_complete); if (!obj->links.block_rq_complete) { err = -errno; fprintf(stderr, "failed to attach block_rq_complete: %s\n", strerror(-err)); goto cleanup; } pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, handle_event, handle_lost_events, NULL, NULL); if (!pb) { err = -errno; fprintf(stderr, "failed to open perf buffer: %d\n", err); goto cleanup; } printf("%-11s %-14s %-7s %-7s %-4s %-10s %-7s ", "TIME(s)", "COMM", "PID", "DISK", "T", "SECTOR", "BYTES"); if (env.queued) printf("%7s ", "QUE(ms)"); printf("%7s\n", "LAT(ms)"); /* setup duration */ if (env.duration) time_end = get_ktime_ns() + env.duration * NSEC_PER_SEC; if (signal(SIGINT, sig_int) == SIG_ERR) { fprintf(stderr, "can't set signal handler: %s\n", strerror(errno)); err = 1; goto cleanup; } /* main: poll */ while (!exiting) { err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); if (err < 0 && err != -EINTR) { fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err)); goto cleanup; } if (env.duration && get_ktime_ns() > time_end) goto cleanup; /* reset err to return 0 if exiting */ err = 0; } cleanup: perf_buffer__free(pb); biosnoop_bpf__destroy(obj); ksyms__free(ksyms); partitions__free(partitions); if (cgfd > 0) close(cgfd); return err != 0; }