diff --git a/README.md b/README.md index 5420a276d009..c5f0dff0fa25 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ Tools: - tools/[gethostlatency](tools/gethostlatency.py): Show latency for getaddrinfo/gethostbyname[2] calls. [Examples](tools/gethostlatency_example.txt). - tools/[hardirqs](tools/hardirqs.py): Measure hard IRQ (hard interrupt) event time. [Examples](tools/hardirqs_example.txt). - tools/[killsnoop](tools/killsnoop.py): Trace signals issued by the kill() syscall. [Examples](tools/killsnoop_example.txt). +- tools/[mdflush](tools/mdflush.py): Trace md flush events. [Examples](tools/mdflush.txt). - tools/[memleak](tools/memleak.py): Display outstanding memory allocations to find memory leaks. [Examples](tools/memleak_examples.txt). - tools/[offcputime](tools/offcputime.py): Summarize off-CPU time by kernel stack trace. [Examples](tools/offcputime_example.txt). - tools/[offwaketime](tools/offwaketime.py): Summarize blocked time by kernel off-CPU stack and waker stack. [Examples](tools/offwaketime_example.txt). diff --git a/man/man8/mdflush.8 b/man/man8/mdflush.8 new file mode 100644 index 000000000000..9d10ca870969 --- /dev/null +++ b/man/man8/mdflush.8 @@ -0,0 +1,57 @@ +.TH pidpersec 8 "2016-02-13" "USER COMMANDS" +.SH NAME +mdflush \- Trace md flush events. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B mdflush +.SH DESCRIPTION +This tool traces flush events by md, the Linux multiple device driver +(software RAID). The timestamp and md device for the flush are printed. +Knowing when these flushes happen can be useful for correlation with +unexplained spikes in disk latency. + +This works by tracing the kernel md_flush_request() function using dynamic +tracing, and will need updating to match any changes to this function. + +Note that the flushes themselves are likely to originate from higher in the +I/O stack, such as from the file systems. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH EXAMPLES +.TP +Trace md flush events: +# +.B mdflush +.SH FIELDS +.TP +TIME +Time of the flush event (HH:MM:SS). +.TP +PID +The process ID that was on-CPU when the event was issued. This may identify +the cause of the flush (eg, the "sync" command), but will often identify a +kernel worker thread that was managing I/O. +.TP +COMM +The command name for the PID. +.TP +DEVICE +The md device name. +.SH OVERHEAD +Expected to be negligible. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +biosnoop(8) diff --git a/tools/mdflush.py b/tools/mdflush.py new file mode 100755 index 000000000000..955324a04698 --- /dev/null +++ b/tools/mdflush.py @@ -0,0 +1,68 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# mdflush Trace md flush events. +# For Linux, uses BCC, eBPF. +# +# Todo: add more details of the flush (latency, I/O count). +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 13-Feb-2015 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import strftime +import ctypes as ct + +# load BPF program +b = BPF(text=""" +#include +#include +#include + +struct data_t { + u64 pid; + char comm[TASK_COMM_LEN]; + char disk[DISK_NAME_LEN]; +}; +BPF_PERF_OUTPUT(events); + +int kprobe__md_flush_request(struct pt_regs *ctx, void *mddev, struct bio *bio) +{ + struct data_t data = {}; + u32 pid = bpf_get_current_pid_tgid(); + data.pid = pid; + bpf_get_current_comm(&data.comm, sizeof(data.comm)); + bpf_probe_read(&data.disk, sizeof(data.disk), + bio->bi_bdev->bd_disk->disk_name); + events.perf_submit(ctx, &data, sizeof(data)); + return 0; +} +""") + +# event data +TASK_COMM_LEN = 16 # linux/sched.h +DISK_NAME_LEN = 32 # linux/genhd.h +class Data(ct.Structure): + _fields_ = [ + ("pid", ct.c_ulonglong), + ("comm", ct.c_char * TASK_COMM_LEN), + ("disk", ct.c_char * DISK_NAME_LEN) + ] + +# header +print("Tracing md flush requests... Hit Ctrl-C to end.") +print("%-8s %-6s %-16s %s" % ("TIME", "PID", "COMM", "DEVICE")) + +# process event +def print_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data)).contents + print("%-8s %-6d %-16s %s" % (strftime("%H:%M:%S"), event.pid, event.comm, + event.disk)) + +# read events +b["events"].open_perf_buffer(print_event) +while 1: + b.kprobe_poll() diff --git a/tools/mdflush_example.txt b/tools/mdflush_example.txt new file mode 100644 index 000000000000..36d6d7e13a3a --- /dev/null +++ b/tools/mdflush_example.txt @@ -0,0 +1,47 @@ +Demonstrations of mdflush, the Linux eBPF/bcc version. + + +The mdflush tool traces flushes at the md driver level, and prints details +including the time of the flush: + +# ./mdflush +Tracing md flush requests... Hit Ctrl-C to end. +TIME PID COMM DEVICE +03:13:49 16770 sync md0 +03:14:08 16864 sync md0 +03:14:49 496 kworker/1:0H md0 +03:14:49 488 xfsaild/md0 md0 +03:14:54 488 xfsaild/md0 md0 +03:15:00 488 xfsaild/md0 md0 +03:15:02 85 kswapd0 md0 +03:15:02 488 xfsaild/md0 md0 +03:15:05 488 xfsaild/md0 md0 +03:15:08 488 xfsaild/md0 md0 +03:15:10 488 xfsaild/md0 md0 +03:15:11 488 xfsaild/md0 md0 +03:15:11 488 xfsaild/md0 md0 +03:15:11 488 xfsaild/md0 md0 +03:15:11 488 xfsaild/md0 md0 +03:15:11 488 xfsaild/md0 md0 +03:15:12 488 xfsaild/md0 md0 +03:15:13 488 xfsaild/md0 md0 +03:15:15 488 xfsaild/md0 md0 +03:15:19 496 kworker/1:0H md0 +03:15:49 496 kworker/1:0H md0 +03:15:55 18840 sync md0 +03:16:49 496 kworker/1:0H md0 +03:17:19 496 kworker/1:0H md0 +03:20:19 496 kworker/1:0H md0 +03:21:19 496 kworker/1:0H md0 +03:21:49 496 kworker/1:0H md0 +03:25:19 496 kworker/1:0H md0 +[...] + +This can be useful for correlation with latency outliers or spikes in disk +latency, as measured using another tool (eg, system monitoring). If spikes in +disk latency often coincide with md flush events, then it would make flushing +a target for tuning. + +Note that the flush events are likely to originate from higher in the I/O +stack, such as from file systems. This traces md processing them, and the +timestamp corresponds with when md began to issue the flush to disks.