From 0b813f80583992703328b524a175bb53893864b5 Mon Sep 17 00:00:00 2001
From: Joel
Date: Mon, 18 Jun 2018 19:37:38 -0700
Subject: [PATCH] bcc: criticalstat: Add an atomic critical section tracer (#1801)

This tool detects code sections in the kernel where IRQs or preemption are
disabled for a user-specified amount of time. Requires a kernel built with
CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS.

Signed-off-by: Joel Fernandes
---
 README.md                      |   1 +
 man/man8/criticalstat.8        |  74 ++++++++
 tools/criticalstat.py          | 331 +++++++++++++++++++++++++++++++++
 tools/criticalstat_example.txt | 139 ++++++++++++++
 4 files changed, 545 insertions(+)
 create mode 100644 man/man8/criticalstat.8
 create mode 100755 tools/criticalstat.py
 create mode 100644 tools/criticalstat_example.txt

diff --git a/README.md b/README.md
index c2b3375b0667..50d6db0a2fc2 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ pair of .c and .py files, and some are directories of files.
 - tools/[cachetop](tools/cachetop.py): Trace page cache hit/miss ratio by processes. [Examples](tools/cachetop_example.txt).
 - tools/[cpudist](tools/cpudist.py): Summarize on- and off-CPU time per task as a histogram. [Examples](tools/cpudist_example.txt)
 - tools/[cpuunclaimed](tools/cpuunclaimed.py): Sample CPU run queues and calculate unclaimed idle CPU. [Examples](tools/cpuunclaimed_example.txt)
+- tools/[criticalstat](tools/criticalstat.py): Trace and report long atomic critical sections in the kernel. [Examples](tools/criticalstat_example.txt)
 - tools/[dbslower](tools/dbslower.py): Trace MySQL/PostgreSQL queries slower than a threshold. [Examples](tools/dbslower_example.txt).
 - tools/[dbstat](tools/dbstat.py): Summarize MySQL/PostgreSQL query latency as a histogram. [Examples](tools/dbstat_example.txt).
 - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt).
diff --git a/man/man8/criticalstat.8 b/man/man8/criticalstat.8
new file mode 100644
index 000000000000..52baf1d8cc53
--- /dev/null
+++ b/man/man8/criticalstat.8
@@ -0,0 +1,74 @@
+.TH criticalstat 8 "2018-06-07" "USER COMMANDS"
+.SH NAME
+criticalstat \- A tracer to find and report long atomic critical sections in the kernel
+.SH SYNOPSIS
+.B criticalstat [\-h] [\-p] [\-i] [\-d DURATION]
+.SH DESCRIPTION
+
+criticalstat traces and reports occurrences of atomic critical sections in the
+kernel, with useful stack traces showing their origin. Such critical sections
+frequently occur due to use of spinlocks, or when interrupts or preemption are
+explicitly disabled by a driver, among other reasons; IRQ routines in Linux are
+also executed with interrupts disabled. Long critical sections are a source of
+latency and responsiveness issues for real-time systems.
+
+This works by probing the preempt/irq and cpuidle tracepoints in the kernel.
+Since this uses BPF, only the root user can use this tool. Further, the kernel
+has to be built with certain CONFIG options enabled. See below.
+
+.SH REQUIREMENTS
+Enable CONFIG_PREEMPTIRQ_EVENTS and CONFIG_DEBUG_PREEMPT. Additionally, the
+following options should be DISABLED on older kernels: CONFIG_PROVE_LOCKING,
+CONFIG_LOCKDEP.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p
+Find long sections where preemption was disabled on the local CPU.
+.TP
+\-i
+Find long sections where interrupts were disabled on the local CPU.
+.TP
+\-d DURATION
+Only identify sections that are longer than DURATION in microseconds.
+.SH EXAMPLES
+.TP
+Run with default options: IRQs disabled for more than 100 uS
+#
+.B criticalstat
+.TP
+Find sections with preemption disabled for more than 100 uS.
+#
+.B criticalstat -p
+.TP
+Find sections with IRQs disabled for more than 500 uS.
+#
+.B criticalstat -d 500
+.TP
+Find sections with preemption disabled for more than 500 uS.
+#
+.B criticalstat -p -d 500
+.SH OVERHEAD
+This tool can add overhead if the workload spends a lot of time in kernel
+mode. The overhead is variable, but can amount to a 2-4% performance
+degradation. If the overhead proves too high, pass a larger DURATION to the
+-d option to filter more aggressively.
+
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Joel Fernandes
+.SH SEE ALSO
+Linux kernel's preemptoff and irqoff tracers.
+
diff --git a/tools/criticalstat.py b/tools/criticalstat.py
new file mode 100755
index 000000000000..48dd7b3c3ce7
--- /dev/null
+++ b/tools/criticalstat.py
@@ -0,0 +1,331 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# criticalstat  Trace long critical sections (IRQs or preemption disabled)
+#               For Linux, uses BCC, eBPF. Requires kernel built with
+#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS
+#
+# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
+#
+# Copyright (c) 2018, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# By Joel Fernandes
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+import sys
+import subprocess
+import os.path
+
+examples = ""
+
+parser = argparse.ArgumentParser(
+    description="Trace long critical sections",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+
+parser.add_argument("-p", "--preemptoff", action="store_true",
+                    help="Find long sections where preemption was off")
+
+parser.add_argument("-i", "--irqoff", action="store_true",
+                    help="Find long sections where IRQ was off")
+
+parser.add_argument("-d", "--duration", default=100,
+                    help="Duration in uS (microseconds) below which we filter")
+
+args = parser.parse_args()
+
+preemptoff = False
+irqoff = False
+
+if args.irqoff:
+    preemptoff = False
+    irqoff = True
+elif args.preemptoff:
+    preemptoff = True
+    irqoff = False
+
+debugfs_path = subprocess.Popen("cat /proc/mounts | grep -w debugfs" +
+                                " | awk '{print $2}'",
+                                shell=True,
+                                stdout=subprocess.PIPE).stdout.read().split(b"\n")[0]
+
+if debugfs_path == b"":
+    print("ERROR: Unable to find debugfs mount point")
+    sys.exit(0)
+
+trace_path = debugfs_path + b"/tracing/events/preemptirq/"
+
+if (not os.path.exists(trace_path + b"irq_disable") or
+        not os.path.exists(trace_path + b"irq_enable") or
+        not os.path.exists(trace_path + b"preempt_disable") or
+        not os.path.exists(trace_path + b"preempt_enable")):
+    print("ERROR: required tracing events are not available\n" +
+          "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT " +
+          "and CONFIG_PREEMPTIRQ_EVENTS enabled. Also please disable " +
+          "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
+    sys.exit(0)
+
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+enum addr_offs {
+    START_CALLER_OFF,
+    START_PARENT_OFF,
+    END_CALLER_OFF,
+    END_PARENT_OFF
+};
+
+struct start_data {
+    u32 addr_offs[2];
+    u64 ts;
+    int idle_skip;
+    int active;
+};
+
+struct data_t {
+    u64 time;
+    s64 stack_id;
+    u32 cpu;
+    u64 id;
+    u32 addrs[4];   /* indexed by addr_offs */
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_STACK_TRACE(stack_traces, 16384);
+BPF_PERCPU_ARRAY(sts, struct start_data, 1);
+BPF_PERCPU_ARRAY(isidle, u64, 1);
+BPF_PERF_OUTPUT(events);
+
+/*
+ * In the below code we install tracepoint probes on preempt or
+ * IRQ disable/enable critical sections and idle events, the cases
+ * are combinations of 4 different states.
+ * The states are defined as:
+ * CSenter: A critical section has been entered. Either due to
+ *          preempt disable or irq disable.
+ * CSexit: A critical section has been exited. Either due to
+ *         preempt enable or irq enable.
+ * Ienter: The CPU has entered an idle state.
+ * Iexit: The CPU has exited an idle state.
+ *
+ * The scenario we are trying to detect is if there is an overlap
+ * between critical sections and idle entry/exit. If there are any
+ * such cases, we avoid recording those critical sections since they
+ * are not worthwhile to record and just add noise.
+ */
+TRACEPOINT_PROBE(power, cpu_idle)
+{
+    int idx = 0;
+    u64 val;
+    struct start_data *stdp, std;
+
+    // Mark any active critical section so that it will be skipped
+
+    // Handle the case CSenter, Ienter, CSexit, Iexit
+    // Handle the case CSenter, Ienter, Iexit, CSexit
+    stdp = sts.lookup(&idx);
+    if (stdp && stdp->active) {
+        /*
+         * Due to verifier issues, we have to copy contents
+         * of stdp onto the stack before the update.
+         * Fix it to directly update once kernel patch d71962f
+         * becomes more widespread.
+         */
+        std = *stdp;
+        std.idle_skip = 1;
+        sts.update(&idx, &std);
+    }
+
+    // Mark whether the CPU is currently within the idle loop or not.
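+    // Note: the cpu_idle tracepoint reports a sentinel value ((u32)-1,
+    // PWR_EVENT_EXIT) when the CPU leaves idle, so a small state value
+    // here indicates idle entry rather than idle exit.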
+    if (args->state < 100) {
+        val = 1;
+        isidle.update(&idx, &val);
+    } else {
+        val = 0;
+        isidle.update(&idx, &val);
+    }
+    return 0;
+}
+
+static int in_idle(void)
+{
+    u64 *idlep;
+    int idx = 0;
+
+    // Skip event if we're in the idle loop
+    idlep = isidle.lookup(&idx);
+    if (idlep && *idlep)
+        return 1;
+    return 0;
+}
+
+static void reset_state(void)
+{
+    int idx = 0;
+    struct start_data s = {};
+
+    sts.update(&idx, &s);
+}
+
+TRACEPOINT_PROBE(preemptirq, TYPE_disable)
+{
+    int idx = 0;
+    struct start_data s;
+
+    // Handle the case Ienter, CSenter, CSexit, Iexit
+    // Handle the case Ienter, CSenter, Iexit, CSexit
+    if (in_idle()) {
+        reset_state();
+        return 0;
+    }
+
+    u64 ts = bpf_ktime_get_ns();
+
+    s.idle_skip = 0;
+    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
+    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
+    s.ts = ts;
+    s.active = 1;
+
+    sts.update(&idx, &s);
+    return 0;
+}
+
+TRACEPOINT_PROBE(preemptirq, TYPE_enable)
+{
+    int idx = 0;
+    u64 start_ts, end_ts, diff;
+    struct start_data *stdp;
+
+    // Handle the case CSenter, Ienter, CSexit, Iexit
+    // Handle the case Ienter, CSenter, CSexit, Iexit
+    if (in_idle()) {
+        reset_state();
+        return 0;
+    }
+
+    stdp = sts.lookup(&idx);
+    if (!stdp) {
+        reset_state();
+        return 0;
+    }
+
+    // Handle the case Ienter, CSenter, Iexit, CSexit
+    if (!stdp->active) {
+        reset_state();
+        return 0;
+    }
+
+    // Handle the case CSenter, Ienter, Iexit, CSexit
+    if (stdp->idle_skip) {
+        reset_state();
+        return 0;
+    }
+
+    end_ts = bpf_ktime_get_ns();
+    start_ts = stdp->ts;
+
+    if (start_ts > end_ts) {
+        reset_state();
+        return 0;
+    }
+
+    diff = end_ts - start_ts;
+
+    if (diff < DURATION) {
+        reset_state();
+        return 0;
+    }
+
+    u64 id = bpf_get_current_pid_tgid();
+    struct data_t data = {};
+
+    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
+        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
+        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
+        data.addrs[END_CALLER_OFF] = args->caller_offs;
+        data.addrs[END_PARENT_OFF] = args->parent_offs;
+
+        data.id = id;
+        data.stack_id = stack_traces.get_stackid(args, 0);
+        data.time = diff;
+        data.cpu = bpf_get_smp_processor_id();
+        events.perf_submit(args, &data, sizeof(data));
+    }
+
+    reset_state();
+    return 0;
+}
+"""
+bpf_text = bpf_text.replace('DURATION', '{}'.format(int(args.duration) * 1000))
+
+if preemptoff:
+    bpf_text = bpf_text.replace('TYPE', 'preempt')
+else:
+    bpf_text = bpf_text.replace('TYPE', 'irq')
+
+b = BPF(text=bpf_text)
+
+TASK_COMM_LEN = 16    # linux/sched.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("time", ct.c_ulonglong),
+        ("stack_id", ct.c_longlong),
+        ("cpu", ct.c_int),
+        ("id", ct.c_ulonglong),
+        ("addrs", ct.c_int * 4),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+    ]
+
+def get_syms(kstack):
+    syms = []
+
+    for addr in kstack:
+        s = b.ksym(addr, show_offset=True)
+        syms.append(s)
+
+    return syms
+
+# process event
+def print_event(cpu, data, size):
+    try:
+        global b
+        event = ct.cast(data, ct.POINTER(Data)).contents
+        stack_traces = b['stack_traces']
+        stext = b.ksymname('_stext')
+
+        print("===================================")
+        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" % (event.comm, \
+            (event.id >> 32), (event.id & 0xffffffff), float(event.time) / 1000), end="")
+        print("Section start: {} -> {}".format(b.ksym(stext + event.addrs[0]), b.ksym(stext + event.addrs[1])))
+        print("Section end: {} -> {}".format(b.ksym(stext + event.addrs[2]), b.ksym(stext + event.addrs[3])))
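+        # A negative stack_id means the kernel stack could not be stored
+        # (for example, due to a collision in the stack_traces table).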
+        if event.stack_id >= 0:
+            kstack = stack_traces.walk(event.stack_id)
+            syms = get_syms(kstack)
+            if not syms:
+                return
+
+            for s in syms:
+                print(" ", end="")
+                print("%s" % s)
+        else:
+            print("NO STACK FOUND DUE TO COLLISION")
+        print("===================================")
+        print("")
+    except:
+        sys.exit(0)
+
+b["events"].open_perf_buffer(print_event, page_cnt=256)
+
+print("Finding critical section with {} disabled for > {}us".format(
+    ('preempt' if preemptoff else 'IRQ'), args.duration))
+
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/criticalstat_example.txt b/tools/criticalstat_example.txt
new file mode 100644
index 000000000000..1f5376913840
--- /dev/null
+++ b/tools/criticalstat_example.txt
@@ -0,0 +1,139 @@
+Demonstrations of criticalstat: Find long atomic critical sections in the kernel.
+
+criticalstat traces and reports occurrences of atomic critical sections in the
+kernel, with useful stack traces showing their origin. Such critical sections
+frequently occur due to use of spinlocks, or when interrupts or preemption are
+explicitly disabled by a driver, among other reasons; IRQ routines in Linux are
+also executed with interrupts disabled. Long critical sections are a source of
+latency and responsiveness issues for real-time systems.
+
+This works by probing the preempt/irq and cpuidle tracepoints in the kernel.
+Since this uses BPF, only the root user can use this tool. Further, the kernel
+has to be built with certain CONFIG options enabled in order for it to work:
+CONFIG_PREEMPTIRQ_EVENTS
+CONFIG_DEBUG_PREEMPT
+Additionally, the following options should be turned off on older kernels:
+CONFIG_PROVE_LOCKING
+CONFIG_LOCKDEP
+
+USAGE:
+# ./criticalstat -h
+usage: criticalstat [-h] [-p] [-i] [-d DURATION]
+
+Trace long critical sections
+
+optional arguments:
+  -h, --help            Show this help message and exit
+  -p, --preemptoff      Find long sections where preemption was off
+  -i, --irqoff          Find long sections where IRQ was off
+  -d DURATION, --duration DURATION
+                        Duration in uS (microseconds) below which we filter
+
+examples:
+    ./criticalstat            # run with default options: irq off for more than 100 uS
+    ./criticalstat -p         # find sections with preemption disabled for more than 100 uS
+    ./criticalstat -d 500     # find sections with IRQs disabled for more than 500 uS
+    ./criticalstat -p -d 500  # find sections with preemption disabled for more than 500 uS
+
+The tool runs continuously until interrupted by Ctrl-C.
+
+
+By default, criticalstat finds IRQ-disabled sections lasting longer than 100us.
+
+# ./criticalstat
+Finding critical section with IRQ disabled for > 100us
+===================================
+TASK: kworker/u16:5 (pid 5903 tid 5903) Total Time: 194.427 us
+
+Section start: __schedule -> schedule
+Section end: _raw_spin_unlock_irq -> finish_task_switch
+ trace_hardirqs_on+0xdc
+ trace_hardirqs_on+0xdc
+ _raw_spin_unlock_irq+0x18
+ finish_task_switch+0xf0
+ __schedule+0x8c8
+ preempt_schedule_irq+0x38
+ el1_preempt+0x8
+===================================
+
+
+If too many sections are showing up, the user can raise the threshold to only
+show critical sections that are > 500us by passing the "-d" option:
+
+# ./criticalstat -d 500
+Finding critical section with IRQ disabled for > 500us
+===================================
+TASK: crtc_commit:111 (pid 246 tid 246) Total Time: 580.730 us
+
+Section start: clk_enable_lock -> clk_enable
+Section end: _raw_spin_unlock_irqrestore -> clk_enable
+ trace_hardirqs_on+0xdc
+ trace_hardirqs_on+0xdc
+ _raw_spin_unlock_irqrestore+0x24
+ clk_enable+0x80
+ msm_dss_enable_clk+0x7c
+ sde_power_resource_enable+0x578
+ _sde_crtc_vblank_enable_no_lock+0x68
+ sde_crtc_vblank+0x8c
+ sde_kms_enable_vblank+0x18
+ vblank_ctrl_worker+0xd0
+ kthread_worker_fn+0xf8
+ kthread+0x114
+ ret_from_fork+0x10
+===================================
+
+
+If, instead of IRQ-disabled sections, we want to see preempt-disabled sections,
+then pass the "-p" option. Below we try to find preempt-disabled critical
+sections that are > 500us:
+
+# ./criticalstat -p -d 500
+Finding critical section with preempt disabled for > 500us
+===================================
+TASK: swapper/1 (pid 0 tid 0) Total Time: 618.437 us
+
+Section start: preempt_count_add -> preempt_count_add
+Section end: preempt_count_sub -> preempt_count_sub
+ trace_preempt_on+0x98
+ trace_preempt_on+0x98
+ preempt_latency_stop+0x164
+ preempt_count_sub+0x50
+ schedule+0x74
+ schedule_preempt_disabled+0x14
+ cpu_startup_entry+0x84
+ secondary_start_kernel+0x1c8
+ [unknown]
+===================================
+
+
+criticalstat -p can also sometimes reflect kernel scheduler issues. These may
+show up as long preempt-off sections if functions in the scheduler take a long
+time to run (such as pick_next_task_fair, which selects the next task to run).
+
+Following is a report showing a preempt-off latency of over 700us during the
+scheduler loop's execution:
+
+===================================
+TASK: irq/296-cs35l36 (pid 666 tid 666) Total Time: 732.657 us
+
+Section start: schedule -> schedule
+Section end: schedule -> schedule
+ trace_preempt_on+0x98
+ trace_preempt_on+0x98
+ preempt_count_sub+0xa4
+ schedule+0x78
+ schedule_timeout+0x80
+ wait_for_common+0xb4
+ wait_for_completion_timeout+0x28
+ geni_i2c_xfer+0x298
+ __i2c_transfer+0x4e0
+ i2c_transfer+0x8
+ irq_thread_fn+0x2c
+ irq_thread+0x160
+ kthread+0x118
+ ret_from_fork+0x10
+===================================
+
+
+See Also: the Linux kernel's preemptoff and irqoff tracers, which provide
+similar tracing but with some limitations.