From 0b813f80583992703328b524a175bb53893864b5 Mon Sep 17 00:00:00 2001
From: Joel
Date: Mon, 18 Jun 2018 19:37:38 -0700
Subject: [PATCH] bcc: criticalstat: Add an atomic critical section tracer (#1801)

This tool detects code sections in the kernel where IRQs or preemption are
disabled for a user-specified amount of time. Requires a kernel built with
CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS.

Signed-off-by: Joel Fernandes
---
 README.md                      |   1 +
 man/man8/criticalstat.8        |  74 ++++++++
 tools/criticalstat.py          | 331 +++++++++++++++++++++++++++++++++
 tools/criticalstat_example.txt | 139 ++++++++++++++
 4 files changed, 545 insertions(+)
 create mode 100644 man/man8/criticalstat.8
 create mode 100755 tools/criticalstat.py
 create mode 100644 tools/criticalstat_example.txt

diff --git a/README.md b/README.md
index c2b3375b0667..50d6db0a2fc2 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ pair of .c and .py files, and some are directories of files.
 - tools/[cachetop](tools/cachetop.py): Trace page cache hit/miss ratio by processes. [Examples](tools/cachetop_example.txt).
 - tools/[cpudist](tools/cpudist.py): Summarize on- and off-CPU time per task as a histogram. [Examples](tools/cpudist_example.txt)
 - tools/[cpuunclaimed](tools/cpuunclaimed.py): Sample CPU run queues and calculate unclaimed idle CPU. [Examples](tools/cpuunclaimed_example.txt)
+- tools/[criticalstat](tools/criticalstat.py): Trace and report long atomic critical sections in the kernel. [Examples](tools/criticalstat_example.txt)
 - tools/[dbslower](tools/dbslower.py): Trace MySQL/PostgreSQL queries slower than a threshold. [Examples](tools/dbslower_example.txt).
 - tools/[dbstat](tools/dbstat.py): Summarize MySQL/PostgreSQL query latency as a histogram. [Examples](tools/dbstat_example.txt).
 - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt).
diff --git a/man/man8/criticalstat.8 b/man/man8/criticalstat.8
new file mode 100644
index 000000000000..52baf1d8cc53
--- /dev/null
+++ b/man/man8/criticalstat.8
@@ -0,0 +1,74 @@
+.TH criticalstat 8 "2018-06-07" "USER COMMANDS"
+.SH NAME
+criticalstat \- A tracer to find and report long atomic critical sections in the kernel
+.SH SYNOPSIS
+.B criticalstat [\-h] [\-p] [\-i] [\-d DURATION]
+.SH DESCRIPTION
+
+criticalstat traces and reports occurrences of atomic critical sections in the
+kernel, with useful stack traces showing their origin. Such critical sections
+frequently occur due to use of spinlocks, or when interrupts or preemption are
+explicitly disabled by a driver, among other reasons; IRQ routines in Linux are
+also executed with interrupts disabled. Long critical sections are a source of
+latency and responsiveness issues for real-time systems.
+
+This works by probing the preempt/irq and cpuidle tracepoints in the kernel.
+Since this uses BPF, only the root user can use this tool. Further, the kernel
+has to be built with certain CONFIG options enabled. See below.
+
+.SH REQUIREMENTS
+Enable CONFIG_PREEMPTIRQ_EVENTS and CONFIG_DEBUG_PREEMPT. Additionally, the
+following options should be DISABLED on older kernels: CONFIG_PROVE_LOCKING,
+CONFIG_LOCKDEP.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p
+Find long sections where preemption was disabled on the local CPU.
+.TP
+\-i
+Find long sections where interrupts were disabled on the local CPU.
+.TP
+\-d DURATION
+Only identify sections that are longer than DURATION in microseconds.
+.SH EXAMPLES
+.TP
+Run with default options: IRQs disabled for more than 100 uS
+#
+.B criticalstat
+.TP
+Find sections with preemption disabled for more than 100 uS.
+#
+.B criticalstat -p
+.TP
+Find sections with IRQs disabled for more than 500 uS.
+#
+.B criticalstat -d 500
+.TP
+Find sections with preemption disabled for more than 500 uS.
+#
+.B criticalstat -p -d 500
+.SH OVERHEAD
+This tool can add overhead if the workload spends a lot of time in kernel
+mode. The overhead is variable, but can amount to a 2-4% performance
+degradation. If the overhead proves too high, pass a larger DURATION to the
+-d option to filter more aggressively.
+
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Joel Fernandes
+.SH SEE ALSO
+Linux kernel's preemptoff and irqoff tracers.
+
diff --git a/tools/criticalstat.py b/tools/criticalstat.py
new file mode 100755
index 000000000000..48dd7b3c3ce7
--- /dev/null
+++ b/tools/criticalstat.py
@@ -0,0 +1,331 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# criticalstat  Trace long critical sections (IRQs or preemption disabled)
+#               For Linux, uses BCC, eBPF. Requires kernel built with
+#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS
+#
+# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
+#
+# Copyright (c) 2018, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# By Joel Fernandes
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+import sys
+import subprocess
+import os.path
+
+examples = ""
+
+parser = argparse.ArgumentParser(
+    description="Trace long critical sections",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+
+parser.add_argument("-p", "--preemptoff", action="store_true",
+                    help="Find long sections where preemption was off")
+
+parser.add_argument("-i", "--irqoff", action="store_true",
+                    help="Find long sections where IRQ was off")
+
+parser.add_argument("-d", "--duration", default=100,
+                    help="Duration in uS (microseconds) below which we filter")
+
+args = parser.parse_args()
+
+preemptoff = False
+irqoff = False
+
+if args.irqoff:
+    preemptoff = False
+    irqoff = True
+elif args.preemptoff:
+    preemptoff = True
+    irqoff = False
+
+debugfs_path = subprocess.Popen("cat /proc/mounts | grep -w debugfs" +
+                                " | awk '{print $2}'",
+                                shell=True,
+                                stdout=subprocess.PIPE).stdout.read().split(b"\n")[0]
+
+if debugfs_path == b"":
+    print("ERROR: Unable to find debugfs mount point")
+    sys.exit(0)
+
+trace_path = debugfs_path + b"/tracing/events/preemptirq/"
+
+if (not os.path.exists(trace_path + b"irq_disable") or
+        not os.path.exists(trace_path + b"irq_enable") or
+        not os.path.exists(trace_path + b"preempt_disable") or
+        not os.path.exists(trace_path + b"preempt_enable")):
+    print("ERROR: required tracing events are not available\n" +
+          "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT " +
+          "and CONFIG_PREEMPTIRQ_EVENTS enabled. Also please disable " +
+          "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
+    sys.exit(0)
+
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+enum addr_offs {
+    START_CALLER_OFF,
+    START_PARENT_OFF,
+    END_CALLER_OFF,
+    END_PARENT_OFF
+};
+
+struct start_data {
+    u32 addr_offs[2];
+    u64 ts;
+    int idle_skip;
+    int active;
+};
+
+struct data_t {
+    u64 time;
+    s64 stack_id;
+    u32 cpu;
+    u64 id;
+    u32 addrs[4];   /* indexed by addr_offs */
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_STACK_TRACE(stack_traces, 16384);
+BPF_PERCPU_ARRAY(sts, struct start_data, 1);
+BPF_PERCPU_ARRAY(isidle, u64, 1);
+BPF_PERF_OUTPUT(events);
+
+/*
+ * In the below code we install tracepoint probes on preempt or
+ * IRQ disable/enable critical sections and idle events, the cases
+ * are combinations of 4 different states.
+ * The states are defined as:
+ * CSenter: A critical section has been entered. Either due to
+ *          preempt disable or irq disable.
+ * CSexit: A critical section has been exited. Either due to
+ *         preempt enable or irq enable.
+ * Ienter: The CPU has entered an idle state.
+ * Iexit: The CPU has exited an idle state.
+ *
+ * The scenario we are trying to detect is if there is an overlap
+ * between critical sections and idle entry/exit. If there are any
+ * such cases, we avoid recording those critical sections since they
+ * are not worthwhile to record and just add noise.
+ */
+TRACEPOINT_PROBE(power, cpu_idle)
+{
+    int idx = 0;
+    u64 val;
+    struct start_data *stdp, std;
+
+    // Mark any active critical section so that it will be skipped
+
+    // Handle the case CSenter, Ienter, CSexit, Iexit
+    // Handle the case CSenter, Ienter, Iexit, CSexit
+    stdp = sts.lookup(&idx);
+    if (stdp && stdp->active) {
+        /*
+         * Due to verifier issues, we have to copy contents
+         * of stdp onto the stack before the update.
+         * Fix it to directly update once kernel patch d71962f
+         * becomes more widespread.
+         */
+        std = *stdp;
+        std.idle_skip = 1;
+        sts.update(&idx, &std);
+    }
+
+    // Mark whether the CPU is currently within the idle loop or not.
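+    // Note: the cpu_idle tracepoint reports a sentinel value ((u32)-1,
+    // PWR_EVENT_EXIT) when the CPU leaves idle, so a small state value
+    // here indicates idle entry rather than idle exit.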
+    if (args->state < 100) {
+        val = 1;
+        isidle.update(&idx, &val);
+    } else {
+        val = 0;
+        isidle.update(&idx, &val);
+    }
+    return 0;
+}
+
+static int in_idle(void)
+{
+    u64 *idlep;
+    int idx = 0;
+
+    // Skip event if we're in the idle loop
+    idlep = isidle.lookup(&idx);
+    if (idlep && *idlep)
+        return 1;
+    return 0;
+}
+
+static void reset_state(void)
+{
+    int idx = 0;
+    struct start_data s = {};
+
+    sts.update(&idx, &s);
+}
+
+TRACEPOINT_PROBE(preemptirq, TYPE_disable)
+{
+    int idx = 0;
+    struct start_data s;
+
+    // Handle the case Ienter, CSenter, CSexit, Iexit
+    // Handle the case Ienter, CSenter, Iexit, CSexit
+    if (in_idle()) {
+        reset_state();
+        return 0;
+    }
+
+    u64 ts = bpf_ktime_get_ns();
+
+    s.idle_skip = 0;
+    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
+    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
+    s.ts = ts;
+    s.active = 1;
+
+    sts.update(&idx, &s);
+    return 0;
+}
+
+TRACEPOINT_PROBE(preemptirq, TYPE_enable)
+{
+    int idx = 0;
+    u64 start_ts, end_ts, diff;
+    struct start_data *stdp;
+
+    // Handle the case CSenter, Ienter, CSexit, Iexit
+    // Handle the case Ienter, CSenter, CSexit, Iexit
+    if (in_idle()) {
+        reset_state();
+        return 0;
+    }
+
+    stdp = sts.lookup(&idx);
+    if (!stdp) {
+        reset_state();
+        return 0;
+    }
+
+    // Handle the case Ienter, CSenter, Iexit, CSexit
+    if (!stdp->active) {
+        reset_state();
+        return 0;
+    }
+
+    // Handle the case CSenter, Ienter, Iexit, CSexit
+    if (stdp->idle_skip) {
+        reset_state();
+        return 0;
+    }
+
+    end_ts = bpf_ktime_get_ns();
+    start_ts = stdp->ts;
+
+    if (start_ts > end_ts) {
+        reset_state();
+        return 0;
+    }
+
+    diff = end_ts - start_ts;
+
+    if (diff < DURATION) {
+        reset_state();
+        return 0;
+    }
+
+    u64 id = bpf_get_current_pid_tgid();
+    struct data_t data = {};
+
+    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
+        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
+        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
+        data.addrs[END_CALLER_OFF] = args->caller_offs;
+        data.addrs[END_PARENT_OFF] = args->parent_offs;
+
+        data.id = id;
+        data.stack_id = stack_traces.get_stackid(args, 0);
+        data.time = diff;
+        data.cpu = bpf_get_smp_processor_id();
+        events.perf_submit(args, &data, sizeof(data));
+    }
+
+    reset_state();
+    return 0;
+}
+"""
+bpf_text = bpf_text.replace('DURATION', '{}'.format(int(args.duration) * 1000))
+
+if preemptoff:
+    bpf_text = bpf_text.replace('TYPE', 'preempt')
+else:
+    bpf_text = bpf_text.replace('TYPE', 'irq')
+
+b = BPF(text=bpf_text)
+
+TASK_COMM_LEN = 16    # linux/sched.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("time", ct.c_ulonglong),
+        ("stack_id", ct.c_longlong),
+        ("cpu", ct.c_int),
+        ("id", ct.c_ulonglong),
+        ("addrs", ct.c_int * 4),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+    ]
+
+def get_syms(kstack):
+    syms = []
+
+    for addr in kstack:
+        s = b.ksym(addr, show_offset=True)
+        syms.append(s)
+
+    return syms
+
+# process event
+def print_event(cpu, data, size):
+    try:
+        global b
+        event = ct.cast(data, ct.POINTER(Data)).contents
+        stack_traces = b['stack_traces']
+        stext = b.ksymname('_stext')
+
+        print("===================================")
+        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" % (event.comm, \
+            (event.id >> 32), (event.id & 0xffffffff), float(event.time) / 1000), end="")
+        print("Section start: {} -> {}".format(b.ksym(stext + event.addrs[0]), b.ksym(stext + event.addrs[1])))
+        print("Section end: {} -> {}".format(b.ksym(stext + event.addrs[2]), b.ksym(stext + event.addrs[3])))
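+        # A negative stack_id means the kernel stack could not be stored
+        # (for example, due to a collision in the stack_traces table).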
+        if event.stack_id >= 0:
+            kstack = stack_traces.walk(event.stack_id)
+            syms = get_syms(kstack)
+            if not syms:
+                return
+
+            for s in syms:
+                print(" ", end="")
+                print("%s" % s)
+        else:
+            print("NO STACK FOUND DUE TO COLLISION")
+        print("===================================")
+        print("")
+    except:
+        sys.exit(0)
+
+b["events"].open_perf_buffer(print_event, page_cnt=256)
+
+print("Finding critical section with {} disabled for > {}us".format(
+    ('preempt' if preemptoff else 'IRQ'), args.duration))
+
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/criticalstat_example.txt b/tools/criticalstat_example.txt
new file mode 100644
index 000000000000..1f5376913840
--- /dev/null
+++ b/tools/criticalstat_example.txt
@@ -0,0 +1,139 @@
+Demonstrations of criticalstat: Find long atomic critical sections in the kernel.
+
+criticalstat traces and reports occurrences of atomic critical sections in the
+kernel, with useful stack traces showing their origin. Such critical sections
+frequently occur due to use of spinlocks, or when interrupts or preemption are
+explicitly disabled by a driver, among other reasons; IRQ routines in Linux are
+also executed with interrupts disabled. Long critical sections are a source of
+latency and responsiveness issues for real-time systems.
+
+This works by probing the preempt/irq and cpuidle tracepoints in the kernel.
+Since this uses BPF, only the root user can use this tool. Further, the kernel
+has to be built with certain CONFIG options enabled in order for it to work:
+CONFIG_PREEMPTIRQ_EVENTS
+CONFIG_DEBUG_PREEMPT
+Additionally, the following options should be turned off on older kernels:
+CONFIG_PROVE_LOCKING
+CONFIG_LOCKDEP
+
+USAGE:
+# ./criticalstat -h
+usage: criticalstat [-h] [-p] [-i] [-d DURATION]
+
+Trace long critical sections
+
+optional arguments:
+  -h, --help            Show this help message and exit
+  -p, --preemptoff      Find long sections where preemption was off
+  -i, --irqoff          Find long sections where IRQ was off
+  -d DURATION, --duration DURATION
+                        Duration in uS (microseconds) below which we filter
+
+examples:
+    ./criticalstat            # run with default options: irq off for more than 100 uS
+    ./criticalstat -p         # find sections with preemption disabled for more than 100 uS
+    ./criticalstat -d 500     # find sections with IRQs disabled for more than 500 uS
+    ./criticalstat -p -d 500  # find sections with preemption disabled for more than 500 uS
+
+The tool runs continuously until interrupted by Ctrl-C.
+
+
+By default, criticalstat finds IRQ-disabled sections lasting longer than 100us.
+
+# ./criticalstat
+Finding critical section with IRQ disabled for > 100us
+===================================
+TASK: kworker/u16:5 (pid 5903 tid 5903) Total Time: 194.427 us
+
+Section start: __schedule -> schedule
+Section end: _raw_spin_unlock_irq -> finish_task_switch
+ trace_hardirqs_on+0xdc
+ trace_hardirqs_on+0xdc
+ _raw_spin_unlock_irq+0x18
+ finish_task_switch+0xf0
+ __schedule+0x8c8
+ preempt_schedule_irq+0x38
+ el1_preempt+0x8
+===================================
+
+
+If too many sections are showing up, the user can raise the threshold to only
+show critical sections that are > 500us by passing the "-d" option:
+
+# ./criticalstat -d 500
+Finding critical section with IRQ disabled for > 500us
+===================================
+TASK: crtc_commit:111 (pid 246 tid 246) Total Time: 580.730 us
+
+Section start: clk_enable_lock -> clk_enable
+Section end: _raw_spin_unlock_irqrestore -> clk_enable
+ trace_hardirqs_on+0xdc
+ trace_hardirqs_on+0xdc
+ _raw_spin_unlock_irqrestore+0x24
+ clk_enable+0x80
+ msm_dss_enable_clk+0x7c
+ sde_power_resource_enable+0x578
+ _sde_crtc_vblank_enable_no_lock+0x68
+ sde_crtc_vblank+0x8c
+ sde_kms_enable_vblank+0x18
+ vblank_ctrl_worker+0xd0
+ kthread_worker_fn+0xf8
+ kthread+0x114
+ ret_from_fork+0x10
+===================================
+
+
+If, instead of IRQ-disabled sections, we want to see preempt-disabled sections,
+then pass the "-p" option. Below we try to find preempt-disabled critical
+sections that are > 500us:
+
+# ./criticalstat -p -d 500
+Finding critical section with preempt disabled for > 500us
+===================================
+TASK: swapper/1 (pid 0 tid 0) Total Time: 618.437 us
+
+Section start: preempt_count_add -> preempt_count_add
+Section end: preempt_count_sub -> preempt_count_sub
+ trace_preempt_on+0x98
+ trace_preempt_on+0x98
+ preempt_latency_stop+0x164
+ preempt_count_sub+0x50
+ schedule+0x74
+ schedule_preempt_disabled+0x14
+ cpu_startup_entry+0x84
+ secondary_start_kernel+0x1c8
+ [unknown]
+===================================
+
+
+criticalstat -p can also sometimes reflect kernel scheduler issues. These may
+show up as long preempt-off sections if functions in the scheduler take a long
+time to run (such as pick_next_task_fair, which selects the next task to run).
+
+Following is a report showing a preempt-off latency of over 700us during the
+scheduler loop's execution:
+
+===================================
+TASK: irq/296-cs35l36 (pid 666 tid 666) Total Time: 732.657 us
+
+Section start: schedule -> schedule
+Section end: schedule -> schedule
+ trace_preempt_on+0x98
+ trace_preempt_on+0x98
+ preempt_count_sub+0xa4
+ schedule+0x78
+ schedule_timeout+0x80
+ wait_for_common+0xb4
+ wait_for_completion_timeout+0x28
+ geni_i2c_xfer+0x298
+ __i2c_transfer+0x4e0
+ i2c_transfer+0x8
+ irq_thread_fn+0x2c
+ irq_thread+0x160
+ kthread+0x118
+ ret_from_fork+0x10
+===================================
+
+
+See Also: the Linux kernel's preemptoff and irqoff tracers, which provide
+similar tracing but with some limitations.