#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # runqlen Summarize scheduler run queue length as a histogram. # For Linux, uses BCC, eBPF. # # This counts the length of the run queue, excluding the currently running # thread, and shows it as a histogram. # # Also answers run queue occupancy. # # USAGE: runqlen [-h] [-T] [-Q] [-m] [-D] [interval] [count] # # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is # a version of this tool that may work on Linux 4.6 - 4.8. # # Copyright 2016 Netflix, Inc. # Licensed under the Apache License, Version 2.0 (the "License") # # 12-Dec-2016 Brendan Gregg Created this. from __future__ import print_function from bcc import BPF, PerfType, PerfSWConfig from time import sleep, strftime from tempfile import NamedTemporaryFile from os import open, close, dup, unlink, O_WRONLY import argparse # arguments examples = """examples: ./runqlen # summarize run queue length as a histogram ./runqlen 1 10 # print 1 second summaries, 10 times ./runqlen -T 1 # 1s summaries and timestamps ./runqlen -O # report run queue occupancy ./runqlen -C # show each CPU separately """ parser = argparse.ArgumentParser( description="Summarize scheduler run queue length as a histogram", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-T", "--timestamp", action="store_true", help="include timestamp on output") parser.add_argument("-O", "--runqocc", action="store_true", help="report run queue occupancy") parser.add_argument("-C", "--cpus", action="store_true", help="print output for each CPU separately") parser.add_argument("interval", nargs="?", default=99999999, help="output interval, in seconds") parser.add_argument("count", nargs="?", default=99999999, help="number of outputs") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() countdown = int(args.count) debug = 0 frequency = 99 # Linux 4.15 introduced a new field runnable_weight # in linux_src:kernel/sched/sched.h as # struct cfs_rq { # struct load_weight load; # unsigned long runnable_weight; # unsigned int nr_running, h_nr_running; # ...... # } # and this tool requires to access nr_running to get # runqueue len information. # # The commit which introduces cfs_rq->runnable_weight # field also introduces the field sched_entity->runnable_weight # where sched_entity is defined in linux_src:include/linux/sched.h. # # To cope with pre-4.15 and 4.15/post-4.15 releases, # we run a simple BPF program to detect whether # field sched_entity->runnable_weight exists. The existence of # this field should infer the existence of cfs_rq->runnable_weight. # # This will need maintenance as the relationship between these # two fields may change in the future. # def check_runnable_weight_field(): # Define the bpf program for checking purpose bpf_check_text = """ #include unsigned long dummy(struct sched_entity *entity) { return entity->runnable_weight; } """ # Get a temporary file name tmp_file = NamedTemporaryFile(delete=False) tmp_file.close(); # Duplicate and close stderr (fd = 2) old_stderr = dup(2) close(2) # Open a new file, should get fd number 2 # This will avoid printing llvm errors on the screen fd = open(tmp_file.name, O_WRONLY) try: t = BPF(text=bpf_check_text) success_compile = True except: success_compile = False # Release the fd 2, and next dup should restore old stderr close(fd) dup(old_stderr) close(old_stderr) # remove the temporary file and return unlink(tmp_file.name) return success_compile # define BPF program bpf_text = """ #include #include // Declare enough of cfs_rq to find nr_running, since we can't #import the // header. This will need maintenance. It is from kernel/sched/sched.h: struct cfs_rq_partial { struct load_weight load; RUNNABLE_WEIGHT_FIELD unsigned int nr_running, h_nr_running; }; typedef struct cpu_key { int cpu; unsigned int slot; } cpu_key_t; STORAGE int do_perf_event() { unsigned int len = 0; pid_t pid = 0; struct task_struct *task = NULL; struct cfs_rq_partial *my_q = NULL; // Fetch the run queue length from task->se.cfs_rq->nr_running. This is an // unstable interface and may need maintenance. Perhaps a future version // of BPF will support task_rq(p) or something similar as a more reliable // interface. task = (struct task_struct *)bpf_get_current_task(); my_q = (struct cfs_rq_partial *)task->se.cfs_rq; len = my_q->nr_running; // Calculate run queue length by subtracting the currently running task, // if present. len 0 == idle, len 1 == one running task. if (len > 0) len--; STORE return 0; } """ # code substitutions if args.cpus: bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist, cpu_key_t);') bpf_text = bpf_text.replace('STORE', 'cpu_key_t key = {.slot = len}; ' + 'key.cpu = bpf_get_smp_processor_id(); ' + 'dist.increment(key);') else: bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist, unsigned int);') bpf_text = bpf_text.replace('STORE', 'dist.increment(len);') if check_runnable_weight_field(): bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', 'unsigned long runnable_weight;') else: bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF & perf_events b = BPF(text=bpf_text) b.attach_perf_event(ev_type=PerfType.SOFTWARE, ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event", sample_period=0, sample_freq=frequency) print("Sampling run queue length... Hit Ctrl-C to end.") # output exiting = 0 if args.interval else 1 dist = b.get_table("dist") while (1): try: sleep(int(args.interval)) except KeyboardInterrupt: exiting = 1 print() if args.timestamp: print("%-8s\n" % strftime("%H:%M:%S"), end="") if args.runqocc: if args.cpus: # run queue occupancy, per-CPU summary idle = {} queued = {} cpumax = 0 for k, v in dist.items(): if k.cpu > cpumax: cpumax = k.cpu for c in range(0, cpumax + 1): idle[c] = 0 queued[c] = 0 for k, v in dist.items(): if k.slot == 0: idle[k.cpu] += v.value else: queued[k.cpu] += v.value for c in range(0, cpumax + 1): samples = idle[c] + queued[c] if samples: runqocc = float(queued[c]) / samples else: runqocc = 0 print("runqocc, CPU %-3d %6.2f%%" % (c, 100 * runqocc)) else: # run queue occupancy, system-wide summary idle = 0 queued = 0 for k, v in dist.items(): if k.value == 0: idle += v.value else: queued += v.value samples = idle + queued if samples: runqocc = float(queued) / samples else: runqocc = 0 print("runqocc: %0.2f%%" % (100 * runqocc)) else: # run queue length histograms dist.print_linear_hist("runqlen", "cpu") dist.clear() countdown -= 1 if exiting or countdown == 0: exit()