#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# runqlen    Summarize scheduler run queue length as a histogram.
#            For Linux, uses BCC, eBPF.
#
# This counts the length of the run queue, excluding the currently running
# thread, and shows it as a histogram.
#
# Also answers run queue occupancy.
#
# USAGE: runqlen [-h] [-T] [-Q] [-m] [-D] [interval] [count]
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 12-Dec-2016   Brendan Gregg   Created this.

from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep, strftime
from tempfile import NamedTemporaryFile
from os import open, close, dup, unlink, O_WRONLY
import argparse

# arguments
examples = """examples:
    ./runqlen            # summarize run queue length as a histogram
    ./runqlen 1 10       # print 1 second summaries, 10 times
    ./runqlen -T 1       # 1s summaries and timestamps
    ./runqlen -O         # report run queue occupancy
    ./runqlen -C         # show each CPU separately
"""
parser = argparse.ArgumentParser(
    description="Summarize scheduler run queue length as a histogram",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("-O", "--runqocc", action="store_true",
    help="report run queue occupancy")
parser.add_argument("-C", "--cpus", action="store_true",
    help="print output for each CPU separately")
parser.add_argument("interval", nargs="?", default=99999999,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
countdown = int(args.count)
debug = 0
frequency = 99

# Linux 4.15 introduced a new field runnable_weight
# in linux_src:kernel/sched/sched.h as
#     struct cfs_rq {
#         struct load_weight load;
#         unsigned long runnable_weight;
#         unsigned int nr_running, h_nr_running;
#         ......
#     }
# and this tool requires to access nr_running to get
# runqueue len information.
#
# The commit which introduces cfs_rq->runnable_weight
# field also introduces the field sched_entity->runnable_weight
# where sched_entity is defined in linux_src:include/linux/sched.h.
#
# To cope with pre-4.15 and 4.15/post-4.15 releases,
# we run a simple BPF program to detect whether
# field sched_entity->runnable_weight exists. The existence of
# this field should infer the existence of cfs_rq->runnable_weight.
#
# This will need maintenance as the relationship between these
# two fields may change in the future.
#
def check_runnable_weight_field():
    # Define the bpf program for checking purpose
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
    return entity->runnable_weight;
}
"""

    # Get a temporary file name
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close();

    # Duplicate and close stderr (fd = 2)
    old_stderr = dup(2)
    close(2)

    # Open a new file, should get fd number 2
    # This will avoid printing llvm errors on the screen
    fd = open(tmp_file.name, O_WRONLY)
    try:
        t = BPF(text=bpf_check_text)
        success_compile = True
    except:
        success_compile = False

    # Release the fd 2, and next dup should restore old stderr
    close(fd)
    dup(old_stderr)
    close(old_stderr)

    # remove the temporary file and return
    unlink(tmp_file.name)
    return success_compile


# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    RUNNABLE_WEIGHT_FIELD
    unsigned int nr_running, h_nr_running;
};

typedef struct cpu_key {
    int cpu;
    unsigned int slot;
} cpu_key_t;
STORAGE

int do_perf_event()
{
    unsigned int len = 0;
    pid_t pid = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;

    // Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
    // unstable interface and may need maintenance. Perhaps a future version
    // of BPF will support task_rq(p) or something similar as a more reliable
    // interface.
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    // Calculate run queue length by subtracting the currently running task,
    // if present. len 0 == idle, len 1 == one running task.
    if (len > 0)
        len--;

    STORE

    return 0;
}
"""

# code substitutions
if args.cpus:
    bpf_text = bpf_text.replace('STORAGE',
        'BPF_HISTOGRAM(dist, cpu_key_t);')
    bpf_text = bpf_text.replace('STORE', 'cpu_key_t key = {.slot = len}; ' +
        'key.cpu = bpf_get_smp_processor_id(); ' +
        'dist.increment(key);')
else:
    bpf_text = bpf_text.replace('STORAGE',
        'BPF_HISTOGRAM(dist, unsigned int);')
    bpf_text = bpf_text.replace('STORE', 'dist.increment(len);')

if check_runnable_weight_field():
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', 'unsigned long runnable_weight;')
else:
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events
b = BPF(text=bpf_text)
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
    sample_period=0, sample_freq=frequency)

print("Sampling run queue length... Hit Ctrl-C to end.")

# output
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
while (1):
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        exiting = 1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    if args.runqocc:
        if args.cpus:
            # run queue occupancy, per-CPU summary
            idle = {}
            queued = {}
            cpumax = 0
            for k, v in dist.items():
                if k.cpu > cpumax:
                    cpumax = k.cpu
            for c in range(0, cpumax + 1):
                idle[c] = 0
                queued[c] = 0
            for k, v in dist.items():
                if k.slot == 0:
                    idle[k.cpu] += v.value
                else:
                    queued[k.cpu] += v.value
            for c in range(0, cpumax + 1):
                samples = idle[c] + queued[c]
                if samples:
                    runqocc = float(queued[c]) / samples
                else:
                    runqocc = 0
                print("runqocc, CPU %-3d %6.2f%%" % (c, 100 * runqocc))

        else:
            # run queue occupancy, system-wide summary
            idle = 0
            queued = 0
            for k, v in dist.items():
                if k.value == 0:
                    idle += v.value
                else:
                    queued += v.value
            samples = idle + queued
            if samples:
                runqocc = float(queued) / samples
            else:
                runqocc = 0
            print("runqocc: %0.2f%%" % (100 * runqocc))

    else:
        # run queue length histograms
        dist.print_linear_hist("runqlen", "cpu")

    dist.clear()

    countdown -= 1
    if exiting or countdown == 0:
        exit()