Skip to content

Commit

Permalink
Merge pull request iovisor#770 from palmtenor/pmu_api
Browse files Browse the repository at this point in the history
Add basic support for BPF perf event
  • Loading branch information
drzaeus77 committed Oct 21, 2016
2 parents 315998d + 2f3cdbf commit 0ef9ec4
Show file tree
Hide file tree
Showing 8 changed files with 364 additions and 0 deletions.
69 changes: 69 additions & 0 deletions man/man8/llcstat.8
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
.TH llcstat 8 "2015-08-18" "USER COMMANDS"
.SH NAME
llcstat \- Trace cache references and cache misses. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B llcstat [\-h] [\-c SAMPLE_PERIOD] [duration]
.SH DESCRIPTION
llcstat traces cache references and cache misses system-wide, and summarizes
them by PID and CPU. These events have different meanings on different
architectures. For x86-64, they mean misses and references to LLC.
This can be useful to locate and debug performance issues
caused by cache hit rate.

This works by sampling corresponding events defined in uapi/linux/perf_event.h,
namely PERF_COUNT_HW_CACHE_REFERENCES and PERF_COUNT_HW_CACHE_MISSES, using
BPF perf event tracing. Upon each sampled event, the attached BPF program
records the PID and CPU ID on which the event happened, and stores them in a
table.

This makes use of a Linux 4.9 feature (BPF_PROG_TYPE_PERF_EVENT).

Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
.TP
\-h
Print usage message.
.TP
\-c SAMPLE_PERIOD
Sample one in this many cache reference and cache miss events.
.TP
duration
Duration to trace, in seconds.
.SH EXAMPLES
.TP
Sample one in 100 events, trace for 20 seconds:
#
.B llcstat -c 100 20
.SH FIELDS
.TP
PID
Process ID
.TP
NAME
Process name
.TP
CPU
CPU ID
.TP
REFERENCE
Number of cache reference events
.TP
MISS
Number of cache miss events
.TP
HIT%
Cache hit ratio
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Teng Qin
1 change: 1 addition & 0 deletions src/cc/compat/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
};

#define BPF_PSEUDO_MAP_FD 1
Expand Down
57 changes: 57 additions & 0 deletions src/cc/libbpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -570,3 +570,60 @@ int bpf_attach_xdp(const char *dev_name, int progfd) {
close(sock);
return ret;
}

// Open a perf event described by (ev_type, ev_config) for the given
// pid / cpu / group_fd, attach the BPF program progfd to it and enable it.
//
// Only PERF_TYPE_HARDWARE and PERF_TYPE_SOFTWARE are accepted, and ev_config
// must be below the matching PERF_COUNT_HW_MAX / PERF_COUNT_SW_MAX. Exactly
// one of sample_period and sample_freq must be non-zero.
//
// Returns the perf event file descriptor on success. On any failure a
// diagnostic is written to stderr and -1 is returned; if the event had
// already been opened it is closed first.
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
                          uint64_t sample_period, uint64_t sample_freq,
                          pid_t pid, int cpu, int group_fd) {
  // Pick the upper bound for ev_config that belongs to this event type.
  uint32_t config_limit;
  switch (ev_type) {
  case PERF_TYPE_HARDWARE:
    config_limit = PERF_COUNT_HW_MAX;
    break;
  case PERF_TYPE_SOFTWARE:
    config_limit = PERF_COUNT_SW_MAX;
    break;
  default:
    fprintf(stderr, "Unsupported perf event type\n");
    return -1;
  }
  if (ev_config >= config_limit) {
    fprintf(stderr, "Invalid perf event config\n");
    return -1;
  }

  // Both set or both unset is rejected: the caller has to pick one mode.
  const int use_period = sample_period > 0;
  const int use_freq = sample_freq > 0;
  if (use_period == use_freq) {
    fprintf(
      stderr, "Exactly one of sample_period / sample_freq should be set\n"
    );
    return -1;
  }

  struct perf_event_attr attr = {};
  attr.type = ev_type;
  attr.config = ev_config;
  attr.inherit = 1;
  if (use_freq) {
    attr.freq = 1;
    attr.sample_freq = sample_freq;
  } else {
    attr.sample_period = sample_period;
  }

  int perf_fd = syscall(
    __NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC
  );
  if (perf_fd < 0) {
    perror("perf_event_open failed");
    return -1;
  }

  if (ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF) failed");
    goto fail;
  }
  if (ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE) failed");
    goto fail;
  }
  return perf_fd;

fail:
  // The event was opened but could not be armed; do not hand it out.
  close(perf_fd);
  return -1;
}

// Counterpart of bpf_attach_perf_event(). It currently has no work to do and
// always returns 0; it exists so that callers get into the habit of detaching
// whatever they attach.
int bpf_detach_perf_event(uint32_t ev_type, uint32_t ev_config) {
  (void)ev_type;
  (void)ev_config;
  return 0;
}
7 changes: 7 additions & 0 deletions src/cc/libbpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
/* attached a prog expressed by progfd to the device specified in dev_name */
int bpf_attach_xdp(const char *dev_name, int progfd);

// Attach the prog expressed by progfd to a perf event identified by
// ev_type / ev_config, opened for the given pid, cpu and group_fd.
// ev_type must be PERF_TYPE_HARDWARE or PERF_TYPE_SOFTWARE, and exactly one
// of sample_period / sample_freq must be non-zero.
// Returns the perf event file descriptor on success, or -1 on failure
// (a message is printed to stderr).
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
uint64_t sample_period, uint64_t sample_freq,
pid_t pid, int cpu, int group_fd);
// Counterpart of bpf_attach_perf_event(). Currently a no-op that returns 0;
// it does not close the descriptors returned by bpf_attach_perf_event().
int bpf_detach_perf_event(uint32_t ev_type, uint32_t ev_config);

#define LOG_BUF_SIZE 65536

// Put non-static/inline functions in their own section with this prefix +
Expand Down
72 changes: 72 additions & 0 deletions src/python/bcc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,47 @@ def resolve_name(self, name):
return -1
return addr.value

# Perf event type IDs, passed as ev_type to BPF.attach_perf_event().
# These two are the only types the C helper bpf_attach_perf_event() accepts.
class PerfType:
# From perf_type_id in uapi/linux/perf_event.h
HARDWARE = 0
SOFTWARE = 1

# Hardware event IDs, passed as ev_config to BPF.attach_perf_event() together
# with ev_type=PerfType.HARDWARE.
class PerfHWConfig:
# From perf_hw_id in uapi/linux/perf_event.h
CPU_CYCLES = 0
INSTRUCTIONS = 1
CACHE_REFERENCES = 2
CACHE_MISSES = 3
BRANCH_INSTRUCTIONS = 4
BRANCH_MISSES = 5
BUS_CYCLES = 6
STALLED_CYCLES_FRONTEND = 7
STALLED_CYCLES_BACKEND = 8
REF_CPU_CYCLES = 9

# Software event IDs, passed as ev_config to BPF.attach_perf_event() together
# with ev_type=PerfType.SOFTWARE.
class PerfSWConfig:
# From perf_sw_id in uapi/linux/perf_event.h
CPU_CLOCK = 0
TASK_CLOCK = 1
PAGE_FAULTS = 2
CONTEXT_SWITCHES = 3
CPU_MIGRATIONS = 4
PAGE_FAULTS_MIN = 5
PAGE_FAULTS_MAJ = 6
ALIGNMENT_FAULTS = 7
EMULATION_FAULTS = 8
DUMMY = 9
BPF_OUTPUT = 10

class BPF(object):
# From bpf_prog_type in uapi/linux/bpf.h
SOCKET_FILTER = 1
KPROBE = 2
SCHED_CLS = 3
SCHED_ACT = 4
TRACEPOINT = 5
XDP = 6
PERF_EVENT = 7

_probe_repl = re.compile("[^a-zA-Z0-9_]")
_sym_caches = {}
Expand Down Expand Up @@ -168,6 +202,7 @@ def __init__(self, src_file="", hdr_file="", text=None, cb=None, debug=0,
self.open_kprobes = {}
self.open_uprobes = {}
self.open_tracepoints = {}
self.open_perf_events = {}
self.tracefile = None
atexit.register(self.cleanup)

Expand Down Expand Up @@ -608,6 +643,41 @@ def detach_tracepoint(self, tp=""):
raise Exception("Failed to detach BPF from tracepoint")
del self.open_tracepoints[tp]

def _attach_perf_event(self, progfd, ev_type, ev_config,
        sample_period, sample_freq, pid, cpu, group_fd):
    """Attach the loaded program ``progfd`` to one perf event on one CPU.

    All arguments are forwarded unchanged to ``lib.bpf_attach_perf_event``.

    Returns the value reported by the library when it is non-negative
    (the perf event file descriptor).

    Raises Exception when the library reports a negative result.
    """
    perf_fd = lib.bpf_attach_perf_event(progfd, ev_type, ev_config,
            sample_period, sample_freq, pid, cpu, group_fd)
    if perf_fd >= 0:
        return perf_fd
    raise Exception("Failed to attach BPF to perf event")

def attach_perf_event(self, ev_type=-1, ev_config=-1, fn_name="",
        sample_period=0, sample_freq=0, pid=-1, cpu=-1, group_fd=-1):
    """Run the BPF function ``fn_name`` on every sample of a perf event.

    ev_type / ev_config select the event (see PerfType, PerfHWConfig and
    PerfSWConfig). sample_period, sample_freq, pid and group_fd are
    forwarded to the library for each CPU.

    When ``cpu`` is >= 0 the event is opened on that CPU only; otherwise
    one event is opened per CPU reported by multiprocessing.cpu_count().
    The resulting {cpu: fd} mapping is stored in self.open_perf_events
    under the key (ev_type, ev_config) for detach_perf_event().

    Raises Exception if attaching fails on any CPU. In that case the
    events already opened by this call are closed before re-raising, so
    a partial failure does not leak file descriptors.
    """
    fn = self.load_func(fn_name, BPF.PERF_EVENT)
    if cpu >= 0:
        cpus = [cpu]
    else:
        cpus = range(0, multiprocessing.cpu_count())

    res = {}
    try:
        for i in cpus:
            res[i] = self._attach_perf_event(fn.fd, ev_type, ev_config,
                    sample_period, sample_freq, pid, i, group_fd)
    except Exception:
        # Nothing has been recorded in open_perf_events yet, so cleanup()
        # would never see these descriptors; release them here.
        for fd in res.values():
            os.close(fd)
        raise
    self.open_perf_events[(ev_type, ev_config)] = res

def detach_perf_event(self, ev_type=-1, ev_config=-1):
    """Detach and close the perf events attached for (ev_type, ev_config).

    Closes every file descriptor recorded for that key, calls
    ``lib.bpf_detach_perf_event`` and then forgets the key.

    Returns the library's result.

    Raises Exception if nothing is attached under that key, or if the
    library reports a negative result.
    """
    key = (ev_type, ev_config)
    if key not in self.open_perf_events:
        raise Exception("Perf event type {} config {} not attached".format(
            ev_type, ev_config))
    for perf_fd in self.open_perf_events[key].values():
        os.close(perf_fd)
    rc = lib.bpf_detach_perf_event(ev_type, ev_config)
    if rc < 0:
        raise Exception("Failed to detach BPF from perf event")
    del self.open_perf_events[key]
    return rc

def _add_uprobe(self, name, probe):
global _num_open_probes
self.open_uprobes[name] = probe
Expand Down Expand Up @@ -975,6 +1045,8 @@ def cleanup(self):
(tp_category, tp_name) = k.split(':')
lib.bpf_detach_tracepoint(tp_category, tp_name)
self.open_tracepoints.clear()
for (ev_type, ev_config) in list(self.open_perf_events.keys()):
self.detach_perf_event(ev_type, ev_config)
if self.tracefile:
self.tracefile.close()

Expand Down
6 changes: 6 additions & 0 deletions src/python/bcc/libbcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@
lib.bpf_attach_xdp.restype = ct.c_int;
lib.bpf_attach_xdp.argtypes = [ct.c_char_p, ct.c_int]

# Prototypes for the perf event helpers in libbpf.c.
# ctypes only consults the attribute named `argtypes`; a misspelt attribute is
# silently ignored, and the 64-bit sample_period / sample_freq arguments would
# then be marshalled as default C ints.
lib.bpf_attach_perf_event.restype = ct.c_int;
lib.bpf_attach_perf_event.argtypes = [ct.c_int, ct.c_uint, ct.c_uint,
        ct.c_ulonglong, ct.c_ulonglong, ct.c_int, ct.c_int, ct.c_int]
lib.bpf_detach_perf_event.restype = ct.c_int;
lib.bpf_detach_perf_event.argtypes = [ct.c_uint, ct.c_uint]

# bcc symbol helpers
class bcc_symbol(ct.Structure):
_fields_ = [
Expand Down
110 changes: 110 additions & 0 deletions tools/llcstat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/python
#
# llcstat.py Summarize cache references and cache misses by PID.
# Cache reference and cache miss are events defined in
# uapi/linux/perf_event.h; their exact meaning varies by architecture.
# On x86-64, they mean LLC references and LLC misses.
#
# For Linux, uses BCC, eBPF. Embedded C.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
#
# Copyright (c) 2016 Facebook, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 19-Oct-2016 Teng Qin Created this.

from __future__ import print_function
import argparse
from bcc import BPF, PerfType, PerfHWConfig
import signal
from time import sleep

parser = argparse.ArgumentParser(
description="Summarize cache references and misses by PID",
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument(
"-c", "--sample_period", type=int, default=100,
help="Sample one in this many number of cache reference / miss events")
parser.add_argument(
"duration", nargs="?", default=10, help="Duration, in seconds, to run")
args = parser.parse_args()

# Load the BPF program.
# It defines two hash tables, ref_count and miss_count, keyed by
# (cpu, pid, task name). on_cache_ref / on_cache_miss each add the event's
# sample_period to the matching entry, so the stored values estimate the
# number of events rather than the number of samples.
# NOTE(review): get_key() stores the 64-bit result of
# bpf_get_current_pid_tgid() into an int field; presumably this keeps the
# low 32 bits (the kernel thread id) -- confirm that is the intended "PID".
b = BPF(text="""
#include <linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
struct key_t {
int cpu;
int pid;
char name[TASK_COMM_LEN];
};
BPF_HASH(ref_count, struct key_t);
BPF_HASH(miss_count, struct key_t);
static inline __attribute__((always_inline)) void get_key(struct key_t* key) {
key->cpu = bpf_get_smp_processor_id();
key->pid = bpf_get_current_pid_tgid();
bpf_get_current_comm(&(key->name), sizeof(key->name));
}
int on_cache_miss(struct bpf_perf_event_data *ctx) {
struct key_t key = {};
get_key(&key);
u64 zero = 0, *val;
val = miss_count.lookup_or_init(&key, &zero);
(*val) += ctx->sample_period;
return 0;
}
int on_cache_ref(struct bpf_perf_event_data *ctx) {
struct key_t key = {};
get_key(&key);
u64 zero = 0, *val;
val = ref_count.lookup_or_init(&key, &zero);
(*val) += ctx->sample_period;
return 0;
}
""")

# Attach one handler per hardware cache event, misses first and then
# references, both sampled with the period chosen on the command line.
for hw_event, handler in (
        (PerfHWConfig.CACHE_MISSES, "on_cache_miss"),
        (PerfHWConfig.CACHE_REFERENCES, "on_cache_ref")):
    b.attach_perf_event(
        ev_type=PerfType.HARDWARE, ev_config=hw_event,
        fn_name=handler, sample_period=args.sample_period)

print("Running for {} seconds or hit Ctrl-C to end.".format(args.duration))

# Collect samples until the duration elapses or the user interrupts.
try:
    sleep(float(args.duration))
except KeyboardInterrupt:
    # From here on a further Ctrl-C only prints an empty line instead of
    # raising, so the report below is not cut short.
    signal.signal(signal.SIGINT, lambda signum, stack: print())

# Snapshot the miss counters, keyed the same way as the reference table, so
# each reference row can be paired with its misses.
miss_count = {}
for (k, v) in b.get_table('miss_count').items():
    miss_count[(k.pid, k.cpu, k.name)] = v.value

# Build the header with the same column widths as the data rows so the
# two stay aligned.
print('{:<8s} {:<16s} {:<4s} {:>12s} {:>12s} {:>7s}'.format(
    'PID', 'NAME', 'CPU', 'REFERENCE', 'MISS', 'HIT%'))
tot_ref = 0
tot_miss = 0
for (k, v) in b.get_table('ref_count').items():
    miss = miss_count.get((k.pid, k.cpu, k.name), 0)
    tot_ref += v.value
    tot_miss += miss
    # This happens on some PIDs due to missed counts caused by sampling
    hit = (v.value - miss) if (v.value >= miss) else 0
    # Guard the ratio so a zero counter cannot abort the report.
    hit_pct = (float(hit) / float(v.value)) * 100.0 if v.value else 0.0
    # The task name is bytes under Python 3, and the 's' format spec only
    # accepts str there.
    name = k.name
    if not isinstance(name, str):
        name = name.decode('utf-8', 'replace')
    print('{:<8d} {:<16s} {:<4d} {:>12d} {:>12d} {:>6.2f}%'.format(
        k.pid, name, k.cpu, v.value, miss, hit_pct))

# With no sampled references there is nothing to divide by; report 0%.
if tot_ref:
    tot_hit_pct = (float(tot_ref - tot_miss) / float(tot_ref)) * 100.0
else:
    tot_hit_pct = 0.0
print('Total References: {} Total Misses: {} Hit Rate: {:.2f}%'.format(
    tot_ref, tot_miss, tot_hit_pct))
Loading

0 comments on commit 0ef9ec4

Please sign in to comment.