From 3069caa01c4dc0772c2fa25b11eaf478c85d9ad3 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Mon, 1 Aug 2016 18:12:11 -0700 Subject: [PATCH] add open_perf_event api for reading perf counters (#643) * add open_perf_event api for reading perf counters Though the rewriter table method existed, we were not yet opening up the perf counter in the proper way for it to be read. Introduce a c function to enable attaching perf counters to cpu-indexed table slots. The python side opens, assigns, then immediately closes the fd. Only the kernel keeps a reference, so closing the table or deleting the entry will be sufficient to free up the fd when finished. Signed-off-by: Brenden Blanco * Skip perf array hw counter test if unsupported Hardware counters are not available in all places (some VM environments), so gracefully skip on the particular error condition. Signed-off-by: Brenden Blanco --- docs/reference_guide.md | 58 +++++++++++++++++++++++------ src/cc/libbpf.c | 22 +++++++++++ src/python/bcc/libbcc.py | 2 + src/python/bcc/perf.py | 1 + src/python/bcc/table.py | 66 ++++++++++++++++++++++++++++++++- tests/python/CMakeLists.txt | 2 + tests/python/test_perf_event.py | 47 +++++++++++++++++++++++ 7 files changed, 184 insertions(+), 14 deletions(-) create mode 100755 tests/python/test_perf_event.py diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 47cc202b6746..0e51404d6c49 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -30,12 +30,14 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s - [2. BPF_HASH](#2-bpf_hash) - [3. BPF_HISTOGRAM](#3-bpf_histogram) - [4. BPF_STACK_TRACE](#4-bpf_stack_trace) - - [5. map.lookup()](#5-maplookup) - - [6. map.lookup_or_init()](#6-maplookup_or_init) - - [7. map.delete()](#7-mapdelete) - - [8. map.update()](#8-mapupdate) - - [9. map.increment()](#9-mapincrement) - - [10. map.get_stackid()](#10-mapget_stackid) + - [5. BPF_PERF_ARRAY](#5-bpf_perf_array) + - [6. 
map.lookup()](#6-maplookup) + - [7. map.lookup_or_init()](#7-maplookup_or_init) + - [8. map.delete()](#8-mapdelete) + - [9. map.update()](#9-mapupdate) + - [10. map.increment()](#10-mapincrement) + - [11. map.get_stackid()](#11-mapget_stackid) + - [12. map.perf_read()](#12-mapperf_read) - [bcc Python](#bcc-python) - [Initialization](#initialization) @@ -442,7 +444,30 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=BPF_STACK_TRACE+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=BPF_STACK_TRACE+path%3Atools&type=Code) -### 5. map.lookup() +### 5. BPF_PERF_ARRAY + +Syntax: ```BPF_PERF_ARRAY(name, max_entries)``` + +Creates a perf array named ```name```, with a maximum entry count provided, which must be equal to the number of system cpus. These maps are used to fetch hardware performance counters. + +For example: + +```Python +text=""" +BPF_PERF_ARRAY(cpu_cycles, NUM_CPUS); +""" +b = bcc.BPF(text=text, cflags=["-DNUM_CPUS=%d" % multiprocessing.cpu_count()]) +b["cpu_cycles"].open_perf_event(b["cpu_cycles"].HW_CPU_CYCLES) +``` + +This creates a perf array named ```cpu_cycles```, with the number of entries equal to the number of cpus/cores. The array is configured so that later calling map.perf_read() will return a hardware-calculated counter of the number of cycles elapsed from some point in the past. Only one type of hardware counter may be configured per table at a time. + +Methods (covered later): map.perf_read(). + +Examples in situ: +[search /tests](https://github.com/iovisor/bcc/search?q=BPF_PERF_ARRAY+path%3Atests&type=Code) + +### 6. map.lookup() Syntax: ```*val map.lookup(&key)``` @@ -452,7 +477,7 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=lookup+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=lookup+path%3Atools&type=Code) -### 6. map.lookup_or_init() +### 7. 
map.lookup_or_init() Syntax: ```*val map.lookup_or_init(&key, &zero)``` @@ -462,7 +487,7 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=lookup_or_init+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=lookup_or_init+path%3Atools&type=Code) -### 7. map.delete() +### 8. map.delete() Syntax: ```map.delete(&key)``` @@ -472,7 +497,7 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=delete+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=delete+path%3Atools&type=Code) -### 8. map.update() +### 9. map.update() Syntax: ```map.update(&key, &val)``` @@ -482,7 +507,7 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=update+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=update+path%3Atools&type=Code) -### 9. map.increment() +### 10. map.increment() Syntax: ```map.increment(&key)``` @@ -492,7 +517,7 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=increment+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=increment+path%3Atools&type=Code) -### 10. map.get_stackid() +### 11. map.get_stackid() Syntax: ```int map.get_stackid(void *ctx, u64 flags)``` @@ -502,6 +527,15 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=get_stackid+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=get_stackid+path%3Atools&type=Code) +### 12. map.perf_read() + +Syntax: ```u64 map.perf_read(u32 cpu)``` + +This returns the hardware performance counter as configured in [5. 
BPF_PERF_ARRAY](#5-bpf_perf_array) + +Examples in situ: +[search /tests](https://github.com/iovisor/bcc/search?q=perf_read+path%3Atests&type=Code) + # bcc Python ## Initialization diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 0068e1f68963..2ad8b3b67260 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -409,7 +410,28 @@ void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, return NULL; } +int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) { + int fd; + struct perf_event_attr attr = {}; + + attr.sample_period = LONG_MAX; + attr.type = type; + attr.config = config; + + fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "perf_event_open: %s\n", strerror(errno)); + return -1; + } + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + perror("ioctl(PERF_EVENT_IOC_ENABLE)"); + close(fd); + return -1; + } + + return fd; +} int bpf_attach_xdp(const char *dev_name, int progfd) { struct sockaddr_nl sa; diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index a893e192ce38..257a83d86b00 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -101,6 +101,8 @@ lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p] lib.bpf_open_perf_buffer.restype = ct.c_void_p lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int] +lib.bpf_open_perf_event.restype = ct.c_int +lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int] lib.perf_reader_poll.restype = ct.c_int lib.perf_reader_poll.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p), ct.c_int] lib.perf_reader_free.restype = None diff --git a/src/python/bcc/perf.py b/src/python/bcc/perf.py index 25d8ea225fb6..ea155915ca4c 100644 --- a/src/python/bcc/perf.py +++ b/src/python/bcc/perf.py @@ -50,6 +50,7 @@ class perf_event_attr(ct.Structure): 
PERF_TYPE_HARDWARE = 0 PERF_TYPE_SOFTWARE = 1 PERF_TYPE_TRACEPOINT = 2 + PERF_TYPE_HW_CACHE = 3 # perf_event_sample_format PERF_SAMPLE_RAW = 1024 # it's a u32; could also try zero args diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index 26218832c16d..f34cb7bb4392 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -15,8 +15,10 @@ from collections import MutableMapping import ctypes as ct import multiprocessing +import os from .libbcc import lib, _RAW_CB_TYPE +from .perf import Perf from subprocess import check_output BPF_MAP_TYPE_HASH = 1 @@ -165,7 +167,8 @@ def __setitem__(self, key, leaf): ct.cast(key_p, ct.c_void_p), ct.cast(leaf_p, ct.c_void_p), 0) if res < 0: - raise Exception("Could not update table") + errstr = os.strerror(ct.get_errno()) + raise Exception("Could not update table: %s" % errstr) # override the MutableMapping's implementation of these since they # don't handle KeyError nicely @@ -366,8 +369,43 @@ def __setitem__(self, key, leaf): leaf = self.Leaf(leaf.fd) super(ProgArray, self).__setitem__(key, leaf) - class PerfEventArray(ArrayBase): + class Event(object): + def __init__(self, typ, config): + self.typ = typ + self.config = config + + HW_CPU_CYCLES = Event(Perf.PERF_TYPE_HARDWARE, 0) + HW_INSTRUCTIONS = Event(Perf.PERF_TYPE_HARDWARE, 1) + HW_CACHE_REFERENCES = Event(Perf.PERF_TYPE_HARDWARE, 2) + HW_CACHE_MISSES = Event(Perf.PERF_TYPE_HARDWARE, 3) + HW_BRANCH_INSTRUCTIONS = Event(Perf.PERF_TYPE_HARDWARE, 4) + HW_BRANCH_MISSES = Event(Perf.PERF_TYPE_HARDWARE, 5) + HW_BUS_CYCLES = Event(Perf.PERF_TYPE_HARDWARE, 6) + HW_STALLED_CYCLES_FRONTEND = Event(Perf.PERF_TYPE_HARDWARE, 7) + HW_STALLED_CYCLES_BACKEND = Event(Perf.PERF_TYPE_HARDWARE, 8) + HW_REF_CPU_CYCLES = Event(Perf.PERF_TYPE_HARDWARE, 9) + + # not yet supported, wip + #HW_CACHE_L1D_READ = Event(Perf.PERF_TYPE_HW_CACHE, 0<<0|0<<8|0<<16) + #HW_CACHE_L1D_READ_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 0<<0|0<<8|1<<16) + #HW_CACHE_L1D_WRITE = 
Event(Perf.PERF_TYPE_HW_CACHE, 0<<0|1<<8|0<<16) + #HW_CACHE_L1D_WRITE_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 0<<0|1<<8|1<<16) + #HW_CACHE_L1D_PREF = Event(Perf.PERF_TYPE_HW_CACHE, 0<<0|2<<8|0<<16) + #HW_CACHE_L1D_PREF_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 0<<0|2<<8|1<<16) + #HW_CACHE_L1I_READ = Event(Perf.PERF_TYPE_HW_CACHE, 1<<0|0<<8|0<<16) + #HW_CACHE_L1I_READ_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 1<<0|0<<8|1<<16) + #HW_CACHE_L1I_WRITE = Event(Perf.PERF_TYPE_HW_CACHE, 1<<0|1<<8|0<<16) + #HW_CACHE_L1I_WRITE_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 1<<0|1<<8|1<<16) + #HW_CACHE_L1I_PREF = Event(Perf.PERF_TYPE_HW_CACHE, 1<<0|2<<8|0<<16) + #HW_CACHE_L1I_PREF_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 1<<0|2<<8|1<<16) + #HW_CACHE_LL_READ = Event(Perf.PERF_TYPE_HW_CACHE, 2<<0|0<<8|0<<16) + #HW_CACHE_LL_READ_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 2<<0|0<<8|1<<16) + #HW_CACHE_LL_WRITE = Event(Perf.PERF_TYPE_HW_CACHE, 2<<0|1<<8|0<<16) + #HW_CACHE_LL_WRITE_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 2<<0|1<<8|1<<16) + #HW_CACHE_LL_PREF = Event(Perf.PERF_TYPE_HW_CACHE, 2<<0|2<<8|0<<16) + #HW_CACHE_LL_PREF_MISS = Event(Perf.PERF_TYPE_HW_CACHE, 2<<0|2<<8|1<<16) + def __init__(self, *args, **kwargs): super(PerfEventArray, self).__init__(*args, **kwargs) @@ -404,6 +442,30 @@ def close_perf_buffer(self, key): self.bpf._del_kprobe((id(self), key)) del self._cbs[key] + def _open_perf_event(self, cpu, typ, config): + fd = lib.bpf_open_perf_event(typ, config, -1, cpu) + if fd < 0: + raise Exception("bpf_open_perf_event failed") + try: + self[self.Key(cpu)] = self.Leaf(fd) + finally: + # the fd is kept open in the map itself by the kernel + os.close(fd) + + def open_perf_event(self, ev): + """open_perf_event(ev) + + Configures the table such that calls from the bpf program to + table.perf_read(bpf_get_smp_processor_id()) will return the hardware + counter denoted by event ev on the local cpu. 
+ """ + if not isinstance(ev, self.Event): + raise Exception("argument must be an Event, got %s" % type(ev)) + + for i in range(0, multiprocessing.cpu_count()): + self._open_perf_event(i, ev.typ, ev.config) + + class PerCpuHash(HashTable): def __init__(self, *args, **kwargs): self.reducer = kwargs.pop("reducer", None) diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt index 3fb2f1fa4e4d..d8e85365d920 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -52,6 +52,8 @@ add_test(NAME py_test_stackid WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_stackid sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_stackid.py) add_test(NAME py_test_tracepoint WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_test_tracepoint sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tracepoint.py) +add_test(NAME py_test_perf_event WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMAND ${TEST_WRAPPER} py_test_perf_event sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_perf_event.py) add_test(NAME py_test_dump_func WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_dump_func simple ${CMAKE_CURRENT_SOURCE_DIR}/test_dump_func.py) diff --git a/tests/python/test_perf_event.py b/tests/python/test_perf_event.py new file mode 100755 index 000000000000..224faffb3d9e --- /dev/null +++ b/tests/python/test_perf_event.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PLUMgrid +# Licensed under the Apache License, Version 2.0 (the "License") + +import bcc +import ctypes +import multiprocessing +import os +import time +import unittest + +class TestPerfCounter(unittest.TestCase): + def test_cycles(self): + text = """ +BPF_PERF_ARRAY(cnt1, NUM_CPUS); +BPF_TABLE("array", u32, u64, prev, NUM_CPUS); +BPF_HISTOGRAM(dist); +int kprobe__sys_getuid(void *ctx) { + u32 cpu = bpf_get_smp_processor_id(); + u64 val = cnt1.perf_read(cpu); + prev.update(&cpu, &val); + return 0; +} +int kretprobe__sys_getuid(void *ctx) { + u32 
cpu = bpf_get_smp_processor_id(); + u64 val = cnt1.perf_read(cpu); + u64 *prevp = prev.lookup(&cpu); + if (prevp) + dist.increment(bpf_log2l(val - *prevp)); + return 0; +} +""" + b = bcc.BPF(text=text, debug=0, + cflags=["-DNUM_CPUS=%d" % multiprocessing.cpu_count()]) + cnt1 = b["cnt1"] + try: + cnt1.open_perf_event(cnt1.HW_CPU_CYCLES) + except Exception: + if ctypes.get_errno() == 2: + self.skipTest("hardware events unsupported") + raise + for i in range(0, 100): + os.getuid() + b["dist"].print_log2_hist() + +if __name__ == "__main__": + unittest.main()