Skip to content

Commit

Permalink
Add perf_output support for high rate events
Browse files Browse the repository at this point in the history
This adds support for the bpf_perf_event_output command. This is
intended for per-process events from bpf to userspace at high rate. The
events from the bpf program can be completely customized.

Signed-off-by: Brenden Blanco <[email protected]>
  • Loading branch information
Brenden Blanco committed Nov 6, 2015
1 parent 33d0003 commit d0daf6a
Show file tree
Hide file tree
Showing 8 changed files with 237 additions and 40 deletions.
43 changes: 43 additions & 0 deletions examples/tracing/trace_perf_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python
# Copyright (c) PLUMgrid, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")

# This is an example of sending custom events from a BPF program to
# userspace through a perf output buffer and counting them.
# run in project examples directory with:
# sudo ./trace_perf_output.py

import atexit
from bcc import BPF
import ctypes

# Number of events delivered to userspace through the perf buffer.
counter = 0
def cb(foo, data, size):
    """Perf buffer callback: count one received event.

    The arguments are ignored. They are presumably the callback cookie, a
    pointer to the raw event payload, and the payload size in bytes, matching
    how perf_reader.c invokes raw_cb -- TODO confirm against the Python
    binding.
    """
    global counter
    counter += 1

# BPF program text: on every sys_write kprobe hit it submits a struct holding
# the current bpf_ktime_get_ns() timestamp through the "events" perf array
# (index 0) and increments slot 0 of the "counters" array, so the number of
# submitted events can be compared with the number received in userspace.
prog = """
BPF_PERF_ARRAY(events, 2);
BPF_TABLE("array", int, u64, counters, 10);
int kprobe__sys_write(void *ctx) {
struct {
u64 ts;
} data = {bpf_ktime_get_ns()};
if (events.perf_output(ctx, 0, &data, sizeof(data)) < 0)
bpf_trace_printk("perf_output failed\\n");
int zero = 0;
u64 *val = counters.lookup(&zero);
if (val) lock_xadd(val, 1);
return 0;
}
"""
# Compile and load the program, then register cb as the callback for the
# perf buffer at index 0 of the "events" table.
b = BPF(text=prog)
b["events"].open_perf_buffer(0, cb, None)

@atexit.register
def print_counter():
    """At interpreter exit, print the userspace vs. in-kernel event counts.

    The first number is how many times cb ran; the second is slot 0 of the
    BPF "counters" table, which the BPF program increments on every probe hit.
    """
    global counter
    global b
    print("counter = %d vs %d" % (counter, b["counters"][ctypes.c_int(0)].value))

# Poll the open perf readers forever; cb runs for each event delivered.
while 1:
    b.kprobe_poll()
23 changes: 23 additions & 0 deletions src/cc/export/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ struct _name##_table_t { \
__attribute__((section("maps/" _table_type))) \
struct _name##_table_t _name

/* Declares a perf event array table called _name with _max_entries slots,
 * placed in the "maps/perf_array" section. The perf_read and perf_output
 * members exist to give the table methods a signature: the clang frontend
 * rewrites _name.perf_output(...) into bpf_perf_event_output(...) and
 * _name.perf_read(...) into a bpf_perf_event_read call.
 */
#define BPF_PERF_ARRAY(_name, _max_entries) \
struct _name##_table_t { \
int key; \
u32 leaf; \
/* counter = map.perf_read(index) */ \
u64 (*perf_read) (int); \
/* map.perf_output(ctx, index, data, data_size) */ \
int (*perf_output) (void *, int, void *, u32); \
u32 data[_max_entries]; \
}; \
__attribute__((section("maps/perf_array"))) \
struct _name##_table_t _name

#define BPF_HASH1(_name) \
BPF_TABLE("hash", u64, u64, _name, 10240)
#define BPF_HASH2(_name, _key_type) \
Expand Down Expand Up @@ -117,6 +130,16 @@ static int (*bpf_skb_get_tunnel_key)(void *ctx, void *to, u32 size, u64 flags) =
(void *) BPF_FUNC_skb_get_tunnel_key;
static int (*bpf_skb_set_tunnel_key)(void *ctx, void *from, u32 size, u64 flags) =
(void *) BPF_FUNC_skb_set_tunnel_key;
static int (*bpf_perf_event_read)(void *map, u32 index) =
(void *) BPF_FUNC_perf_event_read;
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0)
static int (*bpf_redirect)(int ifindex, u32 flags) =
(void *) BPF_FUNC_redirect;
static u32 (*bpf_get_route_realm)(void *ctx) =
(void *) BPF_FUNC_get_route_realm;
static int (*bpf_perf_event_output)(void *ctx, void *map, u32 index, void *data, u32 size) =
(void *) BPF_FUNC_perf_event_output;
#endif

/* llvm builtin functions that eBPF C program may use to
Expand Down
43 changes: 30 additions & 13 deletions src/cc/frontends/clang/b_frontend_action.cc
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,13 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
}
txt += "typeof(" + name + ".leaf) *_leaf = " + lookup + ", &_key); ";
txt += "if (_leaf) (*_leaf)++; })";
} else if (memb_name == "perf_output") {
string name = Ref->getDecl()->getName();
string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
Call->getArg(0)->getLocEnd()));
string args_other = rewriter_.getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
Call->getArg(3)->getLocEnd()));
txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + "), " + args_other + ")";
} else {
if (memb_name == "lookup") {
prefix = "bpf_map_lookup_elem";
Expand All @@ -345,6 +352,9 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
} else if (memb_name == "call") {
prefix = "bpf_tail_call_";
suffix = ")";
} else if (memb_name == "perf_read") {
prefix = "bpf_perf_event_read";
suffix = ")";
} else {
C.getDiagnostics().Report(Call->getLocStart(), diag::err_expected)
<< "valid bpf_table operation";
Expand Down Expand Up @@ -482,6 +492,13 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
}
const RecordDecl *RD = R->getDecl()->getDefinition();

int major = 0, minor = 0;
struct utsname un;
if (uname(&un) == 0) {
// release format: <major>.<minor>.<revision>[-<othertag>]
sscanf(un.release, "%d.%d.", &major, &minor);
}

TableDesc table;
table.name = Decl->getName();

Expand Down Expand Up @@ -519,20 +536,20 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
diag_.Report(Decl->getLocStart(), diag_id) << table.leaf_desc;
}
} else if (A->getName() == "maps/prog") {
struct utsname un;
if (uname(&un) == 0) {
int major = 0, minor = 0;
// release format: <major>.<minor>.<revision>[-<othertag>]
sscanf(un.release, "%d.%d.", &major, &minor);
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,2,0))
map_type = BPF_MAP_TYPE_PROG_ARRAY;
}
if (map_type == BPF_MAP_TYPE_UNSPEC) {
C.getDiagnostics().Report(Decl->getLocStart(), diag::err_expected)
<< "kernel supporting maps/prog";
return false;
}
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,2,0))
map_type = BPF_MAP_TYPE_PROG_ARRAY;
} else if (A->getName() == "maps/perf_array") {
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,3,0))
map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
}

if (map_type == BPF_MAP_TYPE_UNSPEC) {
unsigned diag_id = C.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error,
"unsupported map type: %0");
C.getDiagnostics().Report(Decl->getLocStart(), diag_id) << A->getName();
return false;
}

table.type = map_type;
table.fd = bpf_create_map(map_type, table.key_size, table.leaf_size, table.max_entries);
if (table.fd < 0) {
Expand Down
51 changes: 43 additions & 8 deletions src/cc/libbpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,8 @@ int bpf_attach_socket(int sock, int prog) {

static int bpf_attach_tracing_event(int progfd, const char *event_path,
struct perf_reader *reader, int pid, int cpu, int group_fd) {
int efd = -1, rc = -1, pfd = -1;
ssize_t bytes = -1;
int efd = -1, rc = -1, pfd;
ssize_t bytes;
char buf[256];
struct perf_event_attr attr = {};

Expand All @@ -206,8 +206,9 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
perror("perf_event_open");
goto cleanup;
}
perf_reader_set_fd(reader, pfd);

if (perf_reader_mmap(reader, pfd, attr.sample_type) < 0)
if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
goto cleanup;

if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
Expand All @@ -219,14 +220,11 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
goto cleanup;
}

rc = pfd;
pfd = -1;
rc = 0;

cleanup:
if (efd >= 0)
close(efd);
if (pfd >= 0)
close(pfd);

return rc;
}
Expand All @@ -239,7 +237,7 @@ void * bpf_attach_kprobe(int progfd, const char *event,
char buf[256];
struct perf_reader *reader = NULL;

reader = perf_reader_new(-1, 8, cb, cb_cookie);
reader = perf_reader_new(cb, NULL, cb_cookie);
if (!reader)
goto cleanup;

Expand Down Expand Up @@ -292,3 +290,40 @@ int bpf_detach_kprobe(const char *event_desc) {
return rc;
}

/*
 * Open a software perf event for BPF output and wrap it in a perf_reader.
 *
 * raw_cb and cb_cookie are handed to perf_reader_new(); the tracepoint
 * callback slot is left NULL. The event is opened with pid = -1, cpu = 0 and
 * no group, requesting PERF_SAMPLE_RAW samples, then mmap'ed and enabled.
 *
 * Returns the reader on success. On any failure the reader is released with
 * perf_reader_free() and NULL is returned.
 */
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie) {
  struct perf_reader *reader = perf_reader_new(NULL, raw_cb, cb_cookie);
  if (!reader)
    return NULL;

  struct perf_event_attr attr = {};
  attr.type = PERF_TYPE_SOFTWARE;
  attr.config = PERF_COUNT_SW_BPF_OUTPUT;
  attr.sample_type = PERF_SAMPLE_RAW;

  int pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    perror("perf_event_open");
    goto fail;
  }
  /* The reader must know its fd before it can be mmap'ed. */
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
    goto fail;

  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto fail;
  }

  return reader;

fail:
  perf_reader_free(reader);
  return NULL;
}
78 changes: 65 additions & 13 deletions src/cc/perf_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,27 +26,32 @@
#include "libbpf.h"
#include "perf_reader.h"

int perf_reader_page_cnt = 8;

// State for one reader of an mmap'ed perf event buffer.
struct perf_reader {
// invoked by parse_tracepoint() for tracepoint samples
perf_reader_cb cb;
// invoked by parse_sw() with the raw payload of software-event samples
perf_reader_raw_cb raw_cb;
void *cb_cookie; // passed back as the first argument of cb / raw_cb
void *buf; // for keeping segmented data
size_t buf_size;
// address returned by mmap() in perf_reader_mmap()
void *base;
// getpagesize() at creation time
int page_size;
// copied from perf_reader_page_cnt at creation time; the mapping is page_cnt + 1 pages
int page_cnt;
// perf event fd; -1 until perf_reader_set_fd() is called
int fd;
// event type recorded by perf_reader_mmap(); selects the sample parser in event_read()
uint32_t type;
// sample_type recorded by perf_reader_mmap(); tells the parsers which fields a sample holds
uint64_t sample_type;
};

struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie) {
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) {
struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
if (!reader)
return NULL;
reader->cb = cb;
reader->raw_cb = raw_cb;
reader->cb_cookie = cb_cookie;
reader->fd = fd;
reader->fd = -1;
reader->page_size = getpagesize();
reader->page_cnt = page_cnt;
reader->page_cnt = perf_reader_page_cnt;
return reader;
}

Expand All @@ -61,18 +66,20 @@ void perf_reader_free(void *ptr) {
}
}

int perf_reader_mmap(struct perf_reader *reader, int fd, uint64_t sample_type) {
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type) {
int mmap_size = reader->page_size * (reader->page_cnt + 1);

if (!reader->cb)
return 0;
if (reader->fd < 0) {
fprintf(stderr, "%s: reader fd is not set\n", __FUNCTION__);
return -1;
}

reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, fd, 0);
reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, reader->fd, 0);
if (reader->base == MAP_FAILED) {
perror("mmap");
return -1;
}
reader->fd = fd;
reader->type = type;
reader->sample_type = sample_type;

return 0;
Expand All @@ -90,7 +97,7 @@ struct perf_sample_trace_kprobe {
uint64_t ip;
};

static void sample_parse(struct perf_reader *reader, void *data, int size) {
static void parse_tracepoint(struct perf_reader *reader, void *data, int size) {
uint8_t *ptr = data;
struct perf_event_header *header = (void *)data;

Expand Down Expand Up @@ -153,6 +160,40 @@ static void sample_parse(struct perf_reader *reader, void *data, int size) {
reader->cb(reader->cb_cookie, tk ? tk->common.pid : -1, num_callchain, callchain);
}

/*
 * Parse one PERF_RECORD_SAMPLE record of a software event and hand its raw
 * payload to reader->raw_cb.
 *
 * reader: supplies sample_type (which fields the record carries), raw_cb and
 *         cb_cookie.
 * data:   start of the record, beginning with its struct perf_event_header.
 * size:   total record length in bytes, header included.
 *
 * Malformed records are reported on stderr and dropped. The callback is
 * invoked only when a raw payload was actually parsed: if sample_type lacks
 * PERF_SAMPLE_RAW there is no payload and nothing to deliver.
 */
static void parse_sw(struct perf_reader *reader, void *data, int size) {
  uint8_t *ptr = data;
  struct perf_event_header *header = (void *)data;
  size_t remaining;

  struct {
    uint32_t size;
    char data[];
  } *raw = NULL;

  if (size < 0 || (size_t)size < sizeof(*header)) {
    fprintf(stderr, "%s: corrupt sample header\n", __FUNCTION__);
    return;
  }
  ptr += sizeof(*header);
  remaining = (size_t)size - sizeof(*header);

  if (reader->sample_type & PERF_SAMPLE_RAW) {
    /* The length field itself must fit before it may be read. */
    if (remaining < sizeof(raw->size)) {
      fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
      return;
    }
    raw = (void *)ptr;
    ptr += sizeof(raw->size);
    remaining -= sizeof(raw->size);

    if (raw->size > remaining) {
      fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
      return;
    }
    ptr += raw->size;
    remaining -= raw->size;
  }

  // sanity check
  if (remaining != 0) {
    fprintf(stderr, "%s: extra data at end of sample\n", __FUNCTION__);
    return;
  }

  /* raw stays NULL when the sample carries no raw section. */
  if (raw && reader->raw_cb)
    reader->raw_cb(reader->cb_cookie, raw->data, raw->size);
}

static uint64_t read_data_head(struct perf_event_mmap_page *perf_header) {
uint64_t data_head = *((volatile uint64_t *)&perf_header->data_head);
asm volatile("" ::: "memory");
Expand Down Expand Up @@ -194,12 +235,16 @@ static void event_read(struct perf_reader *reader) {
ptr = reader->buf;
}

if (e->type == PERF_RECORD_LOST)
if (e->type == PERF_RECORD_LOST) {
fprintf(stderr, "Lost %lu samples\n", *(uint64_t *)(ptr + sizeof(*e)));
else if (e->type == PERF_RECORD_SAMPLE)
sample_parse(reader, ptr, e->size);
else
} else if (e->type == PERF_RECORD_SAMPLE) {
if (reader->type == PERF_TYPE_TRACEPOINT)
parse_tracepoint(reader, ptr, e->size);
else if (reader->type == PERF_TYPE_SOFTWARE)
parse_sw(reader, ptr, e->size);
} else {
fprintf(stderr, "%s: unknown sample type %d\n", __FUNCTION__, e->type);
}

write_data_tail(perf_header, perf_header->data_tail + e->size);
}
Expand All @@ -223,3 +268,10 @@ int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout)
return 0;
}

// Record fd as the reader's perf event fd. perf_reader_mmap() refuses to map
// a reader whose fd is still negative, so this must be called first.
void perf_reader_set_fd(struct perf_reader *reader, int fd) {
reader->fd = fd;
}

// Return the fd stored in the reader; it is -1 if perf_reader_set_fd() has
// not been called since the reader was created.
int perf_reader_fd(struct perf_reader *reader) {
return reader->fd;
}
6 changes: 4 additions & 2 deletions src/cc/perf_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

struct perf_reader;

struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie);
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie);
void perf_reader_free(void *ptr);
int perf_reader_mmap(struct perf_reader *reader, int fd, unsigned long sample_type);
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type);
int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
int perf_reader_fd(struct perf_reader *reader);
void perf_reader_set_fd(struct perf_reader *reader, int fd);
Loading

0 comments on commit d0daf6a

Please sign in to comment.