From 5f5687e468942b2415dc72227f5032380828a695 Mon Sep 17 00:00:00 2001 From: Mark Drayton Date: Mon, 20 Feb 2017 18:13:03 +0000 Subject: [PATCH] Make perf ring buffer size configurable As discussed in #966, this PR makes the size of the ring buffer used to send data to userspace configurable. It changes the Python, Lua and C++ APIs to expose this knob. It also defaults the buffer size to a larger value (64 pages per CPU, an 8x increase) for several tools which produce a lot of output, as well as making it configurable in `trace` via a `-b` flag. --- docs/reference_guide.md | 4 ++-- examples/lua/bashreadline.lua | 2 +- src/cc/BPF.cc | 7 +++++-- src/cc/BPF.h | 5 ++++- src/cc/BPFTable.cc | 8 ++++---- src/cc/BPFTable.h | 6 ++++-- src/cc/libbpf.c | 13 ++++++++----- src/cc/libbpf.h | 3 ++- src/cc/perf_reader.c | 7 +++---- src/cc/perf_reader.h | 3 ++- src/lua/bcc/libbcc.lua | 2 +- src/lua/bcc/table.lua | 11 ++++++----- src/python/bcc/libbcc.py | 2 +- src/python/bcc/table.py | 15 ++++++++++----- tools/biosnoop.lua | 6 +++--- tools/biosnoop.py | 2 +- tools/btrfsslower.py | 2 +- tools/cpuunclaimed.py | 2 +- tools/dbslower.py | 2 +- tools/dcsnoop.py | 2 +- tools/ext4slower.py | 2 +- tools/fileslower.py | 2 +- tools/mysqld_qslower.py | 2 +- tools/opensnoop.py | 2 +- tools/stacksnoop.lua | 2 +- tools/statsnoop.py | 2 +- tools/tcplife.py | 4 ++-- tools/trace.py | 10 +++++++++- tools/trace_example.txt | 32 ++++++++++++++++++++++++++------ tools/xfsslower.py | 2 +- tools/zfsslower.py | 2 +- 31 files changed, 106 insertions(+), 60 deletions(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index f97644337ca7..9e10a9906f1d 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -865,9 +865,9 @@ These are equivalent. ### 2. 
open_perf_buffer() -Syntax: ```table.open_perf_buffers(callback)``` +Syntax: ```table.open_perf_buffer(callback, page_cnt=N)``` -This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. +This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. The size of the perf ring buffer can be specified via the ```page_cnt``` parameter, which must be a power of two number of pages and defaults to 8. Example: diff --git a/examples/lua/bashreadline.lua b/examples/lua/bashreadline.lua index ebb4c35932bd..045fbc0148f5 100755 --- a/examples/lua/bashreadline.lua +++ b/examples/lua/bashreadline.lua @@ -24,7 +24,7 @@ return function(BPF) print("%-9s %-6d %s" % {os.date("%H:%M:%S"), tonumber(event.pid), ffi.string(event.str)}) end - b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }") + b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }", nil) print("%-9s %-6s %s" % {"TIME", "PID", "COMMAND"}) b:kprobe_poll_loop() diff --git a/src/cc/BPF.cc b/src/cc/BPF.cc index 9e1c23a91a32..809bfdb7c34c 100644 --- a/src/cc/BPF.cc +++ b/src/cc/BPF.cc @@ -392,11 +392,14 @@ StatusTuple BPF::detach_perf_event(uint32_t ev_type, uint32_t ev_config) { } StatusTuple BPF::open_perf_buffer(const std::string& name, - perf_reader_raw_cb cb, void* cb_cookie) { + perf_reader_raw_cb cb, void* cb_cookie, + int page_cnt) { if (perf_buffers_.find(name) == perf_buffers_.end()) perf_buffers_[name] = new BPFPerfBuffer(bpf_module_.get(), name); + if ((page_cnt & (page_cnt - 1)) != 0) 
+ return StatusTuple(-1, "open_perf_buffer page_cnt must be a power of two"); auto table = perf_buffers_[name]; - TRY2(table->open_all_cpu(cb, cb_cookie)); + TRY2(table->open_all_cpu(cb, cb_cookie, page_cnt)); return StatusTuple(0); } diff --git a/src/cc/BPF.h b/src/cc/BPF.h index ba2c15bd15bb..a4a88177a80d 100644 --- a/src/cc/BPF.h +++ b/src/cc/BPF.h @@ -27,6 +27,8 @@ #include "compat/linux/bpf.h" #include "libbpf.h" +static const int DEFAULT_PERF_BUFFER_PAGE_CNT = 8; + namespace ebpf { struct open_probe_t { @@ -96,7 +98,8 @@ class BPF { } StatusTuple open_perf_buffer(const std::string& name, perf_reader_raw_cb cb, - void* cb_cookie = nullptr); + void* cb_cookie = nullptr, + int page_cnt = DEFAULT_PERF_BUFFER_PAGE_CNT); StatusTuple close_perf_buffer(const std::string& name); void poll_perf_buffer(const std::string& name, int timeout = -1); diff --git a/src/cc/BPFTable.cc b/src/cc/BPFTable.cc index 837d5bd0b649..d0f2b99e6d25 100644 --- a/src/cc/BPFTable.cc +++ b/src/cc/BPFTable.cc @@ -67,11 +67,11 @@ std::vector BPFStackTable::get_stack_symbol(int stack_id, } StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu, - void* cb_cookie) { + void* cb_cookie, int page_cnt) { if (cpu_readers_.find(cpu) != cpu_readers_.end()) return StatusTuple(-1, "Perf buffer already open on CPU %d", cpu); auto reader = - static_cast(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu)); + static_cast(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu, page_cnt)); if (reader == nullptr) return StatusTuple(-1, "Unable to construct perf reader"); int reader_fd = perf_reader_fd(reader); @@ -86,12 +86,12 @@ StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu, } StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb, - void* cb_cookie) { + void* cb_cookie, int page_cnt) { if (cpu_readers_.size() != 0 || readers_.size() != 0) return StatusTuple(-1, "Previously opened perf buffer not cleaned"); for (int i: get_online_cpus()) { - auto res = open_on_cpu(cb, i, cb_cookie); 
+ auto res = open_on_cpu(cb, i, cb_cookie, page_cnt); if (res.code() != 0) { TRY2(close_all_cpu()); return res; diff --git a/src/cc/BPFTable.h b/src/cc/BPFTable.h index c5f805d6ee5c..98424f146fa4 100644 --- a/src/cc/BPFTable.h +++ b/src/cc/BPFTable.h @@ -126,12 +126,14 @@ class BPFPerfBuffer : protected BPFTableBase { : BPFTableBase(bpf_module, name) {} ~BPFPerfBuffer(); - StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie); + StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie, + int page_cnt); StatusTuple close_all_cpu(); void poll(int timeout); private: - StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie); + StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie, + int page_cnt); StatusTuple close_on_cpu(int cpu); std::map cpu_readers_; diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index f3d2881a2951..864b89e612b7 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -65,6 +65,8 @@ #define PERF_FLAG_FD_CLOEXEC (1UL << 3) #endif +static int probe_perf_reader_page_cnt = 8; + static __u64 ptr_to_u64(void *ptr) { return (__u64) (unsigned long) ptr; @@ -351,7 +353,7 @@ void * bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, con int n; snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid()); - reader = perf_reader_new(cb, NULL, cb_cookie); + reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt); if (!reader) goto error; @@ -411,7 +413,7 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con int n; snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid()); - reader = perf_reader_new(cb, NULL, cb_cookie); + reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt); if (!reader) goto error; @@ -493,7 +495,7 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category, char buf[256]; struct perf_reader *reader = NULL; - reader = perf_reader_new(cb, NULL, cb_cookie); + reader = 
perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt); if (!reader) goto error; @@ -515,12 +517,13 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) { return 0; } -void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu) { +void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, + int cpu, int page_cnt) { int pfd; struct perf_event_attr attr = {}; struct perf_reader *reader = NULL; - reader = perf_reader_new(NULL, raw_cb, cb_cookie); + reader = perf_reader_new(NULL, raw_cb, cb_cookie, page_cnt); if (!reader) goto error; diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index 07afaa94e3a3..0e9fda2aec1d 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -68,7 +68,8 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category, int group_fd, perf_reader_cb cb, void *cb_cookie); int bpf_detach_tracepoint(const char *tp_category, const char *tp_name); -void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu); +void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, + int cpu, int page_cnt); /* attached a prog expressed by progfd to the device specified in dev_name */ int bpf_attach_xdp(const char *dev_name, int progfd); diff --git a/src/cc/perf_reader.c b/src/cc/perf_reader.c index 793a0699995d..a9ed18eb8e9a 100644 --- a/src/cc/perf_reader.c +++ b/src/cc/perf_reader.c @@ -26,8 +26,6 @@ #include "libbpf.h" #include "perf_reader.h" -int perf_reader_page_cnt = 8; - struct perf_reader { perf_reader_cb cb; perf_reader_raw_cb raw_cb; @@ -42,7 +40,8 @@ struct perf_reader { uint64_t sample_type; }; -struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) { +struct perf_reader * perf_reader_new(perf_reader_cb cb, + perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt) { struct perf_reader *reader = calloc(1, sizeof(struct perf_reader)); if (!reader) return NULL; @@ -51,7 +50,7 
@@ struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_c reader->cb_cookie = cb_cookie; reader->fd = -1; reader->page_size = getpagesize(); - reader->page_cnt = perf_reader_page_cnt; + reader->page_cnt = page_cnt; return reader; } diff --git a/src/cc/perf_reader.h b/src/cc/perf_reader.h index 6376c47887e4..4bbb1e3bbf61 100644 --- a/src/cc/perf_reader.h +++ b/src/cc/perf_reader.h @@ -25,7 +25,8 @@ extern "C" { struct perf_reader; -struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie); +struct perf_reader * perf_reader_new(perf_reader_cb cb, + perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt); void perf_reader_free(void *ptr); int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type); int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout); diff --git a/src/lua/bcc/libbcc.lua b/src/lua/bcc/libbcc.lua index 762db9c686ef..fa28e21db373 100644 --- a/src/lua/bcc/libbcc.lua +++ b/src/lua/bcc/libbcc.lua @@ -54,7 +54,7 @@ void * bpf_attach_uprobe(int progfd, int attach_type, const char *ev_name, int bpf_detach_uprobe(const char *ev_name); -void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu); +void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu, int page_cnt); ]] ffi.cdef[[ diff --git a/src/lua/bcc/table.lua b/src/lua/bcc/table.lua index c01f006a6075..3144f22f3765 100644 --- a/src/lua/bcc/table.lua +++ b/src/lua/bcc/table.lua @@ -243,13 +243,14 @@ local function _perf_id(id, cpu) return string.format("bcc:perf_event_array:%d:%d", tonumber(id), cpu or 0) end -function PerfEventArray:_open_perf_buffer(cpu, callback, ctype) +function PerfEventArray:_open_perf_buffer(cpu, callback, ctype, page_cnt) local _cb = ffi.cast("perf_reader_raw_cb", function (cookie, data, size) callback(cpu, ctype(data)[0]) end) - local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu) + 
-- default to 8 pages per buffer + local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu, page_cnt or 8) assert(reader, "failed to open perf buffer") local fd = libbcc.perf_reader_fd(reader) @@ -258,11 +259,11 @@ function PerfEventArray:_open_perf_buffer(cpu, callback, ctype) self._callbacks[cpu] = _cb end -function PerfEventArray:open_perf_buffer(callback, data_type, ...) +function PerfEventArray:open_perf_buffer(callback, data_type, data_params, page_cnt) assert(data_type, "a data type is needed for callback conversion") - local ctype = ffi.typeof(data_type.."*", ...) + local ctype = ffi.typeof(data_type.."*", unpack(data_params or {})) for i = 0, Posix.cpu_count() - 1 do - self:_open_perf_buffer(i, callback, ctype) + self:_open_perf_buffer(i, callback, ctype, page_cnt) end end diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 9db1550c648a..099af8f328fd 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -102,7 +102,7 @@ lib.bpf_detach_tracepoint.restype = ct.c_int lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p] lib.bpf_open_perf_buffer.restype = ct.c_void_p -lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int] +lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int] lib.bpf_open_perf_event.restype = ct.c_int lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int] lib.perf_reader_poll.restype = ct.c_int diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index bb4cd4184a72..18851f2a41ef 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -507,20 +507,25 @@ def __delitem__(self, key): super(PerfEventArray, self).__delitem__(key) self.close_perf_buffer(key) - def open_perf_buffer(self, callback): + def open_perf_buffer(self, callback, page_cnt=8): """open_perf_buffers(callback) Opens a set of per-cpu ring buffer to receive custom perf event data from the bpf program. 
The callback will be invoked for each - event submitted from the kernel, up to millions per second. + event submitted from the kernel, up to millions per second. Use + page_cnt to change the size of the per-cpu ring buffer. The value + must be a power of two and defaults to 8. """ + if page_cnt & (page_cnt - 1) != 0: + raise Exception("Perf buffer page_cnt must be a power of two") + for i in get_online_cpus(): - self._open_perf_buffer(i, callback) + self._open_perf_buffer(i, callback, page_cnt) - def _open_perf_buffer(self, cpu, callback): + def _open_perf_buffer(self, cpu, callback, page_cnt): fn = _RAW_CB_TYPE(lambda _, data, size: callback(cpu, data, size)) - reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu) + reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu, page_cnt) if not reader: raise Exception("Could not open perf buffer") fd = lib.perf_reader_fd(reader) diff --git a/tools/biosnoop.lua b/tools/biosnoop.lua index ac08897bbeff..fac7f3bc453c 100644 --- a/tools/biosnoop.lua +++ b/tools/biosnoop.lua @@ -175,9 +175,9 @@ return function(BPF, utils) uint64_t sector; uint64_t len; uint64_t ts; - char disk_name[%d]; - char name[%d]; + char disk_name[$]; + char name[$]; } - ]] % {DISK_NAME_LEN, TASK_COMM_LEN}) + ]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64) bpf:kprobe_poll_loop() end diff --git a/tools/biosnoop.py b/tools/biosnoop.py index aa8a077b702d..3d77e52cec74 100755 --- a/tools/biosnoop.py +++ b/tools/biosnoop.py @@ -182,6 +182,6 @@ def print_event(cpu, data, size): start_ts = 1 # loop with callback to print_event -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py index fcc155e87067..8b34900ea6ff 100755 --- a/tools/btrfsslower.py +++ b/tools/btrfsslower.py @@ -343,6 +343,6 @@ def print_event(cpu, data, size): "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) # read events -b["events"].open_perf_buffer(print_event) 
+b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py index 9624b50d7d84..3998f9ffa02e 100755 --- a/tools/cpuunclaimed.py +++ b/tools/cpuunclaimed.py @@ -205,7 +205,7 @@ def print_event(cpu, data, size): trigger = int(0.8 * (1000000000 / frequency)) # read events -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: # allow some buffering by calling sleep(), to reduce the context switch # rate and lower overhead. diff --git a/tools/dbslower.py b/tools/dbslower.py index 70e0503cd8f9..6ddec41668c4 100755 --- a/tools/dbslower.py +++ b/tools/dbslower.py @@ -131,7 +131,7 @@ def print_event(cpu, data, size): (', '.join(map(str, args.pids)), args.threshold)) print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY")) -bpf["events"].open_perf_buffer(print_event) +bpf["events"].open_perf_buffer(print_event, page_cnt=64) while True: bpf.kprobe_poll() diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py index d162a66cef33..a72ba4145d74 100755 --- a/tools/dcsnoop.py +++ b/tools/dcsnoop.py @@ -153,6 +153,6 @@ def print_event(cpu, data, size): # header print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE")) -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/ext4slower.py b/tools/ext4slower.py index 20865a584359..495032535a11 100755 --- a/tools/ext4slower.py +++ b/tools/ext4slower.py @@ -337,6 +337,6 @@ def print_event(cpu, data, size): "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) # read events -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/fileslower.py b/tools/fileslower.py index 2ae4756e6cb0..ab299900c9a5 100755 --- a/tools/fileslower.py +++ b/tools/fileslower.py @@ -243,6 +243,6 @@ def print_event(cpu, data, size): time.time() - start_ts, 
event.comm, event.pid, mode_s[event.mode], event.sz, ms, name)) -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py index 94906a805494..3ed18ec6daaa 100755 --- a/tools/mysqld_qslower.py +++ b/tools/mysqld_qslower.py @@ -128,6 +128,6 @@ def print_event(cpu, data, size): event.pid, float(event.delta) / 1000000, event.query)) # loop with callback to print_event -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/opensnoop.py b/tools/opensnoop.py index 0c2b9b524228..dae4ff4b4894 100755 --- a/tools/opensnoop.py +++ b/tools/opensnoop.py @@ -178,6 +178,6 @@ def print_event(cpu, data, size): event.comm, fd_s, err, event.fname)) # loop with callback to print_event -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/stacksnoop.lua b/tools/stacksnoop.lua index 8f5f5b4fa885..7dfaf3d221cb 100755 --- a/tools/stacksnoop.lua +++ b/tools/stacksnoop.lua @@ -102,6 +102,6 @@ return function(BPF, utils) bpf:get_table("events"):open_perf_buffer(print_event, "struct { uint64_t stack_id; uint32_t pid; char comm[$]; }", - TASK_COMM_LEN) + {TASK_COMM_LEN}) bpf:kprobe_poll_loop() end diff --git a/tools/statsnoop.py b/tools/statsnoop.py index 2fc21641f519..d9164b671572 100755 --- a/tools/statsnoop.py +++ b/tools/statsnoop.py @@ -159,6 +159,6 @@ def print_event(cpu, data, size): fd_s, err, event.fname)) # loop with callback to print_event -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/tcplife.py b/tools/tcplife.py index 1125f9c3d8ff..69ba174079ae 100755 --- a/tools/tcplife.py +++ b/tools/tcplife.py @@ -354,7 +354,7 @@ def print_ipv6_event(cpu, data, size): start_ts = 0 # read events 
-b["ipv4_events"].open_perf_buffer(print_ipv4_event) -b["ipv6_events"].open_perf_buffer(print_ipv6_event) +b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64) +b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/trace.py b/tools/trace.py index 46bc97e5d994..029194c77cbd 100755 --- a/tools/trace.py +++ b/tools/trace.py @@ -29,6 +29,7 @@ class Probe(object): use_localtime = True tgid = -1 pid = -1 + page_cnt = None @classmethod def configure(cls, args): @@ -38,6 +39,7 @@ def configure(cls, args): cls.first_ts = BPF.monotonic_time() cls.tgid = args.tgid or -1 cls.pid = args.pid or -1 + cls.page_cnt = args.buffer_pages def __init__(self, probe, string_size, kernel_stack, user_stack): self.usdt = None @@ -510,7 +512,8 @@ def attach(self, bpf, verbose): self._attach_u(bpf) self.python_struct = self._generate_python_data_decl() callback = partial(self.print_event, bpf) - bpf[self.events_name].open_perf_buffer(callback) + bpf[self.events_name].open_perf_buffer(callback, + page_cnt=self.page_cnt) def _attach_k(self, bpf): if self.probe_type == "r": @@ -543,6 +546,7 @@ def _attach_u(self, bpf): pid=Probe.tgid) class Tool(object): + DEFAULT_PERF_BUFFER_PAGES = 64 examples = """ EXAMPLES: @@ -577,6 +581,10 @@ def __init__(self): "functions and print trace messages.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=Tool.examples) + parser.add_argument("-b", "--buffer-pages", type=int, + default=Tool.DEFAULT_PERF_BUFFER_PAGES, + help="number of pages to use for perf_events ring buffer " + "(default: %(default)d)") # we'll refer to the userspace concepts of "pid" and "tid" by # their kernel names -- tgid and pid -- inside the script parser.add_argument("-p", "--pid", type=int, metavar="PID", diff --git a/tools/trace_example.txt b/tools/trace_example.txt index 504030c06a18..eb72e5e57900 100644 --- a/tools/trace_example.txt +++ b/tools/trace_example.txt @@ -201,11 +201,28 @@ In this example, we 
traced the "ls ~" command as it was opening its shared libraries and then accessing the /home/vagrant directory listing. +Lastly, if a high-frequency event is traced you may overflow the perf ring +buffer. This shows as "Lost N samples": + +# trace sys_open +5087 5087 pgrep sys_open +5087 5087 pgrep sys_open +5087 5087 pgrep sys_open +5087 5087 pgrep sys_open +5087 5087 pgrep sys_open +Lost 764896 samples +Lost 764896 samples +Lost 764896 samples + +The perf ring buffer size can be changed with -b. The value is the per-CPU buffer +size, measured in pages. It must be a power of two and defaults to +64 pages. + + USAGE message: -# trace -h -usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] - [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header] +usage: trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] + [-S] [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header] probe [probe ...] Attach to functions and print trace messages. @@ -215,6 +232,9 @@ positional arguments: optional arguments: -h, --help show this help message and exit + -b BUFFER_PAGES, --buffer-pages BUFFER_PAGES + number of pages to use for perf_events ring buffer + (default: 64) -p PID, --pid PID id of the process to trace (optional) -L TID, --tid TID id of the thread to trace (optional) -v, --verbose print resulting BPF program code before executing @@ -224,7 +244,7 @@ optional arguments: -M MAX_EVENTS, --max-events MAX_EVENTS number of events to print before quitting -t, --timestamp print timestamp column (offset from trace start) - -T, --time print time column + -T, --time print time column -K, --kernel-stack output kernel stack trace -U, --user-stack output user stack trace -I header, --include header @@ -247,9 +267,9 @@ trace 'c:malloc "size = %d", arg1' Trace malloc calls and print the size being allocated trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3' Trace the write() call from libc to monitor writes to STDOUT -trace 'r::__kmalloc (retval == 0) 
"kmalloc failed!" +trace 'r::__kmalloc (retval == 0) "kmalloc failed!"' Trace returns from __kmalloc which returned a null pointer -trace 'r:c:malloc (retval) "allocated = %x", retval +trace 'r:c:malloc (retval) "allocated = %x", retval' Trace returns from malloc and print non-NULL allocated buffers trace 't:block:block_rq_complete "sectors=%d", args->nr_sector' Trace the block_rq_complete kernel tracepoint and print # of tx sectors diff --git a/tools/xfsslower.py b/tools/xfsslower.py index 25c5a207c788..3fbc96d8d1ad 100755 --- a/tools/xfsslower.py +++ b/tools/xfsslower.py @@ -293,6 +293,6 @@ def print_event(cpu, data, size): "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) # read events -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll() diff --git a/tools/zfsslower.py b/tools/zfsslower.py index e2be68410583..f5e8cbb4ee6f 100755 --- a/tools/zfsslower.py +++ b/tools/zfsslower.py @@ -297,6 +297,6 @@ def print_event(cpu, data, size): "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) # read events -b["events"].open_perf_buffer(print_event) +b["events"].open_perf_buffer(print_event, page_cnt=64) while 1: b.kprobe_poll()