diff --git a/docs/reference_guide.md b/docs/reference_guide.md index e24ac8a1fde5..63fb73c74c7c 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -14,6 +14,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s - [4. uprobes](#4-uprobes) - [5. uretprobes](#5-uretprobes) - [6. USDT probes](#6-usdt-probes) + - [7. Raw Tracepoints](#7-raw-tracepoints) - [Data](#data) - [1. bpf_probe_read()](#1-bpf_probe_read) - [2. bpf_probe_read_str()](#2-bpf_probe_read_str) @@ -61,6 +62,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s - [4. attach_uprobe()](#4-attach_uprobe) - [5. attach_uretprobe()](#5-attach_uretprobe) - [6. USDT.enable_probe()](#6-usdtenable_probe) + - [7. attach_raw_tracepoint()](#7-attach_raw_tracepoint) - [Debug Output](#debug-output) - [1. trace_print()](#1-trace_print) - [2. trace_fields()](#2-trace_fields) @@ -237,6 +239,35 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Atools&type=Code) +### 7. Raw Tracepoints + +Syntax: RAW_TRACEPOINT_PROBE(*event*) + +This is a macro that instruments the raw tracepoint defined by *event*. + +The argument is a pointer to struct ```bpf_raw_tracepoint_args```, which is defined in [bpf.h](https://github.com/iovisor/bcc/blob/master/src/cc/compat/linux/bpf.h). The struct field ```args``` contains all parameters of the raw tracepoint, which can be found in the Linux tree's [include/trace/events](https://github.com/torvalds/linux/tree/master/include/trace/events) +directory. 
+ +For example: +```C +RAW_TRACEPOINT_PROBE(sched_switch) +{ + // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next) + struct task_struct *prev = (struct task_struct *)ctx->args[1]; + struct task_struct *next = (struct task_struct *)ctx->args[2]; + s32 prev_tgid, next_tgid; + + bpf_probe_read(&prev_tgid, sizeof(prev->tgid), &prev->tgid); + bpf_probe_read(&next_tgid, sizeof(next->tgid), &next->tgid); + bpf_trace_printk("%d -> %d\\n", prev_tgid, next_tgid); +} +``` + +This instruments the sched:sched_switch tracepoint, and prints the prev and next tgid. + +Examples in situ: +[search /tools](https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code) + ## Data ### 1. bpf_probe_read() @@ -993,6 +1024,23 @@ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Aexamples+language%3Apython&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Atools+language%3Apython&type=Code) +### 7. attach_raw_tracepoint() + +Syntax: ```BPF.attach_raw_tracepoint(tp="tracepoint", fn_name="name")``` + +Instruments the kernel raw tracepoint described by ```tracepoint``` (```event``` only, no ```category```), and when hit, runs the BPF function ```name()```. + +This is an explicit way to instrument raw tracepoints. The ```RAW_TRACEPOINT_PROBE``` syntax, covered in the earlier raw tracepoints section, is an alternate method. + +For example: + +```Python +b.attach_raw_tracepoint("sched_switch", "do_trace") +``` + +Examples in situ: +[search /tools](https://github.com/iovisor/bcc/search?q=attach_raw_tracepoint+path%3Atools+language%3Apython&type=Code) + ## Debug Output ### 1. 
trace_print() diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 2ed5ae13e6ce..11044cf1e51a 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -683,6 +683,9 @@ int bpf_usdt_readarg_p(int argc, struct pt_regs *ctx, void *buf, u64 len) asm("l #define TRACEPOINT_PROBE(category, event) \ int tracepoint__##category##__##event(struct tracepoint__##category##__##event *args) +#define RAW_TRACEPOINT_PROBE(event) \ +int raw_tracepoint__##event(struct bpf_raw_tracepoint_args *ctx) + #define TP_DATA_LOC_READ_CONST(dst, field, length) \ do { \ unsigned short __offset = args->data_loc_##field & 0xFFFF; \ diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index dc50270844d3..722350e92480 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -1061,6 +1061,22 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) { return 0; } +/* Issue BPF_RAW_TRACEPOINT_OPEN for tp_name with program fd progfd; returns the syscall result and logs to stderr when it is negative. tp_name is only read. */ +int bpf_attach_raw_tracepoint(int progfd, const char *tp_name) +{ + union bpf_attr attr; + int ret; + + bzero(&attr, sizeof(attr)); + attr.raw_tracepoint.name = ptr_to_u64((void *)tp_name); + attr.raw_tracepoint.prog_fd = progfd; + + ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); + if (ret < 0) + fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno)); + return ret; +} + void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, perf_reader_lost_cb lost_cb, void *cb_cookie, int pid, int cpu, int page_cnt) { diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index e59d48ab6559..589006f5cbff 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -81,6 +81,8 @@ int bpf_attach_tracepoint(int progfd, const char *tp_category, const char *tp_name); int bpf_detach_tracepoint(const char *tp_category, const char *tp_name); +int bpf_attach_raw_tracepoint(int progfd, const char *tp_name); + void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, perf_reader_lost_cb lost_cb, void *cb_cookie, int pid, int cpu, int page_cnt); diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 
a1552e708c41..5dccd652bd58 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -135,6 +135,17 @@ class BPF(object): TRACEPOINT = 5 XDP = 6 PERF_EVENT = 7 + CGROUP_SKB = 8 + CGROUP_SOCK = 9 + LWT_IN = 10 + LWT_OUT = 11 + LWT_XMIT = 12 + SOCK_OPS = 13 + SK_SKB = 14 + CGROUP_DEVICE = 15 + SK_MSG = 16 + RAW_TRACEPOINT = 17 + CGROUP_SOCK_ADDR = 18 # from xdp_action uapi/linux/bpf.h XDP_ABORTED = 0 @@ -267,6 +278,7 @@ def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0, self.kprobe_fds = {} self.uprobe_fds = {} self.tracepoint_fds = {} + self.raw_tracepoint_fds = {} self.perf_buffers = {} self.open_perf_events = {} self.tracefile = None @@ -310,7 +322,8 @@ def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0, for usdt_context in usdt_contexts: usdt_context.attach_uprobes(self) - # If any "kprobe__" or "tracepoint__" prefixed functions were defined, + # If any "kprobe__", "tracepoint__" or "raw_tracepoint__" + # prefixed functions were defined, # they will be loaded and attached here. self._trace_autoload() @@ -725,6 +738,52 @@ def attach_tracepoint(self, tp=b"", tp_re=b"", fn_name=b""): self.tracepoint_fds[tp] = fd return self + def attach_raw_tracepoint(self, tp=b"", fn_name=b""): + """attach_raw_tracepoint(self, tp=b"", fn_name=b"") + + Run the bpf function denoted by fn_name every time the kernel tracepoint + specified by 'tp' is hit. The bpf function should be loaded as a + RAW_TRACEPOINT type. The 'tp' is the kernel tracepoint name, + e.g., sched_switch, sys_enter_bind, etc. 
+ + Examples: + BPF(text).attach_raw_tracepoint(tp="sched_switch", fn_name="on_switch") + """ + + tp = _assert_is_bytes(tp) + if tp in self.raw_tracepoint_fds: + raise Exception("Raw tracepoint %s has been attached" % tp) + + fn_name = _assert_is_bytes(fn_name) + fn = self.load_func(fn_name, BPF.RAW_TRACEPOINT) + fd = lib.bpf_attach_raw_tracepoint(fn.fd, tp) + if fd < 0: + raise Exception("Failed to attach BPF to raw tracepoint") + self.raw_tracepoint_fds[tp] = fd + return self + + def detach_raw_tracepoint(self, tp=b""): + """detach_raw_tracepoint(tp="") + + Stop running the bpf function that is attached to the kernel tracepoint + specified by 'tp'. + + Example: bpf.detach_raw_tracepoint("sched_switch") + """ + + tp = _assert_is_bytes(tp) + if tp not in self.raw_tracepoint_fds: + raise Exception("Raw tracepoint %s is not attached" % tp) + os.close(self.raw_tracepoint_fds[tp]) + del self.raw_tracepoint_fds[tp] + + @staticmethod + def support_raw_tracepoint(): + # kernel symbol "bpf_find_raw_tracepoint" indicates raw_tracepoint support + if BPF.ksymname("bpf_find_raw_tracepoint") != -1: + return True + return False + def detach_tracepoint(self, tp=b""): """detach_tracepoint(tp="") @@ -954,6 +1013,10 @@ def _trace_autoload(self): fn = self.load_func(func_name, BPF.TRACEPOINT) tp = fn.name[len(b"tracepoint__"):].replace(b"__", b":") self.attach_tracepoint(tp=tp, fn_name=fn.name) + elif func_name.startswith(b"raw_tracepoint__"): + fn = self.load_func(func_name, BPF.RAW_TRACEPOINT) + tp = fn.name[len(b"raw_tracepoint__"):] + self.attach_raw_tracepoint(tp=tp, fn_name=fn.name) def trace_open(self, nonblocking=False): """trace_open(nonblocking=False) @@ -1154,6 +1217,8 @@ def cleanup(self): self.detach_uprobe_event(k) for k, v in list(self.tracepoint_fds.items()): self.detach_tracepoint(k) + for k, v in list(self.raw_tracepoint_fds.items()): + self.detach_raw_tracepoint(k) # Clean up opened perf ring buffer and perf events table_keys = list(self.tables.keys()) diff --git 
a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index fd9f72b18c1b..e61227ef12a7 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -100,6 +100,8 @@ lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p] lib.bpf_detach_tracepoint.restype = ct.c_int lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p] +lib.bpf_attach_raw_tracepoint.restype = ct.c_int +lib.bpf_attach_raw_tracepoint.argtypes = [ct.c_int, ct.c_char_p] lib.bpf_open_perf_buffer.restype = ct.c_void_p lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, _LOST_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int] lib.bpf_open_perf_event.restype = ct.c_int diff --git a/tools/runqlat.py b/tools/runqlat.py index 95657cd930eb..ebda11d09bb9 100755 --- a/tools/runqlat.py +++ b/tools/runqlat.py @@ -95,7 +95,9 @@ start.update(&pid, &ts); return 0; } +""" +bpf_text_kprobe = """ int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p) { return trace_enqueue(p->tgid, p->pid); @@ -144,6 +146,76 @@ } """ +bpf_text_raw_tp = """ +RAW_TRACEPOINT_PROBE(sched_wakeup) +{ + // TP_PROTO(struct task_struct *p) + struct task_struct *p = (struct task_struct *)ctx->args[0]; + u32 tgid, pid; + + bpf_probe_read(&tgid, sizeof(tgid), &p->tgid); + bpf_probe_read(&pid, sizeof(pid), &p->pid); + return trace_enqueue(tgid, pid); +} + +RAW_TRACEPOINT_PROBE(sched_wakeup_new) +{ + // TP_PROTO(struct task_struct *p) + struct task_struct *p = (struct task_struct *)ctx->args[0]; + u32 tgid, pid; + + bpf_probe_read(&tgid, sizeof(tgid), &p->tgid); + bpf_probe_read(&pid, sizeof(pid), &p->pid); + return trace_enqueue(tgid, pid); +} + +RAW_TRACEPOINT_PROBE(sched_switch) +{ + // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next) + struct task_struct *prev = (struct task_struct *)ctx->args[1]; + struct task_struct *next= (struct task_struct *)ctx->args[2]; + u32 pid, tgid; + long state; + + // ivcsw: treat like an enqueue event and store timestamp + 
bpf_probe_read(&state, sizeof(long), &prev->state); + if (state == TASK_RUNNING) { + bpf_probe_read(&tgid, sizeof(prev->tgid), &prev->tgid); + bpf_probe_read(&pid, sizeof(prev->pid), &prev->pid); + if (!(FILTER)) { + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + } + } + + bpf_probe_read(&tgid, sizeof(next->tgid), &next->tgid); + bpf_probe_read(&pid, sizeof(next->pid), &next->pid); + if (FILTER) + return 0; + u64 *tsp, delta; + + // fetch timestamp and calculate delta + tsp = start.lookup(&pid); + if (tsp == 0) { + return 0; // missed enqueue + } + delta = bpf_ktime_get_ns() - *tsp; + FACTOR + + // store as histogram + STORE + + start.delete(&pid); + return 0; +} +""" + +is_support_raw_tp = BPF.support_raw_tracepoint() +if is_support_raw_tp: + bpf_text += bpf_text_raw_tp +else: + bpf_text += bpf_text_kprobe + # code substitutions if args.pid: # pid from userspace point of view is thread group from kernel pov @@ -186,9 +258,10 @@ # load BPF program b = BPF(text=bpf_text) -b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup") -b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task") -b.attach_kprobe(event="finish_task_switch", fn_name="trace_run") +if not is_support_raw_tp: + b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup") + b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task") + b.attach_kprobe(event="finish_task_switch", fn_name="trace_run") print("Tracing run queue latency... Hit Ctrl-C to end.")