Skip to content

Commit

Permalink
introduce {attach|detach}_raw_tracepoint API
Browse files Browse the repository at this point in the history
The motivation comes from pull request iovisor#1689.
It attached a kprobe bpf program to kernel function
ttwu_do_wakeup for more accurate tracing.
Unfortunately, it broke runqlat.py in my
4.17 environment since ttwu_do_wakeup function
is inlined in my kernel with gcc 7.3.1.

4.17 introduced raw_tracepoint and this patch
added the relevant API to bcc. With this,
we can use tracepoints
sched:{sched_wakeup, sched_wakeup_new, sched_switch}
to measure runq latency more reliably.

Signed-off-by: Yonghong Song <[email protected]>
  • Loading branch information
yonghong-song committed Apr 29, 2018
1 parent 67cc2ff commit 0d72237
Show file tree
Hide file tree
Showing 7 changed files with 212 additions and 4 deletions.
48 changes: 48 additions & 0 deletions docs/reference_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s
- [4. uprobes](#4-uprobes)
- [5. uretprobes](#5-uretprobes)
- [6. USDT probes](#6-usdt-probes)
- [7. Raw Tracepoints](#7-raw-tracepoints)
- [Data](#data)
- [1. bpf_probe_read()](#1-bpf_probe_read)
- [2. bpf_probe_read_str()](#2-bpf_probe_read_str)
Expand Down Expand Up @@ -61,6 +62,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s
- [4. attach_uprobe()](#4-attach_uprobe)
- [5. attach_uretprobe()](#5-attach_uretprobe)
- [6. USDT.enable_probe()](#6-usdtenable_probe)
- [7. attach_raw_tracepoint()](#7-attach_raw_tracepoint)
- [Debug Output](#debug-output)
- [1. trace_print()](#1-trace_print)
- [2. trace_fields()](#2-trace_fields)
Expand Down Expand Up @@ -237,6 +239,35 @@ Examples in situ:
[search /examples](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Aexamples&type=Code),
[search /tools](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Atools&type=Code)

### 7. Raw Tracepoints

Syntax: RAW_TRACEPOINT_PROBE(*event*)

This is a macro that instruments the raw tracepoint defined by *event*.

The argument is a pointer to struct ```bpf_raw_tracepoint_args```, which is defined in [bpf.h](https://github.com/iovisor/bcc/blob/master/src/cc/compat/linux/bpf.h). The struct field ```args``` contains all parameters of the raw tracepoint, which can be found in the Linux tree under the [include/trace/events](https://github.com/torvalds/linux/tree/master/include/trace/events)
directory.

For example:
```C
RAW_TRACEPOINT_PROBE(sched_switch)
{
// TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
struct task_struct *prev = (struct task_struct *)ctx->args[1];
struct task_struct *next= (struct task_struct *)ctx->args[2];
s32 prev_tgid, next_tgid;

bpf_probe_read(&prev_tgid, sizeof(prev->tgid), &prev->tgid);
bpf_probe_read(&next_tgid, sizeof(next->tgid), &next->tgid);
bpf_trace_printk("%d -> %d\\n", prev_tgid, next_tgid);
}
```
This instruments the sched:sched_switch tracepoint, and prints the prev and next tgid.
Examples in situ:
[search /tools](https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code)
## Data
### 1. bpf_probe_read()
Expand Down Expand Up @@ -993,6 +1024,23 @@ Examples in situ:
[search /examples](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Aexamples+language%3Apython&type=Code),
[search /tools](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Atools+language%3Apython&type=Code)

### 7. attach_raw_tracepoint()

Syntax: ```BPF.attach_raw_tracepoint(tp="tracepoint", fn_name="name")```

Instruments the kernel raw tracepoint described by ```tracepoint``` (```event``` only, no ```category```), and when hit, runs the BPF function ```name()```.

This is an explicit way to instrument tracepoints. The ```RAW_TRACEPOINT_PROBE``` syntax, covered in the earlier raw tracepoints section, is an alternate method.

For example:

```Python
b.attach_raw_tracepoint("sched_switch", "do_trace")
```

Examples in situ:
[search /tools](https://github.com/iovisor/bcc/search?q=attach_raw_tracepoint+path%3Atools+language%3Apython&type=Code)

## Debug Output

### 1. trace_print()
Expand Down
3 changes: 3 additions & 0 deletions src/cc/export/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,9 @@ int bpf_usdt_readarg_p(int argc, struct pt_regs *ctx, void *buf, u64 len) asm("l
#define TRACEPOINT_PROBE(category, event) \
int tracepoint__##category##__##event(struct tracepoint__##category##__##event *args)

/* Declares the signature of a BPF function named raw_tracepoint__<event>
 * that takes a single argument, struct bpf_raw_tracepoint_args *ctx.
 * The function body follows the macro invocation.
 */
#define RAW_TRACEPOINT_PROBE(event) \
int raw_tracepoint__##event(struct bpf_raw_tracepoint_args *ctx)

#define TP_DATA_LOC_READ_CONST(dst, field, length) \
do { \
unsigned short __offset = args->data_loc_##field & 0xFFFF; \
Expand Down
15 changes: 15 additions & 0 deletions src/cc/libbpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,21 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
return 0;
}

/*
 * Attach the BPF program referred to by progfd to the raw tracepoint named
 * tp_name, using the BPF_RAW_TRACEPOINT_OPEN command of the bpf() syscall.
 *
 * Returns the value returned by the syscall. When that value is negative,
 * a message including tp_name and strerror(errno) is written to stderr.
 */
int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
{
  union bpf_attr attr;
  int fd;

  /* Start from an all-zero attr and fill in only the raw_tracepoint fields. */
  memset(&attr, 0, sizeof(attr));
  attr.raw_tracepoint.prog_fd = progfd;
  attr.raw_tracepoint.name = ptr_to_u64(tp_name);

  fd = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
  if (fd >= 0)
    return fd;

  fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
  return fd;
}

void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
perf_reader_lost_cb lost_cb, void *cb_cookie,
int pid, int cpu, int page_cnt) {
Expand Down
2 changes: 2 additions & 0 deletions src/cc/libbpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ int bpf_attach_tracepoint(int progfd, const char *tp_category,
const char *tp_name);
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);

/* Attaches the BPF program progfd to the raw tracepoint named tp_name.
 * Returns the result of the underlying bpf() syscall; a negative value
 * indicates failure.
 */
int bpf_attach_raw_tracepoint(int progfd, char *tp_name);

void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
perf_reader_lost_cb lost_cb, void *cb_cookie,
int pid, int cpu, int page_cnt);
Expand Down
67 changes: 66 additions & 1 deletion src/python/bcc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,17 @@ class BPF(object):
TRACEPOINT = 5
XDP = 6
PERF_EVENT = 7
CGROUP_SKB = 8
CGROUP_SOCK = 9
LWT_IN = 10
LWT_OUT = 11
LWT_XMIT = 12
SOCK_OPS = 13
SK_SKB = 14
CGROUP_DEVICE = 15
SK_MSG = 16
RAW_TRACEPOINT = 17
CGROUP_SOCK_ADDR = 18

# from xdp_action uapi/linux/bpf.h
XDP_ABORTED = 0
Expand Down Expand Up @@ -267,6 +278,7 @@ def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0,
self.kprobe_fds = {}
self.uprobe_fds = {}
self.tracepoint_fds = {}
self.raw_tracepoint_fds = {}
self.perf_buffers = {}
self.open_perf_events = {}
self.tracefile = None
Expand Down Expand Up @@ -310,7 +322,8 @@ def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0,
for usdt_context in usdt_contexts:
usdt_context.attach_uprobes(self)

# If any "kprobe__" or "tracepoint__" prefixed functions were defined,
# If any "kprobe__" or "tracepoint__" or "raw_tracepoint__"
# prefixed functions were defined,
# they will be loaded and attached here.
self._trace_autoload()

Expand Down Expand Up @@ -725,6 +738,52 @@ def attach_tracepoint(self, tp=b"", tp_re=b"", fn_name=b""):
self.tracepoint_fds[tp] = fd
return self

def attach_raw_tracepoint(self, tp=b"", fn_name=b""):
    """attach_raw_tracepoint(self, tp=b"", fn_name=b"")

    Run the bpf function denoted by fn_name every time the kernel raw
    tracepoint specified by 'tp' is hit. The bpf function is loaded as a
    RAW_TRACEPOINT type program. 'tp' is the kernel tracepoint event name
    without a category, e.g., sched_switch.

    Raises an Exception if 'tp' has already been attached through this
    object, or if the attach call returns a negative value.

    Returns self.

    Example:
        BPF(text).attach_raw_tracepoint(tp="sched_switch", fn_name="on_switch")
    """

    tp = _assert_is_bytes(tp)
    if tp in self.raw_tracepoint_fds:
        raise Exception("Raw tracepoint %s has been attached" % tp)

    fn_name = _assert_is_bytes(fn_name)
    fn = self.load_func(fn_name, BPF.RAW_TRACEPOINT)
    fd = lib.bpf_attach_raw_tracepoint(fn.fd, tp)
    if fd < 0:
        raise Exception("Failed to attach BPF to raw tracepoint")
    # Remember the returned fd, keyed by tracepoint name, so it can be
    # closed on detach.
    self.raw_tracepoint_fds[tp] = fd
    return self

def detach_raw_tracepoint(self, tp=b""):
    """detach_raw_tracepoint(tp="")

    Detach from the kernel raw tracepoint 'tp' by closing the file
    descriptor recorded for it and forgetting the entry.

    Raises an Exception if 'tp' is not currently recorded as attached.

    Example: bpf.detach_raw_tracepoint("sched_switch")
    """

    tp = _assert_is_bytes(tp)
    attached = self.raw_tracepoint_fds
    if tp in attached:
        # Close first, then drop the entry, so a failing close keeps the
        # bookkeeping intact.
        os.close(attached[tp])
        del attached[tp]
        return
    raise Exception("Raw tracepoint %s is not attached" % tp)

@staticmethod
def support_raw_tracepoint():
    """Return True if the kernel appears to support raw tracepoints.

    Support is detected by looking up the kernel symbol
    "bpf_find_raw_tracepoint" via BPF.ksymname(); a result of -1 is
    treated as "not supported".
    """
    return BPF.ksymname("bpf_find_raw_tracepoint") != -1

def detach_tracepoint(self, tp=b""):
"""detach_tracepoint(tp="")
Expand Down Expand Up @@ -954,6 +1013,10 @@ def _trace_autoload(self):
fn = self.load_func(func_name, BPF.TRACEPOINT)
tp = fn.name[len(b"tracepoint__"):].replace(b"__", b":")
self.attach_tracepoint(tp=tp, fn_name=fn.name)
elif func_name.startswith(b"raw_tracepoint__"):
fn = self.load_func(func_name, BPF.RAW_TRACEPOINT)
tp = fn.name[len(b"raw_tracepoint__"):]
self.attach_raw_tracepoint(tp=tp, fn_name=fn.name)

def trace_open(self, nonblocking=False):
"""trace_open(nonblocking=False)
Expand Down Expand Up @@ -1154,6 +1217,8 @@ def cleanup(self):
self.detach_uprobe_event(k)
for k, v in list(self.tracepoint_fds.items()):
self.detach_tracepoint(k)
for k, v in list(self.raw_tracepoint_fds.items()):
self.detach_raw_tracepoint(k)

# Clean up opened perf ring buffer and perf events
table_keys = list(self.tables.keys())
Expand Down
2 changes: 2 additions & 0 deletions src/python/bcc/libbcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@
lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p]
lib.bpf_detach_tracepoint.restype = ct.c_int
lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
lib.bpf_attach_raw_tracepoint.restype = ct.c_int
lib.bpf_attach_raw_tracepoint.argtypes = [ct.c_int, ct.c_char_p]
lib.bpf_open_perf_buffer.restype = ct.c_void_p
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, _LOST_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
lib.bpf_open_perf_event.restype = ct.c_int
Expand Down
79 changes: 76 additions & 3 deletions tools/runqlat.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@
start.update(&pid, &ts);
return 0;
}
"""

bpf_text_kprobe = """
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
{
return trace_enqueue(p->tgid, p->pid);
Expand Down Expand Up @@ -144,6 +146,76 @@
}
"""

bpf_text_raw_tp = """
RAW_TRACEPOINT_PROBE(sched_wakeup)
{
// TP_PROTO(struct task_struct *p)
struct task_struct *p = (struct task_struct *)ctx->args[0];
u32 tgid, pid;
bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
bpf_probe_read(&pid, sizeof(pid), &p->pid);
return trace_enqueue(tgid, pid);
}
RAW_TRACEPOINT_PROBE(sched_wakeup_new)
{
// TP_PROTO(struct task_struct *p)
struct task_struct *p = (struct task_struct *)ctx->args[0];
u32 tgid, pid;
bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
bpf_probe_read(&pid, sizeof(pid), &p->pid);
return trace_enqueue(tgid, pid);
}
RAW_TRACEPOINT_PROBE(sched_switch)
{
// TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
struct task_struct *prev = (struct task_struct *)ctx->args[1];
struct task_struct *next= (struct task_struct *)ctx->args[2];
u32 pid, tgid;
long state;
// ivcsw: treat like an enqueue event and store timestamp
bpf_probe_read(&state, sizeof(long), &prev->state);
if (state == TASK_RUNNING) {
bpf_probe_read(&tgid, sizeof(prev->tgid), &prev->tgid);
bpf_probe_read(&pid, sizeof(prev->pid), &prev->pid);
if (!(FILTER)) {
u64 ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
}
}
bpf_probe_read(&tgid, sizeof(next->tgid), &next->tgid);
bpf_probe_read(&pid, sizeof(next->pid), &next->pid);
if (FILTER)
return 0;
u64 *tsp, delta;
// fetch timestamp and calculate delta
tsp = start.lookup(&pid);
if (tsp == 0) {
return 0; // missed enqueue
}
delta = bpf_ktime_get_ns() - *tsp;
FACTOR
// store as histogram
STORE
start.delete(&pid);
return 0;
}
"""

# Choose which probe implementation to append to the BPF program text:
# the raw tracepoint version when BPF.support_raw_tracepoint() reports
# support, otherwise the kprobe version.
is_support_raw_tp = BPF.support_raw_tracepoint()
if is_support_raw_tp:
bpf_text += bpf_text_raw_tp
else:
bpf_text += bpf_text_kprobe

# code substitutions
if args.pid:
# pid from userspace point of view is thread group from kernel pov
Expand Down Expand Up @@ -186,9 +258,10 @@

# load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
if not is_support_raw_tp:
b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")

print("Tracing run queue latency... Hit Ctrl-C to end.")

Expand Down

0 comments on commit 0d72237

Please sign in to comment.