diff --git a/docs/reference_guide.md b/docs/reference_guide.md index ff18ab93e676..0474f46849ff 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -1145,9 +1145,9 @@ Examples in situ: ### 13. BPF_XSKMAP -Syntax: ```BPF_XSKMAP(name, size)``` +Syntax: ```BPF_XSKMAP(name, size [, "/sys/fs/bpf/xyz"])``` -This creates a xsk map named ```name``` with ```size``` entries. Each entry represents one NIC's queue id. This map is only used in XDP to redirect packet to an AF_XDP socket. If the AF_XDP socket is binded to a queue which is different than the current packet's queue id, the packet will be dropped. For kernel v5.3 and latter, `lookup` method is available and can be used to check whether and AF_XDP socket is available for the current packet's queue id. More details at [AF_XDP](https://www.kernel.org/doc/html/latest/networking/af_xdp.html). +This creates a xsk map named ```name``` with ```size``` entries and pins it to the bpffs as a FILE. Each entry represents one NIC's queue id. This map is only used in XDP to redirect a packet to an AF_XDP socket. If the AF_XDP socket is bound to a queue which is different from the current packet's queue id, the packet will be dropped. For kernel v5.3 and later, the `lookup` method is available and can be used to check whether an AF_XDP socket is available for the current packet's queue id. More details at [AF_XDP](https://www.kernel.org/doc/html/latest/networking/af_xdp.html). 
For example: ```C diff --git a/libbpf-tools/.gitignore b/libbpf-tools/.gitignore index 251646397529..83f56063ae3f 100644 --- a/libbpf-tools/.gitignore +++ b/libbpf-tools/.gitignore @@ -45,6 +45,7 @@ /syscount /tcpconnect /tcpconnlat +/tcplife /tcprtt /tcpsynbl /vfsstat diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile index e753230cee89..e5aa35a2c48b 100644 --- a/libbpf-tools/Makefile +++ b/libbpf-tools/Makefile @@ -58,6 +58,7 @@ APPS = \ syscount \ tcpconnect \ tcpconnlat \ + tcplife \ tcprtt \ tcpsynbl \ vfsstat \ diff --git a/libbpf-tools/biopattern.bpf.c b/libbpf-tools/biopattern.bpf.c index 2f099be77ee8..334a175dcffb 100644 --- a/libbpf-tools/biopattern.bpf.c +++ b/libbpf-tools/biopattern.bpf.c @@ -5,6 +5,7 @@ #include #include "biopattern.h" #include "maps.bpf.h" +#include "core_fixes.bpf.h" const volatile bool filter_dev = false; const volatile __u32 targ_dev = 0; @@ -17,12 +18,24 @@ struct { } counters SEC(".maps"); SEC("tracepoint/block/block_rq_complete") -int handle__block_rq_complete(struct trace_event_raw_block_rq_complete *ctx) +int handle__block_rq_complete(void *args) { - sector_t sector = ctx->sector; struct counter *counterp, zero = {}; - u32 nr_sector = ctx->nr_sector; - u32 dev = ctx->dev; + sector_t sector; + u32 nr_sector; + u32 dev; + + if (has_block_rq_completion()) { + struct trace_event_raw_block_rq_completion___x *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } else { + struct trace_event_raw_block_rq_complete *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } if (filter_dev && targ_dev != dev) return 0; diff --git a/libbpf-tools/core_fixes.bpf.h b/libbpf-tools/core_fixes.bpf.h index 33a4f7f78311..3bbcbbaf4625 100644 --- a/libbpf-tools/core_fixes.bpf.h +++ b/libbpf-tools/core_fixes.bpf.h @@ -17,6 +17,15 @@ struct task_struct___x { unsigned int __state; } 
__attribute__((preserve_access_index)); +static __always_inline __s64 get_task_state(void *task) +{ + struct task_struct___x *t = task; + + if (bpf_core_field_exists(t->__state)) + return BPF_CORE_READ(t, __state); + return BPF_CORE_READ((struct task_struct *)task, state); +} + /** * commit 309dca309fc3 ("block: store a block_device pointer in struct bio") * adds a new member bi_bdev which is a pointer to struct block_device @@ -27,15 +36,6 @@ struct bio___x { struct block_device *bi_bdev; } __attribute__((preserve_access_index)); -static __always_inline __s64 get_task_state(void *task) -{ - struct task_struct___x *t = task; - - if (bpf_core_field_exists(t->__state)) - return BPF_CORE_READ(t, __state); - return BPF_CORE_READ((struct task_struct *)task, state); -} - static __always_inline struct gendisk *get_gendisk(void *bio) { struct bio___x *b = bio; @@ -45,4 +45,26 @@ static __always_inline struct gendisk *get_gendisk(void *bio) return BPF_CORE_READ((struct bio *)bio, bi_disk); } +/** + * commit d5869fdc189f ("block: introduce block_rq_error tracepoint") + * adds a new tracepoint block_rq_error and it shares the same arguments + * with tracepoint block_rq_complete. As a result, the kernel BTF now has + * a `struct trace_event_raw_block_rq_completion` instead of + * `struct trace_event_raw_block_rq_complete`. 
+ * see: + * https://github.com/torvalds/linux/commit/d5869fdc189f + */ +struct trace_event_raw_block_rq_completion___x { + dev_t dev; + sector_t sector; + unsigned int nr_sector; +} __attribute__((preserve_access_index)); + +static __always_inline bool has_block_rq_completion() +{ + if (bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x)) + return true; + return false; +} + #endif /* __CORE_FIXES_BPF_H */ diff --git a/libbpf-tools/filelife.c b/libbpf-tools/filelife.c index 07286ecf5752..5d0d5ecbb2f2 100644 --- a/libbpf-tools/filelife.c +++ b/libbpf-tools/filelife.c @@ -138,6 +138,9 @@ int main(int argc, char **argv) /* initialize global data (filtering options) */ obj->rodata->targ_tgid = env.pid; + if (!kprobe_exists("security_inode_create")) + bpf_program__set_autoload(obj->progs.security_inode_create, false); + err = filelife_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); diff --git a/libbpf-tools/llcstat.bpf.c b/libbpf-tools/llcstat.bpf.c index a36fc2dfbf9f..77fcf8306805 100644 --- a/libbpf-tools/llcstat.bpf.c +++ b/libbpf-tools/llcstat.bpf.c @@ -3,36 +3,43 @@ #include #include #include +#include "maps.bpf.h" #include "llcstat.h" #define MAX_ENTRIES 10240 +const volatile bool targ_per_thread = false; + struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, MAX_ENTRIES); - __type(key, u64); - __type(value, struct info); + __type(key, struct key_info); + __type(value, struct value_info); } infos SEC(".maps"); static __always_inline int trace_event(__u64 sample_period, bool miss) { - u64 pid = bpf_get_current_pid_tgid(); - u32 cpu = bpf_get_smp_processor_id(); - struct info *infop, info = {}; - u64 key = pid << 32 | cpu; - - infop = bpf_map_lookup_elem(&infos, &key); - if (!infop) { - bpf_get_current_comm(info.comm, sizeof(info.comm)); - infop = &info; - } + struct key_info key = {}; + struct value_info *infop, zero = {}; + + u64 pid_tgid = bpf_get_current_pid_tgid(); + key.cpu = 
bpf_get_smp_processor_id(); + key.pid = pid_tgid >> 32; + if (targ_per_thread) + key.tid = (u32)pid_tgid; + else + key.tid = key.pid; + + infop = bpf_map_lookup_or_try_init(&infos, &key, &zero); + if (!infop) + return 0; if (miss) infop->miss += sample_period; else infop->ref += sample_period; - if (infop == &info) - bpf_map_update_elem(&infos, &key, infop, 0); + bpf_get_current_comm(infop->comm, sizeof(infop->comm)); + return 0; } diff --git a/libbpf-tools/llcstat.c b/libbpf-tools/llcstat.c index bc13e7f133a6..30be26c5e67f 100644 --- a/libbpf-tools/llcstat.c +++ b/libbpf-tools/llcstat.c @@ -3,6 +3,7 @@ // // Based on llcstat(8) from BCC by Teng Qin. // 29-Sep-2020 Wenbo Zhang Created this. +// 20-Jun-2022 YeZhengMao Added tid info. #include #include #include @@ -21,6 +22,7 @@ struct env { int sample_period; time_t duration; bool verbose; + bool per_thread; } env = { .sample_period = 100, .duration = 10, @@ -40,6 +42,8 @@ static const struct argp_option opts[] = { { "sample_period", 'c', "SAMPLE_PERIOD", 0, "Sample one in this many " "number of cache reference / miss events" }, { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { "tid", 't', NULL, 0, + "Summarize cache references and misses by PID/TID" }, { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, {}, }; @@ -55,6 +59,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'v': env.verbose = true; break; + case 't': + env.per_thread = true; + break; case 'c': errno = 0; env.sample_period = strtol(arg, NULL, 10); @@ -131,10 +138,10 @@ static void sig_handler(int sig) static void print_map(struct bpf_map *map) { __u64 total_ref = 0, total_miss = 0, total_hit, hit; - __u64 lookup_key = -1, next_key; + __u32 pid, cpu, tid; + struct key_info lookup_key = { .cpu = -1 }, next_key; int err, fd = bpf_map__fd(map); - struct info info; - __u32 pid, cpu; + struct value_info info; while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { err = bpf_map_lookup_elem(fd, &next_key, 
&info); @@ -143,11 +150,16 @@ static void print_map(struct bpf_map *map) return; } hit = info.ref > info.miss ? info.ref - info.miss : 0; - pid = next_key >> 32; - cpu = next_key; - printf("%-8u %-16s %-4u %12llu %12llu %6.2f%%\n", pid, info.comm, - cpu, info.ref, info.miss, info.ref > 0 ? - hit * 1.0 / info.ref * 100 : 0); + cpu = next_key.cpu; + pid = next_key.pid; + tid = next_key.tid; + printf("%-8u ", pid); + if (env.per_thread) { + printf("%-8u ", tid); + } + printf("%-16s %-4u %12llu %12llu %6.2f%%\n", + info.comm, cpu, info.ref, info.miss, + info.ref > 0 ? hit * 1.0 / info.ref * 100 : 0); total_miss += info.miss; total_ref += info.ref; lookup_key = next_key; @@ -157,7 +169,7 @@ static void print_map(struct bpf_map *map) total_ref, total_miss, total_ref > 0 ? total_hit * 1.0 / total_ref * 100 : 0); - lookup_key = -1; + lookup_key.cpu = -1; while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { err = bpf_map_delete_elem(fd, &next_key); if (err < 0) { @@ -212,6 +224,8 @@ int main(int argc, char **argv) goto cleanup; } + obj->rodata->targ_per_thread = env.per_thread; + err = llcstat_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); @@ -233,8 +247,12 @@ int main(int argc, char **argv) sleep(env.duration); - printf("%-8s %-16s %-4s %12s %12s %7s\n", - "PID", "NAME", "CPU", "REFERENCE", "MISS", "HIT%"); + printf("%-8s ", "PID"); + if (env.per_thread) { + printf("%-8s ", "TID"); + } + printf("%-16s %-4s %12s %12s %7s\n", + "NAME", "CPU", "REFERENCE", "MISS", "HIT%"); print_map(obj->maps.infos); diff --git a/libbpf-tools/llcstat.h b/libbpf-tools/llcstat.h index 8123cd7d90bf..83a50db8cc90 100644 --- a/libbpf-tools/llcstat.h +++ b/libbpf-tools/llcstat.h @@ -4,10 +4,16 @@ #define TASK_COMM_LEN 16 -struct info { +struct value_info { __u64 ref; __u64 miss; char comm[TASK_COMM_LEN]; }; +struct key_info { + __u32 cpu; + __u32 pid; + __u32 tid; +}; + #endif /* __LLCSTAT_H */ diff --git a/libbpf-tools/syscount.bpf.c 
b/libbpf-tools/syscount.bpf.c index 6209feeaa023..38f8f978343d 100644 --- a/libbpf-tools/syscount.bpf.c +++ b/libbpf-tools/syscount.bpf.c @@ -78,7 +78,7 @@ int sys_exit(struct trace_event_raw_sys_exit *args) static const struct data_t zero; pid_t pid = id >> 32; struct data_t *val; - u64 *start_ts; + u64 *start_ts, lat = 0; u32 tid = id; u32 key; @@ -97,6 +97,7 @@ int sys_exit(struct trace_event_raw_sys_exit *args) start_ts = bpf_map_lookup_elem(&start, &tid); if (!start_ts) return 0; + lat = bpf_ktime_get_ns() - *start_ts; } key = (count_by_process) ? pid : args->id; @@ -106,7 +107,7 @@ int sys_exit(struct trace_event_raw_sys_exit *args) if (count_by_process) save_proc_name(val); if (measure_latency) - __sync_fetch_and_add(&val->total_ns, bpf_ktime_get_ns() - *start_ts); + __sync_fetch_and_add(&val->total_ns, lat); } return 0; } diff --git a/libbpf-tools/tcpconnect.bpf.c b/libbpf-tools/tcpconnect.bpf.c index a13d48c239f2..c57faa0263ce 100644 --- a/libbpf-tools/tcpconnect.bpf.c +++ b/libbpf-tools/tcpconnect.bpf.c @@ -55,7 +55,7 @@ static __always_inline bool filter_port(__u16 port) if (filter_ports_len == 0) return false; - for (i = 0; i < filter_ports_len; i++) { + for (i = 0; i < filter_ports_len && i < MAX_PORTS; i++) { if (port == filter_ports[i]) return false; } diff --git a/libbpf-tools/tcpconnlat.bpf.c b/libbpf-tools/tcpconnlat.bpf.c index 56d374144b0f..b44abb293fa4 100644 --- a/libbpf-tools/tcpconnlat.bpf.c +++ b/libbpf-tools/tcpconnlat.bpf.c @@ -31,7 +31,7 @@ struct { __uint(value_size, sizeof(u32)); } events SEC(".maps"); -static __always_inline int trace_connect(struct sock *sk) +static int trace_connect(struct sock *sk) { u32 tgid = bpf_get_current_pid_tgid() >> 32; struct piddata piddata = {}; @@ -46,27 +46,14 @@ static __always_inline int trace_connect(struct sock *sk) return 0; } -SEC("fentry/tcp_v4_connect") -int BPF_PROG(tcp_v4_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("kprobe/tcp_v6_connect") -int 
BPF_KPROBE(tcp_v6_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("fentry/tcp_rcv_state_process") -int BPF_PROG(tcp_rcv_state_process, struct sock *sk) +static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) { struct piddata *piddatap; struct event event = {}; s64 delta; u64 ts; - if (sk->__sk_common.skc_state != TCP_SYN_SENT) + if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT) return 0; piddatap = bpf_map_lookup_elem(&start, &sk); @@ -85,12 +72,12 @@ int BPF_PROG(tcp_rcv_state_process, struct sock *sk) sizeof(event.comm)); event.ts_us = ts / 1000; event.tgid = piddatap->tgid; - event.lport = sk->__sk_common.skc_num; - event.dport = sk->__sk_common.skc_dport; - event.af = sk->__sk_common.skc_family; + event.lport = BPF_CORE_READ(sk, __sk_common.skc_num); + event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + event.af = BPF_CORE_READ(sk, __sk_common.skc_family); if (event.af == AF_INET) { - event.saddr_v4 = sk->__sk_common.skc_rcv_saddr; - event.daddr_v4 = sk->__sk_common.skc_daddr; + event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr); } else { BPF_CORE_READ_INTO(&event.saddr_v6, sk, __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); @@ -105,4 +92,40 @@ int BPF_PROG(tcp_rcv_state_process, struct sock *sk) return 0; } +SEC("kprobe/tcp_v4_connect") +int BPF_KPROBE(tcp_v4_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_v6_connect") +int BPF_KPROBE(tcp_v6_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_rcv_state_process") +int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk) +{ + return handle_tcp_rcv_state_process(ctx, sk); +} + +SEC("fentry/tcp_v4_connect") +int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("fentry/tcp_v6_connect") +int BPF_PROG(fentry_tcp_v6_connect, struct sock *sk) +{ + return trace_connect(sk); +} + 
+SEC("fentry/tcp_rcv_state_process") +int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk) +{ + return handle_tcp_rcv_state_process(ctx, sk); +} + char LICENSE[] SEC("license") = "GPL"; diff --git a/libbpf-tools/tcpconnlat.c b/libbpf-tools/tcpconnlat.c index 8eae76aeadab..c07aa8afc252 100644 --- a/libbpf-tools/tcpconnlat.c +++ b/libbpf-tools/tcpconnlat.c @@ -182,6 +182,19 @@ int main(int argc, char **argv) obj->rodata->targ_min_us = env.min_us; obj->rodata->targ_tgid = env.pid; + if (fentry_can_attach("tcp_v4_connect", NULL)) { + bpf_program__set_attach_target(obj->progs.fentry_tcp_v4_connect, 0, "tcp_v4_connect"); + bpf_program__set_attach_target(obj->progs.fentry_tcp_v6_connect, 0, "tcp_v6_connect"); + bpf_program__set_attach_target(obj->progs.fentry_tcp_rcv_state_process, 0, "tcp_rcv_state_process"); + bpf_program__set_autoload(obj->progs.tcp_v4_connect, false); + bpf_program__set_autoload(obj->progs.tcp_v6_connect, false); + bpf_program__set_autoload(obj->progs.tcp_rcv_state_process, false); + } else { + bpf_program__set_autoload(obj->progs.fentry_tcp_v4_connect, false); + bpf_program__set_autoload(obj->progs.fentry_tcp_v6_connect, false); + bpf_program__set_autoload(obj->progs.fentry_tcp_rcv_state_process, false); + } + err = tcpconnlat_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); diff --git a/libbpf-tools/tcplife.bpf.c b/libbpf-tools/tcplife.bpf.c new file mode 100644 index 000000000000..a05d1396ecaa --- /dev/null +++ b/libbpf-tools/tcplife.bpf.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Hengqi Chen */ +#include +#include +#include +#include +#include "tcplife.h" + +#define MAX_ENTRIES 10240 +#define AF_INET 2 +#define AF_INET6 10 + +const volatile bool filter_sport = false; +const volatile bool filter_dport = false; +const volatile __u16 target_sports[MAX_PORTS] = {}; +const volatile __u16 target_dports[MAX_PORTS] = {}; +const volatile pid_t target_pid = 0; +const volatile 
__u16 target_family = 0; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct sock *); + __type(value, __u64); +} birth SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct sock *); + __type(value, struct ident); +} idents SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} events SEC(".maps"); + +SEC("tracepoint/sock/inet_sock_set_state") +int inet_sock_set_state(struct trace_event_raw_inet_sock_set_state *args) +{ + __u64 ts, *start, delta_us, rx_b, tx_b; + struct ident ident = {}, *identp; + __u16 sport, dport, family; + struct event event = {}; + struct tcp_sock *tp; + struct sock *sk; + bool found; + __u32 pid; + int i; + + if (BPF_CORE_READ(args, protocol) != IPPROTO_TCP) + return 0; + + family = BPF_CORE_READ(args, family); + if (target_family && family != target_family) + return 0; + + sport = BPF_CORE_READ(args, sport); + if (filter_sport) { + found = false; + for (i = 0; i < MAX_PORTS; i++) { + if (!target_sports[i]) + return 0; + if (sport != target_sports[i]) + continue; + found = true; + break; + } + if (!found) + return 0; + } + + dport = BPF_CORE_READ(args, dport); + if (filter_dport) { + found = false; + for (i = 0; i < MAX_PORTS; i++) { + if (!target_dports[i]) + return 0; + if (dport != target_dports[i]) + continue; + found = true; + break; + } + if (!found) + return 0; + } + + sk = (struct sock *)BPF_CORE_READ(args, skaddr); + if (BPF_CORE_READ(args, newstate) < TCP_FIN_WAIT1) { + ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&birth, &sk, &ts, BPF_ANY); + } + + if (BPF_CORE_READ(args, newstate) == TCP_SYN_SENT || BPF_CORE_READ(args, newstate) == TCP_LAST_ACK) { + pid = bpf_get_current_pid_tgid() >> 32; + if (target_pid && pid != target_pid) + return 0; + ident.pid = pid; + bpf_get_current_comm(ident.comm, sizeof(ident.comm)); + 
bpf_map_update_elem(&idents, &sk, &ident, BPF_ANY); + } + + if (BPF_CORE_READ(args, newstate) != TCP_CLOSE) + return 0; + + start = bpf_map_lookup_elem(&birth, &sk); + if (!start) { + bpf_map_delete_elem(&idents, &sk); + return 0; + } + ts = bpf_ktime_get_ns(); + delta_us = (ts - *start) / 1000; + + identp = bpf_map_lookup_elem(&idents, &sk); + pid = identp ? identp->pid : bpf_get_current_pid_tgid() >> 32; + if (target_pid && pid != target_pid) + goto cleanup; + + tp = (struct tcp_sock *)sk; + rx_b = BPF_CORE_READ(tp, bytes_received); + tx_b = BPF_CORE_READ(tp, bytes_acked); + + event.ts_us = ts / 1000; + event.span_us = delta_us; + event.rx_b = rx_b; + event.tx_b = tx_b; + event.pid = pid; + event.sport = sport; + event.dport = dport; + event.family = family; + if (!identp) + bpf_get_current_comm(event.comm, sizeof(event.comm)); + else + bpf_probe_read_kernel(event.comm, sizeof(event.comm), (void *)identp->comm); + if (family == AF_INET) { + bpf_probe_read_kernel(&event.saddr, sizeof(args->saddr), BPF_CORE_READ(args, saddr)); + bpf_probe_read_kernel(&event.daddr, sizeof(args->daddr), BPF_CORE_READ(args, daddr)); + } else { /* AF_INET6 */ + bpf_probe_read_kernel(&event.saddr, sizeof(args->saddr_v6), BPF_CORE_READ(args, saddr_v6)); + bpf_probe_read_kernel(&event.daddr, sizeof(args->daddr_v6), BPF_CORE_READ(args, daddr_v6)); + } + bpf_perf_event_output(args, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); + +cleanup: + bpf_map_delete_elem(&birth, &sk); + bpf_map_delete_elem(&idents, &sk); + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/libbpf-tools/tcplife.c b/libbpf-tools/tcplife.c new file mode 100644 index 000000000000..b31109d8834d --- /dev/null +++ b/libbpf-tools/tcplife.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * tcplife Trace the lifespan of TCP sessions and summarize. + * + * Copyright (c) 2022 Hengqi Chen + * + * Based on tcplife(8) from BCC by Brendan Gregg. + * 02-Jun-2022 Hengqi Chen Created this. 
+ */ +#include +#include +#include +#include +#include +#include + +#include "btf_helpers.h" +#include "tcplife.h" +#include "tcplife.skel.h" + +#define PERF_BUFFER_PAGES 16 +#define PERF_POLL_TIMEOUT_MS 100 + +static volatile sig_atomic_t exiting = 0; + +static pid_t target_pid = 0; +static short target_family = 0; +static char *target_sports = NULL; +static char *target_dports = NULL; +static int column_width = 15; +static bool emit_timestamp = false; +static bool verbose = false; + +const char *argp_program_version = "tcplife 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; +const char argp_program_doc[] = +"Trace the lifespan of TCP sessions and summarize.\n" +"\n" +"USAGE: tcplife [-h] [-p PID] [-4] [-6] [-L] [-D] [-T] [-w]\n" +"\n" +"EXAMPLES:\n" +" tcplife -p 1215 # only trace PID 1215\n" +" tcplife -p 1215 -4 # trace IPv4 only\n"; + +static const struct argp_option opts[] = { + { "pid", 'p', "PID", 0, "Process ID to trace" }, + { "ipv4", '4', NULL, 0, "Trace IPv4 only" }, + { "ipv6", '6', NULL, 0, "Trace IPv6 only" }, + { "wide", 'w', NULL, 0, "Wide column output (fits IPv6 addresses)" }, + { "time", 'T', NULL, 0, "Include timestamp on output" }, + { "localport", 'L', "LOCALPORT", 0, "Comma-separated list of local ports to trace." }, + { "remoteport", 'D', "REMOTEPORT", 0, "Comma-separated list of remote ports to trace." 
}, + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long n; + + switch (key) { + case 'p': + errno = 0; + n = strtol(arg, NULL, 10); + if (errno || n <= 0) { + fprintf(stderr, "Invalid PID: %s\n", arg); + argp_usage(state); + } + target_pid = n; + break; + case '4': + target_family = AF_INET; + break; + case '6': + target_family = AF_INET6; + break; + case 'w': + column_width = 26; + break; + case 'L': + target_sports = strdup(arg); + break; + case 'D': + target_dports = strdup(arg); + break; + case 'T': + emit_timestamp = true; + break; + case 'v': + verbose = true; + break; + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sig_int(int signo) +{ + exiting = 1; +} + +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + char ts[32], saddr[48], daddr[48]; + struct event *e = data; + struct tm *tm; + time_t t; + + if (emit_timestamp) { + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%8s ", ts); + } + + inet_ntop(e->family, &e->saddr, saddr, sizeof(saddr)); + inet_ntop(e->family, &e->daddr, daddr, sizeof(daddr)); + + printf("%-7d %-16s %-*s %-5d %-*s %-5d %-6.2f %-6.2f %-.2f\n", + e->pid, e->comm, column_width, saddr, e->sport, column_width, daddr, e->dport, + (double)e->tx_b / 1024, (double)e->rx_b / 1024, (double)e->span_us / 1000); +} + +static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu); +} + +int main(int argc, char **argv) +{ + LIBBPF_OPTS(bpf_object_open_opts, open_opts); + 
static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + struct tcplife_bpf *obj; + struct perf_buffer *pb = NULL; + short port_num; + char *port; + int err, i; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + + err = ensure_core_btf(&open_opts); + if (err) { + fprintf(stderr, "failed to fetch necessary BTF for CO-RE: %s\n", strerror(-err)); + return 1; + } + + obj = tcplife_bpf__open_opts(&open_opts); + if (!obj) { + fprintf(stderr, "failed to open BPF object\n"); + return 1; + } + + obj->rodata->target_pid = target_pid; + obj->rodata->target_family = target_family; + + if (target_sports) { + i = 0; + port = strtok(target_sports, ","); + while (port && i < MAX_PORTS) { + port_num = strtol(port, NULL, 10); + obj->rodata->target_sports[i++] = port_num; + port = strtok(NULL, ","); + } + } + + if (target_dports) { + i = 0; + port = strtok(target_dports, ","); + while (port && i < MAX_PORTS) { + port_num = strtol(port, NULL, 10); + obj->rodata->target_dports[i++] = port_num; + port = strtok(NULL, ","); + } + } + + err = tcplife_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF object: %d\n", err); + goto cleanup; + } + + err = tcplife_bpf__attach(obj); + if (err) { + fprintf(stderr, "failed to attach BPF object: %d\n", err); + goto cleanup; + } + + pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, + handle_event, handle_lost_events, NULL, NULL); + if (!pb) { + err = -errno; + fprintf(stderr, "failed to open perf buffer: %d\n", err); + goto cleanup; + } + + if (signal(SIGINT, sig_int) == SIG_ERR) { + fprintf(stderr, "can't set signal handler: %s\n", strerror(errno)); + err = 1; + goto cleanup; + } + + if (emit_timestamp) + printf("%-8s ", "TIME(s)"); + printf("%-7s %-16s %-*s %-5s %-*s %-5s %-6s %-6s %-s\n", + "PID", "COMM", column_width, "LADDR", "LPORT", 
column_width, "RADDR", "RPORT", + "TX_KB", "RX_KB", "MS"); + + while (!exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; + } + /* reset err to return 0 if exiting */ + err = 0; + } + +cleanup: + perf_buffer__free(pb); + tcplife_bpf__destroy(obj); + cleanup_core_btf(&open_opts); + return err != 0; +} diff --git a/libbpf-tools/tcplife.h b/libbpf-tools/tcplife.h new file mode 100644 index 000000000000..6e92352f533a --- /dev/null +++ b/libbpf-tools/tcplife.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Hengqi Chen */ +#ifndef __TCPLIFE_H +#define __TCPLIFE_H + +#define MAX_PORTS 1024 +#define TASK_COMM_LEN 16 + +struct ident { + __u32 pid; + char comm[TASK_COMM_LEN]; +}; + +struct event { + unsigned __int128 saddr; + unsigned __int128 daddr; + __u64 ts_us; + __u64 span_us; + __u64 rx_b; + __u64 tx_b; + __u32 pid; + __u16 sport; + __u16 dport; + __u16 family; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __TCPLIFE_H */ diff --git a/libbpf-tools/trace_helpers.c b/libbpf-tools/trace_helpers.c index 9165be429084..e04028e1c50a 100644 --- a/libbpf-tools/trace_helpers.c +++ b/libbpf-tools/trace_helpers.c @@ -993,16 +993,25 @@ bool is_kernel_module(const char *name) static bool fentry_try_attach(int id) { - struct bpf_insn insns[] = { { .code = BPF_JMP | BPF_EXIT } }; - LIBBPF_OPTS(bpf_prog_load_opts, opts); int prog_fd, attach_fd; - - opts.expected_attach_type = BPF_TRACE_FENTRY; - opts.attach_btf_id = id, - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, "test", NULL, insns, 1, &opts); - if (prog_fd < 0) + char error[4096]; + struct bpf_insn insns[] = { + { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 }, + { .code = BPF_JMP | BPF_EXIT }, + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .expected_attach_type = BPF_TRACE_FENTRY, + .attach_btf_id = id, + .log_buf = error, + .log_size = 
sizeof(error), + ); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, "test", "GPL", insns, + sizeof(insns) / sizeof(struct bpf_insn), &opts); + if (prog_fd < 0) { + fprintf(stderr, "failed to try attaching to fentry: %s\n", error); return false; + } attach_fd = bpf_raw_tracepoint_open(NULL, prog_fd); if (attach_fd >= 0) diff --git a/man/man8/biosnoop.8 b/man/man8/biosnoop.8 index 9a0ae2649680..24f19edff200 100644 --- a/man/man8/biosnoop.8 +++ b/man/man8/biosnoop.8 @@ -2,7 +2,7 @@ .SH NAME biosnoop \- Trace block device I/O and print details incl. issuing PID. .SH SYNOPSIS -.B biosnoop [\-hQ] +.B biosnoop [\-h] [\-Q] [\-d DISK] .SH DESCRIPTION This tools traces block device I/O (disk I/O), and prints a one-line summary for each I/O showing various details. These include the latency from the time of @@ -29,6 +29,9 @@ Print usage message. .TP \-Q Include a column showing the time spent queued in the OS. +.TP +\-d DISK +Trace this disk only. .SH EXAMPLES .TP Trace all block device I/O and print a summary line per I/O: @@ -82,6 +85,6 @@ Linux .SH STABILITY Unstable - in development. .SH AUTHOR -Brendan Gregg +Brendan Gregg, Rocky Xing .SH SEE ALSO disksnoop(8), iostat(1) diff --git a/man/man8/llcstat.8 b/man/man8/llcstat.8 index 36dbed7dedc1..5a28d3384c51 100644 --- a/man/man8/llcstat.8 +++ b/man/man8/llcstat.8 @@ -28,6 +28,9 @@ Print usage message. \-c SAMPLE_PERIOD Sample one in this many cache reference and cache miss events. .TP +\-t +Summarize cache references and misses by PID/TID +.TP duration Duration to trace, in seconds. 
.SH EXAMPLES diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc index 12c8250b2910..b1aae89ed8fd 100644 --- a/src/cc/bcc_syms.cc +++ b/src/cc/bcc_syms.cc @@ -289,11 +289,8 @@ bool ProcSyms::Module::contains(uint64_t addr, uint64_t &offset) const { for (const auto &range : ranges_) { if (addr >= range.start && addr < range.end) { if (type_ == ModuleType::SO || type_ == ModuleType::VDSO) { - // Offset within the mmap - offset = addr - range.start + range.file_offset; - - // Offset within the ELF for SO symbol lookup - offset += (elf_so_addr_ - elf_so_offset_); + offset = __so_calc_mod_offset(range.start, range.file_offset, + elf_so_addr_, elf_so_offset_, addr); } else { offset = addr; } @@ -619,9 +616,26 @@ int _bcc_syms_find_module(mod_info *info, int enter_ns, void *p) { return -1; } +uint64_t __so_calc_global_addr(uint64_t mod_start_addr, + uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, uint64_t offset) { + return offset + (mod_start_addr - mod_file_offset) - + (elf_sec_start_addr - elf_sec_file_offset); +} + +uint64_t __so_calc_mod_offset(uint64_t mod_start_addr, uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, + uint64_t global_addr) { + return global_addr - (mod_start_addr - mod_file_offset) + + (elf_sec_start_addr - elf_sec_file_offset); +} + int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address, uint8_t inode_match_only, uint64_t *global) { struct stat s; + uint64_t elf_so_addr, elf_so_offset; if (stat(module, &s)) return -1; @@ -632,7 +646,11 @@ int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address, mod.start == 0x0) return -1; - *global = mod.start - mod.file_offset + address; + if (bcc_elf_get_text_scn_info(module, &elf_so_addr, &elf_so_offset) < 0) + return -1; + + *global = __so_calc_global_addr(mod.start, mod.file_offset, elf_so_addr, + elf_so_offset, address); return 0; } diff --git a/src/cc/bcc_syms.h 
b/src/cc/bcc_syms.h index 80627debead9..eb1e4ead4b8a 100644 --- a/src/cc/bcc_syms.h +++ b/src/cc/bcc_syms.h @@ -102,6 +102,30 @@ int bcc_resolve_symname(const char *module, const char *symname, struct bcc_symbol_option* option, struct bcc_symbol *sym); +/* Calculate the global address for 'offset' in a shared object loaded into + * a process + * + * Need to know (start_addr, file_offset) pairs for the /proc/PID/maps module + * entry containing the offset and the elf section containing the module's + * .text + */ +uint64_t __so_calc_global_addr(uint64_t mod_start_addr, + uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, uint64_t offset); + +/* Given a global address which falls within a shared object's mapping in a + * process, calculate the corresponding 'offset' in the .so + * + * Need to know (start_addr, file_offset) pairs for the /proc/PID/maps module + * entry containing the offset and the elf section containing the module's + * .text + */ +uint64_t __so_calc_mod_offset(uint64_t mod_start_addr, uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, + uint64_t global_addr); + #ifdef __cplusplus } #endif diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index e2de995fc25f..82dc0fe13aec 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -390,7 +390,7 @@ struct _name##_table_t _name = { .max_entries = (_max_entries) } #define BPF_CPUMAP(_name, _max_entries) \ BPF_XDP_REDIRECT_MAP("cpumap", u32, _name, _max_entries) -#define BPF_XSKMAP(_name, _max_entries) \ +#define _BPF_XSKMAP(_name, _max_entries, _pinned) \ struct _name##_table_t { \ u32 key; \ int leaf; \ @@ -399,8 +399,12 @@ struct _name##_table_t { \ u64 (*redirect_map) (int, int); \ u32 max_entries; \ }; \ -__attribute__((section("maps/xskmap"))) \ +__attribute__((section("maps/xskmap" _pinned))) \ struct _name##_table_t _name = { .max_entries = (_max_entries) } +#define BPF_XSKMAP2(_name, _max_entries) 
_BPF_XSKMAP(_name, _max_entries, "") +#define BPF_XSKMAP3(_name, _max_entries, _pinned) _BPF_XSKMAP(_name, _max_entries, ":" _pinned) +#define BPF_XSKMAPX(_1, _2, _3, NAME, ...) NAME +#define BPF_XSKMAP(...) BPF_XSKMAPX(__VA_ARGS__, BPF_XSKMAP3, BPF_XSKMAP2)(__VA_ARGS__) #define BPF_ARRAY_OF_MAPS(_name, _inner_map_name, _max_entries) \ BPF_TABLE("array_of_maps$" _inner_map_name, int, int, _name, _max_entries) diff --git a/tests/cc/test_c_api.cc b/tests/cc/test_c_api.cc index eb56dc08e7a9..510ccda942e8 100644 --- a/tests/cc/test_c_api.cc +++ b/tests/cc/test_c_api.cc @@ -600,6 +600,59 @@ TEST_CASE("resolve global addr in libc in this process", "[c_api][!mayfail]") { REQUIRE(global_addr == (search.start + local_addr - search.file_offset)); } +/* Consider the following scenario: we have some process that maps in a shared library [1] with a + * USDT probe [2]. The shared library's .text section doesn't have matching address and file off + * [3]. Since the location address in [2] is an offset relative to the base address of whatever.so + * in whatever process is mapping it, we need to convert the location address 0x77b8c to a global + * address in the process' address space in order to attach to the USDT. 
+ * + * The formula for this (__so_calc_global_addr) is + * global_addr = offset + (mod_start_addr - mod_file_offset) + * - (elf_sec_start_addr - elf_sec_file_offset) + * + * Which for our concrete example is + * global_addr = 0x77b8c + (0x7f6cda31e000 - 0x72000) - (0x73c90 - 0x72c90) + * global_addr = 0x7f6cda322b8c + * + * [1 - output from `cat /proc/PID/maps`] + * 7f6cda2ab000-7f6cda31e000 r--p 00000000 00:2d 5370022276 /whatever.so + * 7f6cda31e000-7f6cda434000 r-xp 00072000 00:2d 5370022276 /whatever.so + * 7f6cda434000-7f6cda43d000 r--p 00187000 00:2d 5370022276 /whatever.so + * 7f6cda43d000-7f6cda43f000 rw-p 0018f000 00:2d 5370022276 /whatever.so + * + * [2 - output from `readelf -n /whatever.so`] + * stapsdt 0x00000038 NT_STAPSDT (SystemTap probe descriptors) + * Provider: test + * Name: test_probe + * Location: 0x0000000000077b8c, Base: 0x0000000000000000, Semaphore: 0x0000000000000000 + * Arguments: -8@$5 + * + * [3 - output from `readelf -W --sections /whatever.so`] + * [Nr] Name Type Address Off Size ES Flg Lk Inf Al + * [16] .text PROGBITS 0000000000073c90 072c90 1132dc 00 AX 0 0 16 + */ +TEST_CASE("conversion of module offset to/from global_addr", "[c_api]") { + uint64_t global_addr, offset, calc_offset, mod_start_addr, mod_file_offset; + uint64_t elf_sec_start_addr, elf_sec_file_offset; + + /* Initialize per example in comment above */ + offset = 0x77b8c; + mod_start_addr = 0x7f6cda31e000; + mod_file_offset = 0x00072000; + elf_sec_start_addr = 0x73c90; + elf_sec_file_offset = 0x72c90; + global_addr = __so_calc_global_addr(mod_start_addr, mod_file_offset, + elf_sec_start_addr, elf_sec_file_offset, + offset); + REQUIRE(global_addr == 0x7f6cda322b8c); + + /* Reverse operation (global_addr -> offset) should yield original offset */ + calc_offset = __so_calc_mod_offset(mod_start_addr, mod_file_offset, + elf_sec_start_addr, elf_sec_file_offset, + global_addr); + REQUIRE(calc_offset == offset); +} + TEST_CASE("get online CPUs", "[c_api]") { std::vector cpus 
= ebpf::get_online_cpus(); int num_cpus = sysconf(_SC_NPROCESSORS_ONLN); diff --git a/tools/biosnoop.py b/tools/biosnoop.py index 3f3ebd2e4f50..1028aa76ed56 100755 --- a/tools/biosnoop.py +++ b/tools/biosnoop.py @@ -12,16 +12,18 @@ # # 16-Sep-2015 Brendan Gregg Created this. # 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT +# 21-Jun-2022 Rocky Xing Added disk filter support. from __future__ import print_function from bcc import BPF -import re import argparse +import os # arguments examples = """examples: ./biosnoop # trace all block I/O ./biosnoop -Q # include OS queued time + ./biolatency -d sdc # trace sdc only """ parser = argparse.ArgumentParser( description="Trace block I/O", @@ -29,6 +31,8 @@ epilog=examples) parser.add_argument("-Q", "--queue", action="store_true", help="include OS queued time") +parser.add_argument("-d", "--disk", type=str, + help="Trace this disk only") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() @@ -70,6 +74,8 @@ // cache PID and comm by-req int trace_pid_start(struct pt_regs *ctx, struct request *req) { + DISK_FILTER + struct val_t val = {}; u64 ts; @@ -86,6 +92,8 @@ // time block I/O int trace_req_start(struct pt_regs *ctx, struct request *req) { + DISK_FILTER + struct start_req_t start_req = { .ts = bpf_ktime_get_ns(), .data_len = req->__data_len @@ -160,6 +168,34 @@ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk') else: bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk') + +if args.disk is not None: + disk_path = os.path.join('/dev', args.disk) + if not os.path.exists(disk_path): + print("no such disk '%s'" % args.disk) + exit(1) + + stat_info = os.stat(disk_path) + major = os.major(stat_info.st_rdev) + minor = os.minor(stat_info.st_rdev) + + disk_field_str = "" + if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1: + disk_field_str = 'req->rq_disk' + else: + disk_field_str = 'req->q->disk' + + disk_filter_str = """ + struct gendisk *disk = %s; + if 
(!(disk->major == %d && disk->first_minor == %d)) { + return 0; + } + """ % (disk_field_str, major, minor) + + bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str) +else: + bpf_text = bpf_text.replace('DISK_FILTER', '') + if debug or args.ebpf: print(bpf_text) if args.ebpf: diff --git a/tools/biosnoop_example.txt b/tools/biosnoop_example.txt index d8be0624c94e..38b0ca3431f9 100644 --- a/tools/biosnoop_example.txt +++ b/tools/biosnoop_example.txt @@ -64,14 +64,16 @@ TIME(s) COMM PID DISK T SECTOR BYTES QUE(ms) LAT(ms) USAGE message: -usage: biosnoop.py [-h] [-Q] +usage: biosnoop.py [-h] [-Q] [-d DISK] Trace block I/O optional arguments: - -h, --help show this help message and exit - -Q, --queue include OS queued time + -h, --help show this help message and exit + -Q, --queue include OS queued time + -d DISK, --disk DISK Trace this disk only examples: ./biosnoop # trace all block I/O ./biosnoop -Q # include OS queued time + ./biolatency -d sdc # trace sdc only diff --git a/tools/llcstat.py b/tools/llcstat.py index 4f1ba2f9a88a..ec7f4c364dc5 100755 --- a/tools/llcstat.py +++ b/tools/llcstat.py @@ -15,6 +15,7 @@ # Licensed under the Apache License, Version 2.0 (the "License") # # 19-Oct-2016 Teng Qin Created this. +# 20-Jun-2022 YeZhengMao Added tid info. 
from __future__ import print_function import argparse @@ -30,6 +31,10 @@ help="Sample one in this many number of cache reference / miss events") parser.add_argument( "duration", nargs="?", default=10, help="Duration, in seconds, to run") +parser.add_argument( + "-t", "--tid", action="store_true", + help="Summarize cache references and misses by PID/TID" +) parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() @@ -41,7 +46,8 @@ struct key_t { int cpu; - int pid; + u32 pid; + u32 tid; char name[TASK_COMM_LEN]; }; @@ -49,8 +55,10 @@ BPF_HASH(miss_count, struct key_t); static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + u64 pid_tgid = bpf_get_current_pid_tgid(); key->cpu = bpf_get_smp_processor_id(); - key->pid = bpf_get_current_pid_tgid() >> 32; + key->pid = pid_tgid >> 32; + key->tid = GET_TID ? (u32)pid_tgid : key->pid; bpf_get_current_comm(&(key->name), sizeof(key->name)); } @@ -73,6 +81,8 @@ } """ +bpf_text = bpf_text.replace("GET_TID", "1" if args.tid else "0") + if args.ebpf: print(bpf_text) exit() @@ -98,22 +108,42 @@ miss_count = {} for (k, v) in b.get_table('miss_count').items(): - miss_count[(k.pid, k.cpu, k.name)] = v.value + if args.tid: + miss_count[(k.pid, k.tid, k.cpu, k.name)] = v.value + else: + miss_count[(k.pid, k.cpu, k.name)] = v.value + +header_text = 'PID ' +format_text = '{:<8d} ' +if args.tid: + header_text += 'TID ' + format_text += '{:<8d} ' + +header_text += 'NAME CPU REFERENCE MISS HIT%' +format_text += '{:<16s} {:<4d} {:>12d} {:>12d} {:>6.2f}%' -print('PID NAME CPU REFERENCE MISS HIT%') +print(header_text) tot_ref = 0 tot_miss = 0 for (k, v) in b.get_table('ref_count').items(): try: - miss = miss_count[(k.pid, k.cpu, k.name)] + if args.tid: + miss = miss_count[(k.pid, k.tid, k.cpu, k.name)] + else: + miss = miss_count[(k.pid, k.cpu, k.name)] except KeyError: miss = 0 tot_ref += v.value tot_miss += miss # This happens on some PIDs due to missed counts caused by 
sampling hit = (v.value - miss) if (v.value >= miss) else 0 - print('{:<8d} {:<16s} {:<4d} {:>12d} {:>12d} {:>6.2f}%'.format( - k.pid, k.name.decode('utf-8', 'replace'), k.cpu, v.value, miss, - (float(hit) / float(v.value)) * 100.0)) + if args.tid: + print(format_text.format( + k.pid, k.tid, k.name.decode('utf-8', 'replace'), k.cpu, v.value, miss, + (float(hit) / float(v.value)) * 100.0)) + else: + print(format_text.format( + k.pid, k.name.decode('utf-8', 'replace'), k.cpu, v.value, miss, + (float(hit) / float(v.value)) * 100.0)) print('Total References: {} Total Misses: {} Hit Rate: {:.2f}%'.format( tot_ref, tot_miss, (float(tot_ref - tot_miss) / float(tot_ref)) * 100.0)) diff --git a/tools/llcstat_example.txt b/tools/llcstat_example.txt index ef2aec10f6f6..a7c1a78d694c 100644 --- a/tools/llcstat_example.txt +++ b/tools/llcstat_example.txt @@ -38,6 +38,21 @@ some degree by chance. Overall it should make sense. But for low counts, you might find a case where -- by chance -- a process has been tallied with more misses than references, which would seem impossible. +# ./llcstat.py 10 -t +Running for 10 seconds or hit Ctrl-C to end. +PID TID NAME CPU REFERENCE MISS HIT% +170843 170845 docker 12 2700 1200 55.56% +298670 298670 kworker/15:0 15 500 0 100.00% +170254 170254 kworker/11:1 11 2500 400 84.00% +1046952 1046953 git 0 2600 1100 57.69% +170843 170849 docker 15 1000 400 60.00% +1027373 1027382 node 8 3500 2500 28.57% +0 0 swapper/7 7 173000 4200 97.57% +1028217 1028217 node 14 15600 22400 0.00% +[...] +Total References: 7139900 Total Misses: 1413900 Hit Rate: 80.20% + +This shows each TID`s cache hit rate during the 10 seconds run period. 
USAGE message: @@ -54,3 +69,4 @@ positional arguments: -c SAMPLE_PERIOD, --sample_period SAMPLE_PERIOD Sample one in this many number of cache reference and miss events + -t, --tid Summarize cache references and misses by PID/TID diff --git a/tools/profile.py b/tools/profile.py index 47d2adf297fe..43afacc5fdba 100755 --- a/tools/profile.py +++ b/tools/profile.py @@ -335,21 +335,21 @@ def aksym(addr): # print folded stack output user_stack = list(user_stack) kernel_stack = list(kernel_stack) - line = [k.name] + line = [k.name.decode('utf-8', 'replace')] # if we failed to get the stack is, such as due to no space (-ENOMEM) or # hash collision (-EEXIST), we still print a placeholder for consistency if not args.kernel_stacks_only: if stack_id_err(k.user_stack_id): - line.append(b"[Missed User Stack]") + line.append("[Missed User Stack]") else: - line.extend([b.sym(addr, k.pid) for addr in reversed(user_stack)]) + line.extend([b.sym(addr, k.pid).decode('utf-8', 'replace') for addr in reversed(user_stack)]) if not args.user_stacks_only: - line.extend([b"-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else []) + line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else []) if stack_id_err(k.kernel_stack_id): - line.append(b"[Missed Kernel Stack]") + line.append("[Missed Kernel Stack]") else: - line.extend([aksym(addr) for addr in reversed(kernel_stack)]) - print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value)) + line.extend([aksym(addr).decode('utf-8', 'replace') for addr in reversed(kernel_stack)]) + print("%s %d" % (";".join(line), v.value)) else: # print default multi-line stack output if not args.user_stacks_only: @@ -357,7 +357,7 @@ def aksym(addr): print(" [Missed Kernel Stack]") else: for addr in kernel_stack: - print(" %s" % aksym(addr)) + print(" %s" % aksym(addr).decode('utf-8', 'replace')) if not args.kernel_stacks_only: if need_delimiter and k.user_stack_id >= 0 and 
k.kernel_stack_id >= 0: print(" --") diff --git a/tools/stackcount.py b/tools/stackcount.py index 8b7ca0087838..cea0e9e2785c 100755 --- a/tools/stackcount.py +++ b/tools/stackcount.py @@ -292,18 +292,18 @@ def _print_kframe(self, addr): if self.args.verbose: print("%-16x " % addr, end="") if self.args.offset: - print("%s" % self.probe.bpf.ksym(addr, show_offset=True)) + print("%s" % self.probe.bpf.ksym(addr, show_offset=True).decode()) else: - print("%s" % self.probe.bpf.ksym(addr)) + print("%s" % self.probe.bpf.ksym(addr).decode()) def _print_uframe(self, addr, pid): print(" ", end="") if self.args.verbose: print("%-16x " % addr, end="") if self.args.offset: - print("%s" % self.probe.bpf.sym(addr, pid, show_offset=True)) + print("%s" % self.probe.bpf.sym(addr, pid, show_offset=True).decode()) else: - print("%s" % self.probe.bpf.sym(addr, pid)) + print("%s" % self.probe.bpf.sym(addr, pid).decode()) @staticmethod def _signal_ignore(signal, frame):