diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index a4af5992ed67..116dd3807774 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -69,7 +69,8 @@ BPF attached to cgroups as device controller | 4.15 | [`ebc614f68736`](https://g bpf2bpf function calls | 4.16 | [`cc8b0b92a169`](https://github.com/torvalds/linux/commit/cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d) BPF used for monitoring socket RX/TX data | 4.17 | [`4f738adba30a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4f738adba30a7cfc006f605707e7aee847ffefa0) BPF attached to raw tracepoints | 4.17 | [`c4f6699dfcb8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c4f6699dfcb8558d138fe838f741b2c10f416cf9) -BPF attached to `bind()` system call | 4.17 | [`4fbac77d2d09`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4fbac77d2d092b475dda9eea66da674369665427) +BPF attached to `bind()` system call | 4.17 | [`4fbac77d2d09`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4fbac77d2d092b475dda9eea66da674369665427) [`aac3fc320d94`](https://github.com/torvalds/linux/commit/aac3fc320d9404f2665a8b1249dc3170d5fa3caf) +BPF attached to `connect()` system call | 4.17 | [`d74bad4e74ee`](https://github.com/torvalds/linux/commit/d74bad4e74ee373787a9ae24197c17b7cdc428d5) BPF Type Format (BTF) | 4.18 | [`69b693f0aefa`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=69b693f0aefa0ed521e8bd02260523b5ae446ad7) AF_XDP | 4.18 | [`fbfc504a24f5`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fbfc504a24f53f7ebe128ab55cb5dba634f4ece8) bpfilter | 4.18 | [`d2ba09c17a06`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=d2ba09c17a0647f899d6c20a11bab9e6d3382f07) diff --git a/libbpf-tools/.gitignore b/libbpf-tools/.gitignore index 561f94ecef1c..251646397529 100644 --- a/libbpf-tools/.gitignore +++ b/libbpf-tools/.gitignore @@ 
-29,6 +29,7 @@ /llcstat /nfsdist /nfsslower +/mdflush /mountsnoop /numamove /offcputime diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile index e60ec409aa3f..e753230cee89 100644 --- a/libbpf-tools/Makefile +++ b/libbpf-tools/Makefile @@ -42,6 +42,7 @@ APPS = \ klockstat \ ksnoop \ llcstat \ + mdflush \ mountsnoop \ numamove \ offcputime \ @@ -79,6 +80,15 @@ COMMON_OBJ = \ $(if $(ENABLE_MIN_CORE_BTFS),$(OUTPUT)/min_core_btf_tar.o) \ # +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + .PHONY: all all: $(APPS) $(APP_ALIASES) @@ -91,6 +101,13 @@ msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; MAKEFLAGS += --no-print-directory endif +ifneq ($(EXTRA_CFLAGS),) +CFLAGS += $(EXTRA_CFLAGS) +endif +ifneq ($(EXTRA_LDFLAGS),) +LDFLAGS += $(EXTRA_LDFLAGS) +endif + .PHONY: clean clean: $(call msg,CLEAN) @@ -103,7 +120,7 @@ $(OUTPUT) $(OUTPUT)/libbpf: .PHONY: bpftool bpftool: $(Q)mkdir -p $(OUTPUT)/bpftool - $(Q)$(MAKE) OUTPUT=$(OUTPUT)/bpftool/ -C $(BPFTOOL_SRC) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(OUTPUT)/bpftool/ -C $(BPFTOOL_SRC) $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) $(COMMON_OBJ) | $(OUTPUT) $(call msg,BINARY,$@) diff --git a/libbpf-tools/biolatency.bpf.c b/libbpf-tools/biolatency.bpf.c index b9e87c393494..4d59d5f8db4d 100644 --- a/libbpf-tools/biolatency.bpf.c +++ b/libbpf-tools/biolatency.bpf.c @@ -35,7 +35,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct request *); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } start SEC(".maps"); static struct hist initial_hist; @@ -45,7 +44,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct hist_key); __type(value, struct hist); - __uint(map_flags, BPF_F_NO_PREALLOC); } hists SEC(".maps"); static __always_inline diff --git 
a/libbpf-tools/biopattern.bpf.c b/libbpf-tools/biopattern.bpf.c index bf051bc320a8..2f099be77ee8 100644 --- a/libbpf-tools/biopattern.bpf.c +++ b/libbpf-tools/biopattern.bpf.c @@ -14,7 +14,6 @@ struct { __uint(max_entries, 64); __type(key, u32); __type(value, struct counter); - __uint(map_flags, BPF_F_NO_PREALLOC); } counters SEC(".maps"); SEC("tracepoint/block/block_rq_complete") diff --git a/libbpf-tools/biosnoop.bpf.c b/libbpf-tools/biosnoop.bpf.c index a29af98de455..b7e711e059ec 100644 --- a/libbpf-tools/biosnoop.bpf.c +++ b/libbpf-tools/biosnoop.bpf.c @@ -36,7 +36,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct request *); __type(value, struct piddata); - __uint(map_flags, BPF_F_NO_PREALLOC); } infobyreq SEC(".maps"); struct stage { diff --git a/libbpf-tools/biosnoop.c b/libbpf-tools/biosnoop.c index f0f665a6db53..988d82566ecf 100644 --- a/libbpf-tools/biosnoop.c +++ b/libbpf-tools/biosnoop.c @@ -167,7 +167,7 @@ void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) start_ts = e->ts; blk_fill_rwbs(rwbs, e->cmd_flags); partition = partitions__get_by_dev(partitions, e->dev); - printf("%-11.6f %-14.14s %-6d %-7s %-4s %-10lld %-7d ", + printf("%-11.6f %-14.14s %-7d %-7s %-4s %-10lld %-7d ", (e->ts - start_ts) / 1000000000.0, e->comm, e->pid, partition ? 
partition->name : "Unknown", rwbs, e->sector, e->len); @@ -230,6 +230,13 @@ int main(int argc, char **argv) obj->rodata->targ_queued = env.queued; obj->rodata->filter_cg = env.cg; + if (fentry_can_attach("blk_account_io_start", NULL)) + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + "blk_account_io_start"); + else + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + "__blk_account_io_start"); + err = biosnoop_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); @@ -304,7 +311,7 @@ int main(int argc, char **argv) goto cleanup; } - printf("%-11s %-14s %-6s %-7s %-4s %-10s %-7s ", + printf("%-11s %-14s %-7s %-7s %-4s %-10s %-7s ", "TIME(s)", "COMM", "PID", "DISK", "T", "SECTOR", "BYTES"); if (env.queued) printf("%7s ", "QUE(ms)"); diff --git a/libbpf-tools/biostacks.bpf.c b/libbpf-tools/biostacks.bpf.c index c13975fa6c4c..dd9fec1c8dee 100644 --- a/libbpf-tools/biostacks.bpf.c +++ b/libbpf-tools/biostacks.bpf.c @@ -28,7 +28,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct request *); __type(value, struct internal_rqinfo); - __uint(map_flags, BPF_F_NO_PREALLOC); } rqinfos SEC(".maps"); struct { @@ -36,7 +35,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct rqinfo); __type(value, struct hist); - __uint(map_flags, BPF_F_NO_PREALLOC); } hists SEC(".maps"); static struct hist zero; diff --git a/libbpf-tools/biostacks.c b/libbpf-tools/biostacks.c index 260bc235eed6..2a25869dc461 100644 --- a/libbpf-tools/biostacks.c +++ b/libbpf-tools/biostacks.c @@ -173,6 +173,18 @@ int main(int argc, char **argv) obj->rodata->targ_ms = env.milliseconds; + if (fentry_can_attach("blk_account_io_start", NULL)) { + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + "blk_account_io_start"); + bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0, + "blk_account_io_done"); + } else { + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + 
"__blk_account_io_start"); + bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0, + "__blk_account_io_done"); + } + err = biostacks_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); diff --git a/libbpf-tools/bitesize.bpf.c b/libbpf-tools/bitesize.bpf.c index 46e9c48b85da..a246f635c11d 100644 --- a/libbpf-tools/bitesize.bpf.c +++ b/libbpf-tools/bitesize.bpf.c @@ -22,7 +22,6 @@ struct { __uint(max_entries, 10240); __type(key, struct hist_key); __type(value, struct hist); - __uint(map_flags, BPF_F_NO_PREALLOC); } hists SEC(".maps"); static struct hist initial_hist; diff --git a/libbpf-tools/core_fixes.bpf.h b/libbpf-tools/core_fixes.bpf.h index 762445e88587..33a4f7f78311 100644 --- a/libbpf-tools/core_fixes.bpf.h +++ b/libbpf-tools/core_fixes.bpf.h @@ -15,15 +15,34 @@ */ struct task_struct___x { unsigned int __state; -}; +} __attribute__((preserve_access_index)); -static __s64 get_task_state(void *task) +/** + * commit 309dca309fc3 ("block: store a block_device pointer in struct bio") + * adds a new member bi_bdev which is a pointer to struct block_device + * see: + * https://github.com/torvalds/linux/commit/309dca309fc3 + */ +struct bio___x { + struct block_device *bi_bdev; +} __attribute__((preserve_access_index)); + +static __always_inline __s64 get_task_state(void *task) { struct task_struct___x *t = task; if (bpf_core_field_exists(t->__state)) - return t->__state; - return ((struct task_struct *)task)->state; + return BPF_CORE_READ(t, __state); + return BPF_CORE_READ((struct task_struct *)task, state); +} + +static __always_inline struct gendisk *get_gendisk(void *bio) +{ + struct bio___x *b = bio; + + if (bpf_core_field_exists(b->bi_bdev)) + return BPF_CORE_READ(b, bi_bdev, bd_disk); + return BPF_CORE_READ((struct bio *)bio, bi_disk); } #endif /* __CORE_FIXES_BPF_H */ diff --git a/libbpf-tools/klockstat.bpf.c b/libbpf-tools/klockstat.bpf.c index b8483d91c026..26371c6846c4 100644 --- a/libbpf-tools/klockstat.bpf.c 
+++ b/libbpf-tools/klockstat.bpf.c @@ -14,6 +14,7 @@ const volatile pid_t targ_tgid = 0; const volatile pid_t targ_pid = 0; void *const volatile targ_lock = NULL; +const volatile int per_thread = 0; struct { __uint(type, BPF_MAP_TYPE_STACK_TRACE); @@ -147,6 +148,10 @@ static void account(struct lockholder_info *li) { struct lock_stat *ls; u64 delta; + u32 key = li->stack_id; + + if (per_thread) + key = li->task_id; /* * Multiple threads may have the same stack_id. Even though we are @@ -155,15 +160,19 @@ static void account(struct lockholder_info *li) * by multiple readers at the same time. They will be accounted as * the same lock, which is what we want, but we need to use atomics to * avoid corruption, especially for the total_time variables. + * But it should be ok for per-thread since it's not racy anymore. */ - ls = bpf_map_lookup_elem(&stat_map, &li->stack_id); + ls = bpf_map_lookup_elem(&stat_map, &key); if (!ls) { struct lock_stat fresh = {0}; - bpf_map_update_elem(&stat_map, &li->stack_id, &fresh, BPF_ANY); - ls = bpf_map_lookup_elem(&stat_map, &li->stack_id); + bpf_map_update_elem(&stat_map, &key, &fresh, BPF_ANY); + ls = bpf_map_lookup_elem(&stat_map, &key); if (!ls) return; + + if (per_thread) + bpf_get_current_comm(ls->acq_max_comm, TASK_COMM_LEN); } delta = li->acq_at - li->try_at; @@ -176,7 +185,8 @@ static void account(struct lockholder_info *li) * Potentially racy, if multiple threads think they are the max, * so you may get a clobbered write. 
*/ - bpf_get_current_comm(ls->acq_max_comm, TASK_COMM_LEN); + if (!per_thread) + bpf_get_current_comm(ls->acq_max_comm, TASK_COMM_LEN); } delta = li->rel_at - li->acq_at; @@ -185,7 +195,8 @@ static void account(struct lockholder_info *li) if (delta > READ_ONCE(ls->hld_max_time)) { WRITE_ONCE(ls->hld_max_time, delta); WRITE_ONCE(ls->hld_max_id, li->task_id); - bpf_get_current_comm(ls->hld_max_comm, TASK_COMM_LEN); + if (!per_thread) + bpf_get_current_comm(ls->hld_max_comm, TASK_COMM_LEN); } } diff --git a/libbpf-tools/klockstat.c b/libbpf-tools/klockstat.c index 4c733a90bb55..6b5f377f96ea 100644 --- a/libbpf-tools/klockstat.c +++ b/libbpf-tools/klockstat.c @@ -55,6 +55,7 @@ static struct prog_env { bool reset; bool timestamp; bool verbose; + bool per_thread; } env = { .nr_locks = 99999999, .nr_stack_entries = 1, @@ -71,7 +72,7 @@ static const char args_doc[] = "FUNCTION"; static const char program_doc[] = "Trace mutex/sem lock acquisition and hold times, in nsec\n" "\n" -"Usage: klockstat [-hRTv] [-p PID] [-t TID] [-c FUNC] [-L LOCK] [-n NR_LOCKS]\n" +"Usage: klockstat [-hPRTv] [-p PID] [-t TID] [-c FUNC] [-L LOCK] [-n NR_LOCKS]\n" " [-s NR_STACKS] [-S SORT] [-d DURATION] [-i INTERVAL]\n" "\v" "Examples:\n" @@ -86,8 +87,9 @@ static const char program_doc[] = " klockstat -S acq_count # sort lock acquired results by acquire count\n" " klockstat -S hld_total # sort lock held results by total held time\n" " klockstat -S acq_count,hld_total # combination of above\n" -" klockstat -n 3 # display top 3 locks\n" +" klockstat -n 3 # display top 3 locks/threads\n" " klockstat -s 6 # display 6 stack entries per lock\n" +" klockstat -P # print stats per thread\n" ; static const struct argp_option opts[] = { @@ -97,7 +99,7 @@ static const struct argp_option opts[] = { { "caller", 'c', "FUNC", 0, "Filter by caller string prefix" }, { "lock", 'L', "LOCK", 0, "Filter by specific ksym lock name" }, { 0, 0, 0, 0, "" }, - { "locks", 'n', "NR_LOCKS", 0, "Number of locks to print" }, + { 
"locks", 'n', "NR_LOCKS", 0, "Number of locks or threads to print" }, { "stacks", 's', "NR_STACKS", 0, "Number of stack entries to print per lock" }, { "sort", 'S', "SORT", 0, "Sort by field:\n acq_[max|total|count]\n hld_[max|total|count]" }, { 0, 0, 0, 0, "" }, @@ -106,6 +108,7 @@ static const struct argp_option opts[] = { { "reset", 'R', NULL, 0, "Reset stats each interval" }, { "timestamp", 'T', NULL, 0, "Print timestamp" }, { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { "per-thread", 'P', NULL, 0, "Print per-thread stats" }, { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, {}, @@ -229,6 +232,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'T': env->timestamp = true; break; + case 'P': + env->per_thread = true; + break; case 'h': argp_state_help(state, stderr, ARGP_HELP_STD_HELP); break; @@ -241,6 +247,10 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) env->interval = env->duration; env->iterations = env->duration / env->interval; } + if (env->per_thread && env->nr_stack_entries != 1) { + warn("--per-thread and --stacks cannot be used together\n"); + argp_usage(state); + } break; default: return ARGP_ERR_UNKNOWN; @@ -327,62 +337,140 @@ static char *symname(struct ksyms *ksyms, uint64_t pc, char *buf, size_t n) return buf; } +static char *print_caller(char *buf, int size, struct stack_stat *ss) +{ + snprintf(buf, size, "%u %16s", ss->stack_id, ss->ls.acq_max_comm); + return buf; +} + +static char *print_time(char *buf, int size, uint64_t nsec) +{ + struct { + float base; + char *unit; + } table[] = { + { 1e9 * 3600, "h " }, + { 1e9 * 60, "m " }, + { 1e9, "s " }, + { 1e6, "ms" }, + { 1e3, "us" }, + { 0, NULL }, + }; + + for (int i = 0; table[i].base; i++) { + if (nsec < table[i].base) + continue; + + snprintf(buf, size, "%.1f %s", nsec / table[i].base, table[i].unit); + return buf; + } + + snprintf(buf, size, "%u ns", (unsigned)nsec); + return buf; +} + static void 
print_acq_header(void) { - printf("\n Caller Avg Wait Count Max Wait Total Wait\n"); + if (env.per_thread) + printf("\n Tid Comm"); + else + printf("\n Caller"); + + printf(" Avg Wait Count Max Wait Total Wait\n"); } static void print_acq_stat(struct ksyms *ksyms, struct stack_stat *ss, int nr_stack_entries) { char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; int i; - printf("%37s %9llu %8llu %10llu %12llu\n", + printf("%37s %9s %8llu %10s %12s\n", symname(ksyms, ss->bt[0], buf, sizeof(buf)), - ss->ls.acq_total_time / ss->ls.acq_count, + print_time(avg, sizeof(avg), ss->ls.acq_total_time / ss->ls.acq_count), ss->ls.acq_count, - ss->ls.acq_max_time, - ss->ls.acq_total_time); + print_time(max, sizeof(max), ss->ls.acq_max_time), + print_time(tot, sizeof(tot), ss->ls.acq_total_time)); for (i = 1; i < nr_stack_entries; i++) { - if (!ss->bt[i]) + if (!ss->bt[i] || env.per_thread) break; printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf))); } - if (nr_stack_entries > 1) + if (nr_stack_entries > 1 && !env.per_thread) printf(" Max PID %llu, COMM %s\n", ss->ls.acq_max_id >> 32, ss->ls.acq_max_comm); } +static void print_acq_task(struct stack_stat *ss) +{ + char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; + + printf("%37s %9s %8llu %10s %12s\n", + print_caller(buf, sizeof(buf), ss), + print_time(avg, sizeof(avg), ss->ls.acq_total_time / ss->ls.acq_count), + ss->ls.acq_count, + print_time(max, sizeof(max), ss->ls.acq_max_time), + print_time(tot, sizeof(tot), ss->ls.acq_total_time)); +} + static void print_hld_header(void) { - printf("\n Caller Avg Hold Count Max Hold Total Hold\n"); + if (env.per_thread) + printf("\n Tid Comm"); + else + printf("\n Caller"); + + printf(" Avg Hold Count Max Hold Total Hold\n"); } static void print_hld_stat(struct ksyms *ksyms, struct stack_stat *ss, int nr_stack_entries) { char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; int i; - printf("%37s %9llu %8llu %10llu %12llu\n", + printf("%37s %9s %8llu 
%10s %12s\n", symname(ksyms, ss->bt[0], buf, sizeof(buf)), - ss->ls.hld_total_time / ss->ls.hld_count, + print_time(avg, sizeof(avg), ss->ls.hld_total_time / ss->ls.hld_count), ss->ls.hld_count, - ss->ls.hld_max_time, - ss->ls.hld_total_time); + print_time(max, sizeof(max), ss->ls.hld_max_time), + print_time(tot, sizeof(tot), ss->ls.hld_total_time)); for (i = 1; i < nr_stack_entries; i++) { - if (!ss->bt[i]) + if (!ss->bt[i] || env.per_thread) break; printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf))); } - if (nr_stack_entries > 1) + if (nr_stack_entries > 1 && !env.per_thread) printf(" Max PID %llu, COMM %s\n", ss->ls.hld_max_id >> 32, ss->ls.hld_max_comm); } +static void print_hld_task(struct stack_stat *ss) +{ + char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; + + printf("%37s %9s %8llu %10s %12s\n", + print_caller(buf, sizeof(buf), ss), + print_time(avg, sizeof(avg), ss->ls.hld_total_time / ss->ls.hld_count), + ss->ls.hld_count, + print_time(max, sizeof(max), ss->ls.hld_max_time), + print_time(tot, sizeof(tot), ss->ls.hld_total_time)); +} + static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map) { struct stack_stat **stats, *ss; @@ -391,6 +479,7 @@ static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map) uint32_t lookup_key = 0; uint32_t stack_id; int ret, i; + int nr_stack_entries; stats = calloc(stats_sz, sizeof(void *)); if (!stats) { @@ -426,31 +515,39 @@ static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map) free(ss); continue; } - if (bpf_map_lookup_elem(stack_map, &stack_id, &ss->bt)) { + if (!env.per_thread && bpf_map_lookup_elem(stack_map, &stack_id, &ss->bt)) { /* Can still report the results without a backtrace. 
*/ warn("failed to lookup stack_id %u\n", stack_id); } - if (!caller_is_traced(ksyms, ss->bt[0])) { + if (!env.per_thread && !caller_is_traced(ksyms, ss->bt[0])) { free(ss); continue; } stats[stat_idx++] = ss; } + nr_stack_entries = MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH); + qsort(stats, stat_idx, sizeof(void*), sort_by_acq); for (i = 0; i < MIN(env.nr_locks, stat_idx); i++) { if (i == 0 || env.nr_stack_entries > 1) print_acq_header(); - print_acq_stat(ksyms, stats[i], - MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH)); + + if (env.per_thread) + print_acq_task(stats[i]); + else + print_acq_stat(ksyms, stats[i], nr_stack_entries); } qsort(stats, stat_idx, sizeof(void*), sort_by_hld); for (i = 0; i < MIN(env.nr_locks, stat_idx); i++) { if (i == 0 || env.nr_stack_entries > 1) print_hld_header(); - print_hld_stat(ksyms, stats[i], - MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH)); + + if (env.per_thread) + print_hld_task(stats[i]); + else + print_hld_stat(ksyms, stats[i], nr_stack_entries); } for (i = 0; i < stat_idx; i++) @@ -533,6 +630,7 @@ int main(int argc, char **argv) obj->rodata->targ_tgid = env.pid; obj->rodata->targ_pid = env.tid; obj->rodata->targ_lock = lock_addr; + obj->rodata->per_thread = env.per_thread; if (fentry_can_attach("mutex_lock_nested", NULL)) { bpf_program__set_attach_target(obj->progs.mutex_lock, 0, @@ -598,6 +696,7 @@ int main(int argc, char **argv) warn("print_stats error, aborting.\n"); break; } + fflush(stdout); } printf("Exiting trace of mutex/sem locks\n"); diff --git a/libbpf-tools/mdflush.bpf.c b/libbpf-tools/mdflush.bpf.c new file mode 100644 index 000000000000..8eac536a2fec --- /dev/null +++ b/libbpf-tools/mdflush.bpf.c @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021~2022 Hengqi Chen */ +#include +#include +#include +#include +#include "core_fixes.bpf.h" +#include "mdflush.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, __u32); + __type(value, 
__u32); +} events SEC(".maps"); + +SEC("fentry/md_flush_request") +int BPF_PROG(md_flush_request, void *mddev, void *bio) +{ + __u64 pid = bpf_get_current_pid_tgid() >> 32; + struct event event = {}; + struct gendisk *gendisk; + + event.pid = pid; + gendisk = get_gendisk(bio); + BPF_CORE_READ_STR_INTO(event.disk, gendisk, disk_name); + bpf_get_current_comm(event.comm, sizeof(event.comm)); + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/libbpf-tools/mdflush.c b/libbpf-tools/mdflush.c new file mode 100644 index 000000000000..0f23a0a72395 --- /dev/null +++ b/libbpf-tools/mdflush.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * mdflush Trace md flush events. + * + * Copyright (c) 2021~2022 Hengqi Chen + * + * Based on mdflush(8) from BCC by Brendan Gregg. + * 08-Nov-2021 Hengqi Chen Created this. + */ +#include <argp.h> +#include <signal.h> +#include <stdio.h> +#include <string.h> +#include <time.h> + +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "mdflush.h" +#include "mdflush.skel.h" +#include "trace_helpers.h" + +#define PERF_BUFFER_PAGES 16 +#define PERF_POLL_TIMEOUT_MS 100 +#define warn(...)
fprintf(stderr, __VA_ARGS__) + +static volatile sig_atomic_t exiting = 0; +static bool verbose = false; + +const char *argp_program_version = "mdflush 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; +const char argp_program_doc[] = +"Trace md flush events.\n" +"\n" +"USAGE: mdflush\n"; + +static const struct argp_option opts[] = { + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + verbose = true; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static void sig_int(int signo) +{ + exiting = 1; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + struct event *e = data; + time_t t; + struct tm *tm; + char ts[32]; + + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%-8s %-7d %-16s %-s\n", + ts, e->pid, e->comm, e->disk); +} + +static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + warn("lost %llu events on CPU #%d\n", lost_cnt, cpu); +} + +int main(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + struct perf_buffer *pb = NULL; + struct mdflush_bpf *obj; + int err; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + + obj = mdflush_bpf__open_and_load(); + if (!obj) { + warn("failed to open/load BPF object\n"); + return 1; + } + + err = mdflush_bpf__attach(obj); + 
if (err) { + warn("failed to attach BPF programs: %d\n", err); + goto cleanup; + } + + pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, + handle_event, handle_lost_events, NULL, NULL); + if (!pb) { + warn("failed to open perf buffer: %d\n", err); + goto cleanup; + } + + if (signal(SIGINT, sig_int) == SIG_ERR) { + warn("can't set signal handler: %s\n", strerror(errno)); + err = 1; + goto cleanup; + } + + printf("Tracing md flush requests... Hit Ctrl-C to end.\n"); + printf("%-8s %-7s %-16s %-s\n", + "TIME", "PID", "COMM", "DEVICE"); + + while (!exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + warn("error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; + } + /* reset err to return 0 if exiting */ + err = 0; + } + +cleanup: + perf_buffer__free(pb); + mdflush_bpf__destroy(obj); + + return err != 0; +} diff --git a/libbpf-tools/mdflush.h b/libbpf-tools/mdflush.h new file mode 100644 index 000000000000..18cd723a7d04 --- /dev/null +++ b/libbpf-tools/mdflush.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021~2022 Hengqi Chen */ +#ifndef __MDFLUSH_H +#define __MDFLUSH_H + +#define TASK_COMM_LEN 16 +#define DISK_NAME_LEN 32 + +struct event { + __u32 pid; + char comm[TASK_COMM_LEN]; + char disk[DISK_NAME_LEN]; +}; + +#endif /* __MDFLUSH_H */ diff --git a/libbpf-tools/numamove.bpf.c b/libbpf-tools/numamove.bpf.c index 69d8d5f90719..62c2d714e489 100644 --- a/libbpf-tools/numamove.bpf.c +++ b/libbpf-tools/numamove.bpf.c @@ -9,7 +9,6 @@ struct { __uint(max_entries, 10240); __type(key, u32); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } start SEC(".maps"); __u64 latency = 0; diff --git a/libbpf-tools/readahead.bpf.c b/libbpf-tools/readahead.bpf.c index b9423c3f9a49..89863e67d60d 100644 --- a/libbpf-tools/readahead.bpf.c +++ b/libbpf-tools/readahead.bpf.c @@ -13,7 +13,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, 
u32); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } in_readahead SEC(".maps"); struct { @@ -21,7 +20,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct page *); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } birth SEC(".maps"); struct hist hist = {}; diff --git a/libbpf-tools/syscount.bpf.c b/libbpf-tools/syscount.bpf.c index d6a98323df23..6209feeaa023 100644 --- a/libbpf-tools/syscount.bpf.c +++ b/libbpf-tools/syscount.bpf.c @@ -28,7 +28,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, u32); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } start SEC(".maps"); struct { @@ -36,7 +35,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, u32); __type(value, struct data_t); - __uint(map_flags, BPF_F_NO_PREALLOC); } data SEC(".maps"); static __always_inline diff --git a/libbpf-tools/tcpconnect.bpf.c b/libbpf-tools/tcpconnect.bpf.c index 7ee8a301c262..a13d48c239f2 100644 --- a/libbpf-tools/tcpconnect.bpf.c +++ b/libbpf-tools/tcpconnect.bpf.c @@ -11,7 +11,7 @@ #include "maps.bpf.h" #include "tcpconnect.h" -SEC(".rodata") int filter_ports[MAX_PORTS]; +const volatile int filter_ports[MAX_PORTS]; const volatile int filter_ports_len = 0; const volatile uid_t filter_uid = -1; const volatile pid_t filter_pid = 0; @@ -26,7 +26,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, u32); __type(value, struct sock *); - __uint(map_flags, BPF_F_NO_PREALLOC); } sockets SEC(".maps"); struct { @@ -34,7 +33,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct ipv4_flow_key); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } ipv4_count SEC(".maps"); struct { @@ -42,7 +40,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct ipv6_flow_key); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } ipv6_count SEC(".maps"); struct { diff --git a/man/man8/biolatency.8 b/man/man8/biolatency.8 index c13f6c8ad036..db2ef48429e9 100644 --- a/man/man8/biolatency.8 +++ 
b/man/man8/biolatency.8 @@ -2,7 +2,7 @@ .SH NAME biolatency \- Summarize block device I/O latency as a histogram. .SH SYNOPSIS -.B biolatency [\-h] [\-F] [\-T] [\-Q] [\-m] [\-D] [\-e] [interval [count]] +.B biolatency [\-h] [\-T] [\-Q] [\-m] [\-D] [\-F] [\-e] [\-j] [\-d DISK] [interval [count]] .SH DESCRIPTION biolatency traces block device I/O (disk I/O), and records the distribution of I/O latency (time). This is printed as a histogram either on Ctrl-C, or @@ -42,6 +42,9 @@ Print a histogram dictionary \-e Show extension summary(total, average) .TP +\-d DISK +Trace this disk only +.TP interval Output interval, in seconds. .TP @@ -108,6 +111,6 @@ Linux .SH STABILITY Unstable - in development. .SH AUTHOR -Brendan Gregg +Brendan Gregg, Rocky Xing .SH SEE ALSO biosnoop(8) diff --git a/man/man8/funcinterval.8 b/man/man8/funcinterval.8 old mode 100755 new mode 100644 index 8a60399871e7..77128290b04b --- a/man/man8/funcinterval.8 +++ b/man/man8/funcinterval.8 @@ -8,7 +8,7 @@ This tool times interval between the same function as a histogram. eBPF/bcc is very suitable for platform performance tuning. By funclatency, we can profile specific functions to know how latency -this function costs. However, sometimes performance drop is not about the +this function costs. However, sometimes performance drop is not about the latency of function but the interval between function calls. funcinterval is born for this purpose.
diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index 4e8ff10429ed..1ff33c8a79a1 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -404,7 +404,7 @@ int BPFModule::create_maps(std::map> &map_tids, } if (pinned_id <= 0) { - struct bpf_create_map_attr attr = {}; + struct bcc_create_map_attr attr = {}; attr.map_type = (enum bpf_map_type)map_type; attr.name = map_name; attr.key_size = key_size; diff --git a/src/cc/bpf_module_rw_engine.cc b/src/cc/bpf_module_rw_engine.cc index 6e0fcb74badd..52c877e4d657 100644 --- a/src/cc/bpf_module_rw_engine.cc +++ b/src/cc/bpf_module_rw_engine.cc @@ -410,7 +410,7 @@ int BPFModule::annotate() { table_names_[table.name] = id++; GlobalValue *gvar = mod_->getNamedValue(table.name); if (!gvar) continue; -#if LLVM_MAJOR_VERSION >= 15 +#if LLVM_MAJOR_VERSION >= 14 { Type *t = gvar->getValueType(); StructType *st = dyn_cast(t); diff --git a/src/cc/libbpf b/src/cc/libbpf index 86eb09863c1c..4cb682229d0c 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit 86eb09863c1c0177e99c2c703092042d3cdba910 +Subproject commit 4cb682229d0ca9ef32fe191f00b5ce31fd050a66 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 467e06f979b1..d3ee8ca5d77f 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -307,7 +307,7 @@ static uint64_t ptr_to_u64(void *ptr) return (uint64_t) (unsigned long) ptr; } -static int libbpf_bpf_map_create(struct bpf_create_map_attr *create_attr) +static int libbpf_bpf_map_create(struct bcc_create_map_attr *create_attr) { LIBBPF_OPTS(bpf_map_create_opts, p); @@ -326,7 +326,7 @@ static int libbpf_bpf_map_create(struct bpf_create_map_attr *create_attr) create_attr->value_size, create_attr->max_entries, &p); } -int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit) +int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit) { unsigned name_len = attr->name ? 
strlen(attr->name) : 0; char map_name[BPF_OBJ_NAME_LEN] = {}; @@ -383,7 +383,7 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags) { - struct bpf_create_map_attr attr = {}; + struct bcc_create_map_attr attr = {}; attr.map_type = map_type; attr.name = name; diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index e001d740f323..c5ea40a505a0 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -27,7 +27,23 @@ extern "C" { #endif -struct bpf_create_map_attr; +struct bcc_create_map_attr { + const char *name; + enum bpf_map_type map_type; + __u32 map_flags; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 numa_node; + __u32 btf_fd; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 map_ifindex; + union { + __u32 inner_map_fd; + __u32 btf_vmlinux_value_type_id; + }; +}; struct bpf_load_program_attr; enum bpf_probe_attach_type { @@ -44,7 +60,7 @@ struct bcc_perf_buffer_opts { int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags); -int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit); +int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index c9f1030539b9..55c968aedc5a 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -828,7 +828,8 @@ def attach_kprobe(self, event=b"", event_off=0, fn_name=b"", event_re=b""): failed += 1 probes.append(line) if failed == len(matches): - raise Exception("Failed to attach BPF program %s to kprobe %s" % + raise Exception("Failed to attach BPF program %s to kprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, 
'/'.join(probes))) return @@ -837,7 +838,8 @@ def attach_kprobe(self, event=b"", event_off=0, fn_name=b"", event_re=b""): ev_name = b"p_" + event.replace(b"+", b"_").replace(b".", b"_") fd = lib.bpf_attach_kprobe(fn.fd, 0, ev_name, event, event_off, 0) if fd < 0: - raise Exception("Failed to attach BPF program %s to kprobe %s" % + raise Exception("Failed to attach BPF program %s to kprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, event)) self._add_kprobe_fd(ev_name, fn_name, fd) return self @@ -860,7 +862,8 @@ def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b"", maxactive=0): failed += 1 probes.append(line) if failed == len(matches): - raise Exception("Failed to attach BPF program %s to kretprobe %s" % + raise Exception("Failed to attach BPF program %s to kretprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, '/'.join(probes))) return @@ -869,7 +872,8 @@ def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b"", maxactive=0): ev_name = b"r_" + event.replace(b"+", b"_").replace(b".", b"_") fd = lib.bpf_attach_kprobe(fn.fd, 1, ev_name, event, 0, maxactive) if fd < 0: - raise Exception("Failed to attach BPF program %s to kretprobe %s" % + raise Exception("Failed to attach BPF program %s to kretprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, event)) self._add_kprobe_fd(ev_name, fn_name, fd) return self @@ -1736,6 +1740,20 @@ def add_module(modname): def donothing(self): """the do nothing exit handler""" + + def close(self): + """close(self) + + Closes all associated files descriptors. Attached BPF programs are not + detached. 
+ """ + for name, fn in list(self.funcs.items()): + os.close(fn.fd) + del self.funcs[name] + if self.module: + lib.bpf_module_destroy(self.module) + self.module = None + def cleanup(self): # Clean up opened probes for k, v in list(self.kprobe_fds.items()): @@ -1763,12 +1781,8 @@ def cleanup(self): if self.tracefile: self.tracefile.close() self.tracefile = None - for name, fn in list(self.funcs.items()): - os.close(fn.fd) - del self.funcs[name] - if self.module: - lib.bpf_module_destroy(self.module) - self.module = None + + self.close() # Clean up ringbuf if self._ringbuf_manager: diff --git a/tools/biolatency.py b/tools/biolatency.py index 9ece05025912..6f7719054d0f 100755 --- a/tools/biolatency.py +++ b/tools/biolatency.py @@ -4,18 +4,20 @@ # biolatency Summarize block device I/O latency as a histogram. # For Linux, uses BCC, eBPF. # -# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [interval] [count] +# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [-d DISK] [interval] [count] # # Copyright (c) 2015 Brendan Gregg. # Licensed under the Apache License, Version 2.0 (the "License") # # 20-Sep-2015 Brendan Gregg Created this. +# 31-Mar-2022 Rocky Xing Added disk filter support. 
from __future__ import print_function from bcc import BPF from time import sleep, strftime import argparse import ctypes as ct +import os # arguments examples = """examples: @@ -27,6 +29,7 @@ ./biolatency -F # show I/O flags separately ./biolatency -j # print a dictionary ./biolatency -e # show extension summary(total, average) + ./biolatency -d sdc # Trace sdc only """ parser = argparse.ArgumentParser( description="Summarize block device I/O latency as a histogram", @@ -52,6 +55,8 @@ help=argparse.SUPPRESS) parser.add_argument("-j", "--json", action="store_true", help="json output") +parser.add_argument("-d", "--disk", type=str, + help="Trace this disk only") args = parser.parse_args() countdown = int(args.count) @@ -87,6 +92,8 @@ // time block I/O int trace_req_start(struct pt_regs *ctx, struct request *req) { + DISK_FILTER + u64 ts = bpf_ktime_get_ns(); start.update(&req, &ts); return 0; @@ -149,6 +156,33 @@ storage_str += "BPF_HISTOGRAM(dist);" store_str += "dist.atomic_increment(bpf_log2l(delta));" +if args.disk is not None: + disk_path = os.path.join('/dev', args.disk) + if not os.path.exists(disk_path): + print("no such disk '%s'" % args.disk) + exit(1) + + stat_info = os.stat(disk_path) + major = os.major(stat_info.st_rdev) + minor = os.minor(stat_info.st_rdev) + + disk_field_str = "" + if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1: + disk_field_str = 'req->rq_disk' + else: + disk_field_str = 'req->q->disk' + + disk_filter_str = """ + struct gendisk *disk = %s; + if (!(disk->major == %d && disk->first_minor == %d)) { + return 0; + } + """ % (disk_field_str, major, minor) + + bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str) +else: + bpf_text = bpf_text.replace('DISK_FILTER', '') + if args.extension: storage_str += "BPF_ARRAY(extension, ext_val_t, 1);" bpf_text = bpf_text.replace('EXTENSION', """ diff --git a/tools/biolatency_example.txt b/tools/biolatency_example.txt index a88136b8a655..1bc8f591d405 100644 --- 
a/tools/biolatency_example.txt +++ b/tools/biolatency_example.txt @@ -352,24 +352,25 @@ The -j with -m prints a millisecond histogram dictionary. The `value_type` key i USAGE message: # ./biolatency -h -usage: biolatency.py [-h] [-T] [-Q] [-m] [-D] [-F] [-j] - [interval] [count] +usage: biolatency.py [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [-d DISK] + [interval] [count] Summarize block device I/O latency as a histogram positional arguments: - interval output interval, in seconds - count number of outputs + interval output interval, in seconds + count number of outputs optional arguments: - -h, --help show this help message and exit - -T, --timestamp include timestamp on output - -Q, --queued include OS queued time in I/O time - -m, --milliseconds millisecond histogram - -D, --disks print a histogram per disk device - -F, --flags print a histogram per set of I/O flags - -e, --extension also show extension summary(total, average) - -j, --json json output + -h, --help show this help message and exit + -T, --timestamp include timestamp on output + -Q, --queued include OS queued time in I/O time + -m, --milliseconds millisecond histogram + -D, --disks print a histogram per disk device + -F, --flags print a histogram per set of I/O flags + -e, --extension summarize average/total value + -j, --json json output + -d DISK, --disk DISK Trace this disk only examples: ./biolatency # summarize block I/O latency as a histogram @@ -380,3 +381,4 @@ examples: ./biolatency -F # show I/O flags separately ./biolatency -j # print a dictionary ./biolatency -e # show extension summary(total, average) + ./biolatency -d sdc # Trace sdc only diff --git a/tools/exitsnoop.py b/tools/exitsnoop.py index 42606bc6cd95..8b4947467c95 100755 --- a/tools/exitsnoop.py +++ b/tools/exitsnoop.py @@ -107,7 +107,7 @@ def _embedded_c(args): data.exit_time = bpf_ktime_get_ns(), data.pid = task->tgid, data.tid = task->pid, - data.ppid = task->parent->tgid, + data.ppid = task->real_parent->tgid, data.exit_code 
= task->exit_code >> 8, data.sig_info = task->exit_code & 0xFF, bpf_get_current_comm(&data.task, sizeof(data.task)); @@ -151,7 +151,7 @@ def _print_header(): print("%-13s" % title, end="") if Global.args.label is not None: print("%-6s" % "LABEL", end="") - print("%-16s %-6s %-6s %-6s %-7s %-10s" % + print("%-16s %-7s %-7s %-7s %-7s %-10s" % ("PCOMM", "PID", "PPID", "TID", "AGE(s)", "EXIT_CODE")) buffer = None @@ -167,7 +167,7 @@ def _print_event(cpu, data, size): # callback label = Global.args.label if len(Global.args.label) else 'exit' print("%-6s" % label, end="") age = (e.exit_time - e.start_time) / 1e9 - print("%-16s %-6d %-6d %-6d %-7.2f " % + print("%-16s %-7d %-7d %-7d %-7.2f " % (e.task.decode(), e.pid, e.ppid, e.tid, age), end="") if e.sig_info == 0: print("0" if e.exit_code == 0 else "code %d" % e.exit_code) diff --git a/tools/syscount.py b/tools/syscount.py index c832c08601ea..9ae027431aa4 100755 --- a/tools/syscount.py +++ b/tools/syscount.py @@ -76,7 +76,7 @@ def handle_errno(errstr): if args.list: for grp in izip_longest(*(iter(sorted(syscalls.values())),) * 4): - print(" ".join(["%-20s" % s for s in grp if s is not None])) + print(" ".join(["%-22s" % s.decode() for s in grp if s is not None])) sys.exit(0) text = """ diff --git a/tools/tcpdrop.py b/tools/tcpdrop.py index d64b57320536..ffa044df766d 100755 --- a/tools/tcpdrop.py +++ b/tools/tcpdrop.py @@ -16,6 +16,7 @@ # Licensed under the Apache License, Version 2.0 (the "License") # # 30-May-2018 Brendan Gregg Created this. 
+# 15-Jun-2022 Rong Tao Add tracepoint:skb:kfree_skb from __future__ import print_function from bcc import BPF @@ -100,7 +101,7 @@ #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #endif -int trace_tcp_drop(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) +static int __trace_tcp_drop(void *ctx, struct sock *sk, struct sk_buff *skb) { if (sk == NULL) return 0; @@ -154,6 +155,29 @@ return 0; } + +int trace_tcp_drop(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) +{ + return __trace_tcp_drop(ctx, sk, skb); +} +""" + +bpf_kfree_skb_text = """ +#include + +TRACEPOINT_PROBE(skb, kfree_skb) { + struct sk_buff *skb = args->skbaddr; + struct sock *sk = skb->sk; + enum skb_drop_reason reason = args->reason; + + // SKB_NOT_DROPPED_YET, + // SKB_DROP_REASON_NOT_SPECIFIED, + if (reason > SKB_DROP_REASON_NOT_SPECIFIED) { + return __trace_tcp_drop(args, sk, skb); + } + + return 0; +} """ if debug or args.ebpf: @@ -194,13 +218,22 @@ def print_ipv6_event(cpu, data, size): print("\t%s" % sym) print("") +if BPF.tracepoint_exists("skb", "kfree_skb"): + if BPF.kernel_struct_has_field("trace_event_raw_kfree_skb", "reason") == 1: + bpf_text += bpf_kfree_skb_text + # initialize BPF b = BPF(text=bpf_text) + if b.get_kprobe_functions(b"tcp_drop"): b.attach_kprobe(event="tcp_drop", fn_name="trace_tcp_drop") +elif b.tracepoint_exists("skb", "kfree_skb"): + print("WARNING: tcp_drop() kernel function not found or traceable. " + "Use tracepoint:skb:kfree_skb instead.") else: - print("ERROR: tcp_drop() kernel function not found or traceable. " - "The kernel might be too old or the the function has been inlined.") + print("ERROR: tcp_drop() kernel function and tracepoint:skb:kfree_skb" + " not found or traceable. 
" + "The kernel might be too old or the function has been inlined.") exit() stack_traces = b.get_table("stack_traces") diff --git a/tools/tcplife.py b/tools/tcplife.py index 780385b4513b..8485a5f56948 100755 --- a/tools/tcplife.py +++ b/tools/tcplife.py @@ -25,7 +25,7 @@ from __future__ import print_function from bcc import BPF import argparse -from socket import inet_ntop, ntohs, AF_INET, AF_INET6 +from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack from time import strftime @@ -191,13 +191,13 @@ FILTER_PID // get throughput stats. see tcp_get_info(). - u64 rx_b = 0, tx_b = 0, sport = 0; + u64 rx_b = 0, tx_b = 0; struct tcp_sock *tp = (struct tcp_sock *)sk; rx_b = tp->bytes_received; tx_b = tp->bytes_acked; u16 family = sk->__sk_common.skc_family; - + FILTER_FAMILY if (family == AF_INET) { @@ -318,12 +318,12 @@ if (mep != 0) pid = mep->pid; FILTER_PID - + u16 family = args->family; FILTER_FAMILY // get throughput stats. see tcp_get_info(). - u64 rx_b = 0, tx_b = 0, sport = 0; + u64 rx_b = 0, tx_b = 0; struct tcp_sock *tp = (struct tcp_sock *)sk; rx_b = tp->bytes_received; tx_b = tp->bytes_acked; diff --git a/tools/xfsslower.py b/tools/xfsslower.py index ef79a8947cb8..ef4b6b5b01fc 100755 --- a/tools/xfsslower.py +++ b/tools/xfsslower.py @@ -186,8 +186,11 @@ // populate output struct u32 size = PT_REGS_RC(ctx); - struct data_t data = {.type = type, .size = size, .delta_us = delta_us, - .pid = pid}; + struct data_t data = {}; + data.type = type; + data.size = size; + data.delta_us = delta_us; + data.pid = pid; data.ts_us = ts / 1000; data.offset = valp->offset; bpf_get_current_comm(&data.task, sizeof(data.task));