diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index a4af5992ed67..116dd3807774 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -69,7 +69,8 @@ BPF attached to cgroups as device controller | 4.15 | [`ebc614f68736`](https://g bpf2bpf function calls | 4.16 | [`cc8b0b92a169`](https://github.com/torvalds/linux/commit/cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d) BPF used for monitoring socket RX/TX data | 4.17 | [`4f738adba30a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4f738adba30a7cfc006f605707e7aee847ffefa0) BPF attached to raw tracepoints | 4.17 | [`c4f6699dfcb8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c4f6699dfcb8558d138fe838f741b2c10f416cf9) -BPF attached to `bind()` system call | 4.17 | [`4fbac77d2d09`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4fbac77d2d092b475dda9eea66da674369665427) +BPF attached to `bind()` system call | 4.17 | [`4fbac77d2d09`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4fbac77d2d092b475dda9eea66da674369665427) [`aac3fc320d94`](https://github.com/torvalds/linux/commit/aac3fc320d9404f2665a8b1249dc3170d5fa3caf) +BPF attached to `connect()` system call | 4.17 | [`d74bad4e74ee`](https://github.com/torvalds/linux/commit/d74bad4e74ee373787a9ae24197c17b7cdc428d5) BPF Type Format (BTF) | 4.18 | [`69b693f0aefa`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=69b693f0aefa0ed521e8bd02260523b5ae446ad7) AF_XDP | 4.18 | [`fbfc504a24f5`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fbfc504a24f53f7ebe128ab55cb5dba634f4ece8) bpfilter | 4.18 | [`d2ba09c17a06`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=d2ba09c17a0647f899d6c20a11bab9e6d3382f07) diff --git a/libbpf-tools/.gitignore b/libbpf-tools/.gitignore index 561f94ecef1c..251646397529 100644 --- a/libbpf-tools/.gitignore +++ b/libbpf-tools/.gitignore @@ 
-29,6 +29,7 @@ /llcstat /nfsdist /nfsslower +/mdflush /mountsnoop /numamove /offcputime diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile index e60ec409aa3f..e753230cee89 100644 --- a/libbpf-tools/Makefile +++ b/libbpf-tools/Makefile @@ -42,6 +42,7 @@ APPS = \ klockstat \ ksnoop \ llcstat \ + mdflush \ mountsnoop \ numamove \ offcputime \ @@ -79,6 +80,15 @@ COMMON_OBJ = \ $(if $(ENABLE_MIN_CORE_BTFS),$(OUTPUT)/min_core_btf_tar.o) \ # +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + .PHONY: all all: $(APPS) $(APP_ALIASES) @@ -91,6 +101,13 @@ msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; MAKEFLAGS += --no-print-directory endif +ifneq ($(EXTRA_CFLAGS),) +CFLAGS += $(EXTRA_CFLAGS) +endif +ifneq ($(EXTRA_LDFLAGS),) +LDFLAGS += $(EXTRA_LDFLAGS) +endif + .PHONY: clean clean: $(call msg,CLEAN) @@ -103,7 +120,7 @@ $(OUTPUT) $(OUTPUT)/libbpf: .PHONY: bpftool bpftool: $(Q)mkdir -p $(OUTPUT)/bpftool - $(Q)$(MAKE) OUTPUT=$(OUTPUT)/bpftool/ -C $(BPFTOOL_SRC) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(OUTPUT)/bpftool/ -C $(BPFTOOL_SRC) $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) $(COMMON_OBJ) | $(OUTPUT) $(call msg,BINARY,$@) diff --git a/libbpf-tools/biolatency.bpf.c b/libbpf-tools/biolatency.bpf.c index b9e87c393494..4d59d5f8db4d 100644 --- a/libbpf-tools/biolatency.bpf.c +++ b/libbpf-tools/biolatency.bpf.c @@ -35,7 +35,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct request *); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } start SEC(".maps"); static struct hist initial_hist; @@ -45,7 +44,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct hist_key); __type(value, struct hist); - __uint(map_flags, BPF_F_NO_PREALLOC); } hists SEC(".maps"); static __always_inline diff --git 
a/libbpf-tools/biopattern.bpf.c b/libbpf-tools/biopattern.bpf.c index bf051bc320a8..2f099be77ee8 100644 --- a/libbpf-tools/biopattern.bpf.c +++ b/libbpf-tools/biopattern.bpf.c @@ -14,7 +14,6 @@ struct { __uint(max_entries, 64); __type(key, u32); __type(value, struct counter); - __uint(map_flags, BPF_F_NO_PREALLOC); } counters SEC(".maps"); SEC("tracepoint/block/block_rq_complete") diff --git a/libbpf-tools/biosnoop.bpf.c b/libbpf-tools/biosnoop.bpf.c index a29af98de455..b7e711e059ec 100644 --- a/libbpf-tools/biosnoop.bpf.c +++ b/libbpf-tools/biosnoop.bpf.c @@ -36,7 +36,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct request *); __type(value, struct piddata); - __uint(map_flags, BPF_F_NO_PREALLOC); } infobyreq SEC(".maps"); struct stage { diff --git a/libbpf-tools/biosnoop.c b/libbpf-tools/biosnoop.c index f0f665a6db53..988d82566ecf 100644 --- a/libbpf-tools/biosnoop.c +++ b/libbpf-tools/biosnoop.c @@ -167,7 +167,7 @@ void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) start_ts = e->ts; blk_fill_rwbs(rwbs, e->cmd_flags); partition = partitions__get_by_dev(partitions, e->dev); - printf("%-11.6f %-14.14s %-6d %-7s %-4s %-10lld %-7d ", + printf("%-11.6f %-14.14s %-7d %-7s %-4s %-10lld %-7d ", (e->ts - start_ts) / 1000000000.0, e->comm, e->pid, partition ? 
partition->name : "Unknown", rwbs, e->sector, e->len); @@ -230,6 +230,13 @@ int main(int argc, char **argv) obj->rodata->targ_queued = env.queued; obj->rodata->filter_cg = env.cg; + if (fentry_can_attach("blk_account_io_start", NULL)) + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + "blk_account_io_start"); + else + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + "__blk_account_io_start"); + err = biosnoop_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); @@ -304,7 +311,7 @@ int main(int argc, char **argv) goto cleanup; } - printf("%-11s %-14s %-6s %-7s %-4s %-10s %-7s ", + printf("%-11s %-14s %-7s %-7s %-4s %-10s %-7s ", "TIME(s)", "COMM", "PID", "DISK", "T", "SECTOR", "BYTES"); if (env.queued) printf("%7s ", "QUE(ms)"); diff --git a/libbpf-tools/biostacks.bpf.c b/libbpf-tools/biostacks.bpf.c index c13975fa6c4c..dd9fec1c8dee 100644 --- a/libbpf-tools/biostacks.bpf.c +++ b/libbpf-tools/biostacks.bpf.c @@ -28,7 +28,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct request *); __type(value, struct internal_rqinfo); - __uint(map_flags, BPF_F_NO_PREALLOC); } rqinfos SEC(".maps"); struct { @@ -36,7 +35,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct rqinfo); __type(value, struct hist); - __uint(map_flags, BPF_F_NO_PREALLOC); } hists SEC(".maps"); static struct hist zero; diff --git a/libbpf-tools/biostacks.c b/libbpf-tools/biostacks.c index 260bc235eed6..2a25869dc461 100644 --- a/libbpf-tools/biostacks.c +++ b/libbpf-tools/biostacks.c @@ -173,6 +173,18 @@ int main(int argc, char **argv) obj->rodata->targ_ms = env.milliseconds; + if (fentry_can_attach("blk_account_io_start", NULL)) { + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + "blk_account_io_start"); + bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0, + "blk_account_io_done"); + } else { + bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, + 
"__blk_account_io_start"); + bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0, + "__blk_account_io_done"); + } + err = biostacks_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); diff --git a/libbpf-tools/bitesize.bpf.c b/libbpf-tools/bitesize.bpf.c index 46e9c48b85da..a246f635c11d 100644 --- a/libbpf-tools/bitesize.bpf.c +++ b/libbpf-tools/bitesize.bpf.c @@ -22,7 +22,6 @@ struct { __uint(max_entries, 10240); __type(key, struct hist_key); __type(value, struct hist); - __uint(map_flags, BPF_F_NO_PREALLOC); } hists SEC(".maps"); static struct hist initial_hist; diff --git a/libbpf-tools/core_fixes.bpf.h b/libbpf-tools/core_fixes.bpf.h index 762445e88587..33a4f7f78311 100644 --- a/libbpf-tools/core_fixes.bpf.h +++ b/libbpf-tools/core_fixes.bpf.h @@ -15,15 +15,34 @@ */ struct task_struct___x { unsigned int __state; -}; +} __attribute__((preserve_access_index)); -static __s64 get_task_state(void *task) +/** + * commit 309dca309fc3 ("block: store a block_device pointer in struct bio") + * adds a new member bi_bdev which is a pointer to struct block_device + * see: + * https://github.com/torvalds/linux/commit/309dca309fc3 + */ +struct bio___x { + struct block_device *bi_bdev; +} __attribute__((preserve_access_index)); + +static __always_inline __s64 get_task_state(void *task) { struct task_struct___x *t = task; if (bpf_core_field_exists(t->__state)) - return t->__state; - return ((struct task_struct *)task)->state; + return BPF_CORE_READ(t, __state); + return BPF_CORE_READ((struct task_struct *)task, state); +} + +static __always_inline struct gendisk *get_gendisk(void *bio) +{ + struct bio___x *b = bio; + + if (bpf_core_field_exists(b->bi_bdev)) + return BPF_CORE_READ(b, bi_bdev, bd_disk); + return BPF_CORE_READ((struct bio *)bio, bi_disk); } #endif /* __CORE_FIXES_BPF_H */ diff --git a/libbpf-tools/klockstat.bpf.c b/libbpf-tools/klockstat.bpf.c index b8483d91c026..26371c6846c4 100644 --- a/libbpf-tools/klockstat.bpf.c 
+++ b/libbpf-tools/klockstat.bpf.c @@ -14,6 +14,7 @@ const volatile pid_t targ_tgid = 0; const volatile pid_t targ_pid = 0; void *const volatile targ_lock = NULL; +const volatile int per_thread = 0; struct { __uint(type, BPF_MAP_TYPE_STACK_TRACE); @@ -147,6 +148,10 @@ static void account(struct lockholder_info *li) { struct lock_stat *ls; u64 delta; + u32 key = li->stack_id; + + if (per_thread) + key = li->task_id; /* * Multiple threads may have the same stack_id. Even though we are @@ -155,15 +160,19 @@ static void account(struct lockholder_info *li) * by multiple readers at the same time. They will be accounted as * the same lock, which is what we want, but we need to use atomics to * avoid corruption, especially for the total_time variables. + * But it should be ok for per-thread since it's not racy anymore. */ - ls = bpf_map_lookup_elem(&stat_map, &li->stack_id); + ls = bpf_map_lookup_elem(&stat_map, &key); if (!ls) { struct lock_stat fresh = {0}; - bpf_map_update_elem(&stat_map, &li->stack_id, &fresh, BPF_ANY); - ls = bpf_map_lookup_elem(&stat_map, &li->stack_id); + bpf_map_update_elem(&stat_map, &key, &fresh, BPF_ANY); + ls = bpf_map_lookup_elem(&stat_map, &key); if (!ls) return; + + if (per_thread) + bpf_get_current_comm(ls->acq_max_comm, TASK_COMM_LEN); } delta = li->acq_at - li->try_at; @@ -176,7 +185,8 @@ static void account(struct lockholder_info *li) * Potentially racy, if multiple threads think they are the max, * so you may get a clobbered write. 
*/ - bpf_get_current_comm(ls->acq_max_comm, TASK_COMM_LEN); + if (!per_thread) + bpf_get_current_comm(ls->acq_max_comm, TASK_COMM_LEN); } delta = li->rel_at - li->acq_at; @@ -185,7 +195,8 @@ static void account(struct lockholder_info *li) if (delta > READ_ONCE(ls->hld_max_time)) { WRITE_ONCE(ls->hld_max_time, delta); WRITE_ONCE(ls->hld_max_id, li->task_id); - bpf_get_current_comm(ls->hld_max_comm, TASK_COMM_LEN); + if (!per_thread) + bpf_get_current_comm(ls->hld_max_comm, TASK_COMM_LEN); } } diff --git a/libbpf-tools/klockstat.c b/libbpf-tools/klockstat.c index 4c733a90bb55..6b5f377f96ea 100644 --- a/libbpf-tools/klockstat.c +++ b/libbpf-tools/klockstat.c @@ -55,6 +55,7 @@ static struct prog_env { bool reset; bool timestamp; bool verbose; + bool per_thread; } env = { .nr_locks = 99999999, .nr_stack_entries = 1, @@ -71,7 +72,7 @@ static const char args_doc[] = "FUNCTION"; static const char program_doc[] = "Trace mutex/sem lock acquisition and hold times, in nsec\n" "\n" -"Usage: klockstat [-hRTv] [-p PID] [-t TID] [-c FUNC] [-L LOCK] [-n NR_LOCKS]\n" +"Usage: klockstat [-hPRTv] [-p PID] [-t TID] [-c FUNC] [-L LOCK] [-n NR_LOCKS]\n" " [-s NR_STACKS] [-S SORT] [-d DURATION] [-i INTERVAL]\n" "\v" "Examples:\n" @@ -86,8 +87,9 @@ static const char program_doc[] = " klockstat -S acq_count # sort lock acquired results by acquire count\n" " klockstat -S hld_total # sort lock held results by total held time\n" " klockstat -S acq_count,hld_total # combination of above\n" -" klockstat -n 3 # display top 3 locks\n" +" klockstat -n 3 # display top 3 locks/threads\n" " klockstat -s 6 # display 6 stack entries per lock\n" +" klockstat -P # print stats per thread\n" ; static const struct argp_option opts[] = { @@ -97,7 +99,7 @@ static const struct argp_option opts[] = { { "caller", 'c', "FUNC", 0, "Filter by caller string prefix" }, { "lock", 'L', "LOCK", 0, "Filter by specific ksym lock name" }, { 0, 0, 0, 0, "" }, - { "locks", 'n', "NR_LOCKS", 0, "Number of locks to print" }, + { 
"locks", 'n', "NR_LOCKS", 0, "Number of locks or threads to print" }, { "stacks", 's', "NR_STACKS", 0, "Number of stack entries to print per lock" }, { "sort", 'S', "SORT", 0, "Sort by field:\n acq_[max|total|count]\n hld_[max|total|count]" }, { 0, 0, 0, 0, "" }, @@ -106,6 +108,7 @@ static const struct argp_option opts[] = { { "reset", 'R', NULL, 0, "Reset stats each interval" }, { "timestamp", 'T', NULL, 0, "Print timestamp" }, { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { "per-thread", 'P', NULL, 0, "Print per-thread stats" }, { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, {}, @@ -229,6 +232,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'T': env->timestamp = true; break; + case 'P': + env->per_thread = true; + break; case 'h': argp_state_help(state, stderr, ARGP_HELP_STD_HELP); break; @@ -241,6 +247,10 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) env->interval = env->duration; env->iterations = env->duration / env->interval; } + if (env->per_thread && env->nr_stack_entries != 1) { + warn("--per-thread and --stacks cannot be used together\n"); + argp_usage(state); + } break; default: return ARGP_ERR_UNKNOWN; @@ -327,62 +337,140 @@ static char *symname(struct ksyms *ksyms, uint64_t pc, char *buf, size_t n) return buf; } +static char *print_caller(char *buf, int size, struct stack_stat *ss) +{ + snprintf(buf, size, "%u %16s", ss->stack_id, ss->ls.acq_max_comm); + return buf; +} + +static char *print_time(char *buf, int size, uint64_t nsec) +{ + struct { + float base; + char *unit; + } table[] = { + { 1e9 * 3600, "h " }, + { 1e9 * 60, "m " }, + { 1e9, "s " }, + { 1e6, "ms" }, + { 1e3, "us" }, + { 0, NULL }, + }; + + for (int i = 0; table[i].base; i++) { + if (nsec < table[i].base) + continue; + + snprintf(buf, size, "%.1f %s", nsec / table[i].base, table[i].unit); + return buf; + } + + snprintf(buf, size, "%u ns", (unsigned)nsec); + return buf; +} + static void 
print_acq_header(void) { - printf("\n Caller Avg Wait Count Max Wait Total Wait\n"); + if (env.per_thread) + printf("\n Tid Comm"); + else + printf("\n Caller"); + + printf(" Avg Wait Count Max Wait Total Wait\n"); } static void print_acq_stat(struct ksyms *ksyms, struct stack_stat *ss, int nr_stack_entries) { char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; int i; - printf("%37s %9llu %8llu %10llu %12llu\n", + printf("%37s %9s %8llu %10s %12s\n", symname(ksyms, ss->bt[0], buf, sizeof(buf)), - ss->ls.acq_total_time / ss->ls.acq_count, + print_time(avg, sizeof(avg), ss->ls.acq_total_time / ss->ls.acq_count), ss->ls.acq_count, - ss->ls.acq_max_time, - ss->ls.acq_total_time); + print_time(max, sizeof(max), ss->ls.acq_max_time), + print_time(tot, sizeof(tot), ss->ls.acq_total_time)); for (i = 1; i < nr_stack_entries; i++) { - if (!ss->bt[i]) + if (!ss->bt[i] || env.per_thread) break; printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf))); } - if (nr_stack_entries > 1) + if (nr_stack_entries > 1 && !env.per_thread) printf(" Max PID %llu, COMM %s\n", ss->ls.acq_max_id >> 32, ss->ls.acq_max_comm); } +static void print_acq_task(struct stack_stat *ss) +{ + char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; + + printf("%37s %9s %8llu %10s %12s\n", + print_caller(buf, sizeof(buf), ss), + print_time(avg, sizeof(avg), ss->ls.acq_total_time / ss->ls.acq_count), + ss->ls.acq_count, + print_time(max, sizeof(max), ss->ls.acq_max_time), + print_time(tot, sizeof(tot), ss->ls.acq_total_time)); +} + static void print_hld_header(void) { - printf("\n Caller Avg Hold Count Max Hold Total Hold\n"); + if (env.per_thread) + printf("\n Tid Comm"); + else + printf("\n Caller"); + + printf(" Avg Hold Count Max Hold Total Hold\n"); } static void print_hld_stat(struct ksyms *ksyms, struct stack_stat *ss, int nr_stack_entries) { char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; int i; - printf("%37s %9llu %8llu %10llu %12llu\n", + printf("%37s %9s %8llu 
%10s %12s\n", symname(ksyms, ss->bt[0], buf, sizeof(buf)), - ss->ls.hld_total_time / ss->ls.hld_count, + print_time(avg, sizeof(avg), ss->ls.hld_total_time / ss->ls.hld_count), ss->ls.hld_count, - ss->ls.hld_max_time, - ss->ls.hld_total_time); + print_time(max, sizeof(max), ss->ls.hld_max_time), + print_time(tot, sizeof(tot), ss->ls.hld_total_time)); for (i = 1; i < nr_stack_entries; i++) { - if (!ss->bt[i]) + if (!ss->bt[i] || env.per_thread) break; printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf))); } - if (nr_stack_entries > 1) + if (nr_stack_entries > 1 && !env.per_thread) printf(" Max PID %llu, COMM %s\n", ss->ls.hld_max_id >> 32, ss->ls.hld_max_comm); } +static void print_hld_task(struct stack_stat *ss) +{ + char buf[40]; + char avg[40]; + char max[40]; + char tot[40]; + + printf("%37s %9s %8llu %10s %12s\n", + print_caller(buf, sizeof(buf), ss), + print_time(avg, sizeof(avg), ss->ls.hld_total_time / ss->ls.hld_count), + ss->ls.hld_count, + print_time(max, sizeof(max), ss->ls.hld_max_time), + print_time(tot, sizeof(tot), ss->ls.hld_total_time)); +} + static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map) { struct stack_stat **stats, *ss; @@ -391,6 +479,7 @@ static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map) uint32_t lookup_key = 0; uint32_t stack_id; int ret, i; + int nr_stack_entries; stats = calloc(stats_sz, sizeof(void *)); if (!stats) { @@ -426,31 +515,39 @@ static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map) free(ss); continue; } - if (bpf_map_lookup_elem(stack_map, &stack_id, &ss->bt)) { + if (!env.per_thread && bpf_map_lookup_elem(stack_map, &stack_id, &ss->bt)) { /* Can still report the results without a backtrace. 
*/ warn("failed to lookup stack_id %u\n", stack_id); } - if (!caller_is_traced(ksyms, ss->bt[0])) { + if (!env.per_thread && !caller_is_traced(ksyms, ss->bt[0])) { free(ss); continue; } stats[stat_idx++] = ss; } + nr_stack_entries = MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH); + qsort(stats, stat_idx, sizeof(void*), sort_by_acq); for (i = 0; i < MIN(env.nr_locks, stat_idx); i++) { if (i == 0 || env.nr_stack_entries > 1) print_acq_header(); - print_acq_stat(ksyms, stats[i], - MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH)); + + if (env.per_thread) + print_acq_task(stats[i]); + else + print_acq_stat(ksyms, stats[i], nr_stack_entries); } qsort(stats, stat_idx, sizeof(void*), sort_by_hld); for (i = 0; i < MIN(env.nr_locks, stat_idx); i++) { if (i == 0 || env.nr_stack_entries > 1) print_hld_header(); - print_hld_stat(ksyms, stats[i], - MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH)); + + if (env.per_thread) + print_hld_task(stats[i]); + else + print_hld_stat(ksyms, stats[i], nr_stack_entries); } for (i = 0; i < stat_idx; i++) @@ -533,6 +630,7 @@ int main(int argc, char **argv) obj->rodata->targ_tgid = env.pid; obj->rodata->targ_pid = env.tid; obj->rodata->targ_lock = lock_addr; + obj->rodata->per_thread = env.per_thread; if (fentry_can_attach("mutex_lock_nested", NULL)) { bpf_program__set_attach_target(obj->progs.mutex_lock, 0, @@ -598,6 +696,7 @@ int main(int argc, char **argv) warn("print_stats error, aborting.\n"); break; } + fflush(stdout); } printf("Exiting trace of mutex/sem locks\n"); diff --git a/libbpf-tools/mdflush.bpf.c b/libbpf-tools/mdflush.bpf.c new file mode 100644 index 000000000000..8eac536a2fec --- /dev/null +++ b/libbpf-tools/mdflush.bpf.c @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021~2022 Hengqi Chen */ +#include +#include +#include +#include +#include "core_fixes.bpf.h" +#include "mdflush.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, __u32); + __type(value, 
__u32); +} events SEC(".maps"); + +SEC("fentry/md_flush_request") +int BPF_PROG(md_flush_request, void *mddev, void *bio) +{ + __u64 pid = bpf_get_current_pid_tgid() >> 32; + struct event event = {}; + struct gendisk *gendisk; + + event.pid = pid; + gendisk = get_gendisk(bio); + BPF_CORE_READ_STR_INTO(event.disk, gendisk, disk_name); + bpf_get_current_comm(event.comm, sizeof(event.comm)); + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/libbpf-tools/mdflush.c b/libbpf-tools/mdflush.c new file mode 100644 index 000000000000..0f23a0a72395 --- /dev/null +++ b/libbpf-tools/mdflush.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * mdflush Trace md flush events. + * + * Copyright (c) 2021~2022 Hengqi Chen + * + * Based on mdflush(8) from BCC by Brendan Gregg. + * 08-Nov-2021 Hengqi Chen Created this. + */ +#include <argp.h> +#include <signal.h> +#include <stdio.h> +#include <string.h> +#include <time.h> + +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "mdflush.h" +#include "mdflush.skel.h" +#include "trace_helpers.h" + +#define PERF_BUFFER_PAGES 16 +#define PERF_POLL_TIMEOUT_MS 100 +#define warn(...)
fprintf(stderr, __VA_ARGS__) + +static volatile sig_atomic_t exiting = 0; +static bool verbose = false; + +const char *argp_program_version = "mdflush 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; +const char argp_program_doc[] = +"Trace md flush events.\n" +"\n" +"USAGE: mdflush\n"; + +static const struct argp_option opts[] = { + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + verbose = true; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static void sig_int(int signo) +{ + exiting = 1; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + struct event *e = data; + time_t t; + struct tm *tm; + char ts[32]; + + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%-8s %-7d %-16s %-s\n", + ts, e->pid, e->comm, e->disk); +} + +static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + warn("lost %llu events on CPU #%d\n", lost_cnt, cpu); +} + +int main(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + struct perf_buffer *pb = NULL; + struct mdflush_bpf *obj; + int err; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + + obj = mdflush_bpf__open_and_load(); + if (!obj) { + warn("failed to open/load BPF object\n"); + return 1; + } + + err = mdflush_bpf__attach(obj); + 
if (err) { + warn("failed to attach BPF programs: %d\n", err); + goto cleanup; + } + + pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, + handle_event, handle_lost_events, NULL, NULL); + if (!pb) { + warn("failed to open perf buffer: %d\n", err); + goto cleanup; + } + + if (signal(SIGINT, sig_int) == SIG_ERR) { + warn("can't set signal handler: %s\n", strerror(errno)); + err = 1; + goto cleanup; + } + + printf("Tracing md flush requests... Hit Ctrl-C to end.\n"); + printf("%-8s %-7s %-16s %-s\n", + "TIME", "PID", "COMM", "DEVICE"); + + while (!exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + warn("error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; + } + /* reset err to return 0 if exiting */ + err = 0; + } + +cleanup: + perf_buffer__free(pb); + mdflush_bpf__destroy(obj); + + return err != 0; +} diff --git a/libbpf-tools/mdflush.h b/libbpf-tools/mdflush.h new file mode 100644 index 000000000000..18cd723a7d04 --- /dev/null +++ b/libbpf-tools/mdflush.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021~2022 Hengqi Chen */ +#ifndef __MDFLUSH_H +#define __MDFLUSH_H + +#define TASK_COMM_LEN 16 +#define DISK_NAME_LEN 32 + +struct event { + __u32 pid; + char comm[TASK_COMM_LEN]; + char disk[DISK_NAME_LEN]; +}; + +#endif /* __MDFLUSH_H */ diff --git a/libbpf-tools/numamove.bpf.c b/libbpf-tools/numamove.bpf.c index 69d8d5f90719..62c2d714e489 100644 --- a/libbpf-tools/numamove.bpf.c +++ b/libbpf-tools/numamove.bpf.c @@ -9,7 +9,6 @@ struct { __uint(max_entries, 10240); __type(key, u32); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } start SEC(".maps"); __u64 latency = 0; diff --git a/libbpf-tools/readahead.bpf.c b/libbpf-tools/readahead.bpf.c index b9423c3f9a49..89863e67d60d 100644 --- a/libbpf-tools/readahead.bpf.c +++ b/libbpf-tools/readahead.bpf.c @@ -13,7 +13,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, 
u32); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } in_readahead SEC(".maps"); struct { @@ -21,7 +20,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct page *); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } birth SEC(".maps"); struct hist hist = {}; diff --git a/libbpf-tools/syscount.bpf.c b/libbpf-tools/syscount.bpf.c index d6a98323df23..6209feeaa023 100644 --- a/libbpf-tools/syscount.bpf.c +++ b/libbpf-tools/syscount.bpf.c @@ -28,7 +28,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, u32); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } start SEC(".maps"); struct { @@ -36,7 +35,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, u32); __type(value, struct data_t); - __uint(map_flags, BPF_F_NO_PREALLOC); } data SEC(".maps"); static __always_inline diff --git a/libbpf-tools/tcpconnect.bpf.c b/libbpf-tools/tcpconnect.bpf.c index 7ee8a301c262..a13d48c239f2 100644 --- a/libbpf-tools/tcpconnect.bpf.c +++ b/libbpf-tools/tcpconnect.bpf.c @@ -11,7 +11,7 @@ #include "maps.bpf.h" #include "tcpconnect.h" -SEC(".rodata") int filter_ports[MAX_PORTS]; +const volatile int filter_ports[MAX_PORTS]; const volatile int filter_ports_len = 0; const volatile uid_t filter_uid = -1; const volatile pid_t filter_pid = 0; @@ -26,7 +26,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, u32); __type(value, struct sock *); - __uint(map_flags, BPF_F_NO_PREALLOC); } sockets SEC(".maps"); struct { @@ -34,7 +33,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct ipv4_flow_key); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } ipv4_count SEC(".maps"); struct { @@ -42,7 +40,6 @@ struct { __uint(max_entries, MAX_ENTRIES); __type(key, struct ipv6_flow_key); __type(value, u64); - __uint(map_flags, BPF_F_NO_PREALLOC); } ipv6_count SEC(".maps"); struct { diff --git a/man/man8/biolatency.8 b/man/man8/biolatency.8 index c13f6c8ad036..db2ef48429e9 100644 --- a/man/man8/biolatency.8 +++ 
b/man/man8/biolatency.8 @@ -2,7 +2,7 @@ .SH NAME biolatency \- Summarize block device I/O latency as a histogram. .SH SYNOPSIS -.B biolatency [\-h] [\-F] [\-T] [\-Q] [\-m] [\-D] [\-e] [interval [count]] +.B biolatency [\-h] [\-T] [\-Q] [\-m] [\-D] [\-F] [\-e] [\-j] [\-d DISK] [interval [count]] .SH DESCRIPTION biolatency traces block device I/O (disk I/O), and records the distribution of I/O latency (time). This is printed as a histogram either on Ctrl-C, or @@ -42,6 +42,9 @@ Print a histogram dictionary \-e Show extension summary(total, average) .TP +\-d DISK +Trace this disk only +.TP interval Output interval, in seconds. .TP @@ -108,6 +111,6 @@ Linux .SH STABILITY Unstable - in development. .SH AUTHOR -Brendan Gregg +Brendan Gregg, Rocky Xing .SH SEE ALSO biosnoop(8) diff --git a/man/man8/funcinterval.8 b/man/man8/funcinterval.8 old mode 100755 new mode 100644 index 8a60399871e7..77128290b04b --- a/man/man8/funcinterval.8 +++ b/man/man8/funcinterval.8 @@ -8,7 +8,7 @@ This tool times interval between the same function as a histogram. eBPF/bcc is very suitable for platform performance tuning. By funclatency, we can profile specific functions to know how latency -this function costs. However, sometimes performance drop is not about the +this function costs. However, sometimes performance drop is not about the latency of function but the interval between function calls. funcinterval is born for this purpose.
diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index 4e8ff10429ed..1ff33c8a79a1 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -404,7 +404,7 @@ int BPFModule::create_maps(std::map> &map_tids, } if (pinned_id <= 0) { - struct bpf_create_map_attr attr = {}; + struct bcc_create_map_attr attr = {}; attr.map_type = (enum bpf_map_type)map_type; attr.name = map_name; attr.key_size = key_size; diff --git a/src/cc/bpf_module_rw_engine.cc b/src/cc/bpf_module_rw_engine.cc index 6e0fcb74badd..52c877e4d657 100644 --- a/src/cc/bpf_module_rw_engine.cc +++ b/src/cc/bpf_module_rw_engine.cc @@ -410,7 +410,7 @@ int BPFModule::annotate() { table_names_[table.name] = id++; GlobalValue *gvar = mod_->getNamedValue(table.name); if (!gvar) continue; -#if LLVM_MAJOR_VERSION >= 15 +#if LLVM_MAJOR_VERSION >= 14 { Type *t = gvar->getValueType(); StructType *st = dyn_cast(t); diff --git a/src/cc/libbpf b/src/cc/libbpf index 86eb09863c1c..4cb682229d0c 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit 86eb09863c1c0177e99c2c703092042d3cdba910 +Subproject commit 4cb682229d0ca9ef32fe191f00b5ce31fd050a66 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 467e06f979b1..d3ee8ca5d77f 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -307,7 +307,7 @@ static uint64_t ptr_to_u64(void *ptr) return (uint64_t) (unsigned long) ptr; } -static int libbpf_bpf_map_create(struct bpf_create_map_attr *create_attr) +static int libbpf_bpf_map_create(struct bcc_create_map_attr *create_attr) { LIBBPF_OPTS(bpf_map_create_opts, p); @@ -326,7 +326,7 @@ static int libbpf_bpf_map_create(struct bpf_create_map_attr *create_attr) create_attr->value_size, create_attr->max_entries, &p); } -int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit) +int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit) { unsigned name_len = attr->name ? 
strlen(attr->name) : 0; char map_name[BPF_OBJ_NAME_LEN] = {}; @@ -383,7 +383,7 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags) { - struct bpf_create_map_attr attr = {}; + struct bcc_create_map_attr attr = {}; attr.map_type = map_type; attr.name = name; diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index e001d740f323..c5ea40a505a0 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -27,7 +27,23 @@ extern "C" { #endif -struct bpf_create_map_attr; +struct bcc_create_map_attr { + const char *name; + enum bpf_map_type map_type; + __u32 map_flags; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 numa_node; + __u32 btf_fd; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 map_ifindex; + union { + __u32 inner_map_fd; + __u32 btf_vmlinux_value_type_id; + }; +}; struct bpf_load_program_attr; enum bpf_probe_attach_type { @@ -44,7 +60,7 @@ struct bcc_perf_buffer_opts { int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags); -int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit); +int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index c9f1030539b9..55c968aedc5a 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -828,7 +828,8 @@ def attach_kprobe(self, event=b"", event_off=0, fn_name=b"", event_re=b""): failed += 1 probes.append(line) if failed == len(matches): - raise Exception("Failed to attach BPF program %s to kprobe %s" % + raise Exception("Failed to attach BPF program %s to kprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, 
'/'.join(probes))) return @@ -837,7 +838,8 @@ def attach_kprobe(self, event=b"", event_off=0, fn_name=b"", event_re=b""): ev_name = b"p_" + event.replace(b"+", b"_").replace(b".", b"_") fd = lib.bpf_attach_kprobe(fn.fd, 0, ev_name, event, event_off, 0) if fd < 0: - raise Exception("Failed to attach BPF program %s to kprobe %s" % + raise Exception("Failed to attach BPF program %s to kprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, event)) self._add_kprobe_fd(ev_name, fn_name, fd) return self @@ -860,7 +862,8 @@ def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b"", maxactive=0): failed += 1 probes.append(line) if failed == len(matches): - raise Exception("Failed to attach BPF program %s to kretprobe %s" % + raise Exception("Failed to attach BPF program %s to kretprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, '/'.join(probes))) return @@ -869,7 +872,8 @@ def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b"", maxactive=0): ev_name = b"r_" + event.replace(b"+", b"_").replace(b".", b"_") fd = lib.bpf_attach_kprobe(fn.fd, 1, ev_name, event, 0, maxactive) if fd < 0: - raise Exception("Failed to attach BPF program %s to kretprobe %s" % + raise Exception("Failed to attach BPF program %s to kretprobe %s" + ", it's not traceable (either non-existing, inlined, or marked as \"notrace\")" % (fn_name, event)) self._add_kprobe_fd(ev_name, fn_name, fd) return self @@ -1736,6 +1740,20 @@ def add_module(modname): def donothing(self): """the do nothing exit handler""" + + def close(self): + """close(self) + + Closes all associated files descriptors. Attached BPF programs are not + detached. 
+ """ + for name, fn in list(self.funcs.items()): + os.close(fn.fd) + del self.funcs[name] + if self.module: + lib.bpf_module_destroy(self.module) + self.module = None + def cleanup(self): # Clean up opened probes for k, v in list(self.kprobe_fds.items()): @@ -1763,12 +1781,8 @@ def cleanup(self): if self.tracefile: self.tracefile.close() self.tracefile = None - for name, fn in list(self.funcs.items()): - os.close(fn.fd) - del self.funcs[name] - if self.module: - lib.bpf_module_destroy(self.module) - self.module = None + + self.close() # Clean up ringbuf if self._ringbuf_manager: diff --git a/tools/biolatency.py b/tools/biolatency.py index 9ece05025912..6f7719054d0f 100755 --- a/tools/biolatency.py +++ b/tools/biolatency.py @@ -4,18 +4,20 @@ # biolatency Summarize block device I/O latency as a histogram. # For Linux, uses BCC, eBPF. # -# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [interval] [count] +# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [-d DISK] [interval] [count] # # Copyright (c) 2015 Brendan Gregg. # Licensed under the Apache License, Version 2.0 (the "License") # # 20-Sep-2015 Brendan Gregg Created this. +# 31-Mar-2022 Rocky Xing Added disk filter support. 
from __future__ import print_function from bcc import BPF from time import sleep, strftime import argparse import ctypes as ct +import os # arguments examples = """examples: @@ -27,6 +29,7 @@ ./biolatency -F # show I/O flags separately ./biolatency -j # print a dictionary ./biolatency -e # show extension summary(total, average) + ./biolatency -d sdc # Trace sdc only """ parser = argparse.ArgumentParser( description="Summarize block device I/O latency as a histogram", @@ -52,6 +55,8 @@ help=argparse.SUPPRESS) parser.add_argument("-j", "--json", action="store_true", help="json output") +parser.add_argument("-d", "--disk", type=str, + help="Trace this disk only") args = parser.parse_args() countdown = int(args.count) @@ -87,6 +92,8 @@ // time block I/O int trace_req_start(struct pt_regs *ctx, struct request *req) { + DISK_FILTER + u64 ts = bpf_ktime_get_ns(); start.update(&req, &ts); return 0; @@ -149,6 +156,33 @@ storage_str += "BPF_HISTOGRAM(dist);" store_str += "dist.atomic_increment(bpf_log2l(delta));" +if args.disk is not None: + disk_path = os.path.join('/dev', args.disk) + if not os.path.exists(disk_path): + print("no such disk '%s'" % args.disk) + exit(1) + + stat_info = os.stat(disk_path) + major = os.major(stat_info.st_rdev) + minor = os.minor(stat_info.st_rdev) + + disk_field_str = "" + if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1: + disk_field_str = 'req->rq_disk' + else: + disk_field_str = 'req->q->disk' + + disk_filter_str = """ + struct gendisk *disk = %s; + if (!(disk->major == %d && disk->first_minor == %d)) { + return 0; + } + """ % (disk_field_str, major, minor) + + bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str) +else: + bpf_text = bpf_text.replace('DISK_FILTER', '') + if args.extension: storage_str += "BPF_ARRAY(extension, ext_val_t, 1);" bpf_text = bpf_text.replace('EXTENSION', """ diff --git a/tools/biolatency_example.txt b/tools/biolatency_example.txt index a88136b8a655..1bc8f591d405 100644 --- 
a/tools/biolatency_example.txt +++ b/tools/biolatency_example.txt @@ -352,24 +352,25 @@ The -j with -m prints a millisecond histogram dictionary. The `value_type` key i USAGE message: # ./biolatency -h -usage: biolatency.py [-h] [-T] [-Q] [-m] [-D] [-F] [-j] - [interval] [count] +usage: biolatency.py [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [-d DISK] + [interval] [count] Summarize block device I/O latency as a histogram positional arguments: - interval output interval, in seconds - count number of outputs + interval output interval, in seconds + count number of outputs optional arguments: - -h, --help show this help message and exit - -T, --timestamp include timestamp on output - -Q, --queued include OS queued time in I/O time - -m, --milliseconds millisecond histogram - -D, --disks print a histogram per disk device - -F, --flags print a histogram per set of I/O flags - -e, --extension also show extension summary(total, average) - -j, --json json output + -h, --help show this help message and exit + -T, --timestamp include timestamp on output + -Q, --queued include OS queued time in I/O time + -m, --milliseconds millisecond histogram + -D, --disks print a histogram per disk device + -F, --flags print a histogram per set of I/O flags + -e, --extension summarize average/total value + -j, --json json output + -d DISK, --disk DISK Trace this disk only examples: ./biolatency # summarize block I/O latency as a histogram @@ -380,3 +381,4 @@ examples: ./biolatency -F # show I/O flags separately ./biolatency -j # print a dictionary ./biolatency -e # show extension summary(total, average) + ./biolatency -d sdc # Trace sdc only diff --git a/tools/exitsnoop.py b/tools/exitsnoop.py index 42606bc6cd95..8b4947467c95 100755 --- a/tools/exitsnoop.py +++ b/tools/exitsnoop.py @@ -107,7 +107,7 @@ def _embedded_c(args): data.exit_time = bpf_ktime_get_ns(), data.pid = task->tgid, data.tid = task->pid, - data.ppid = task->parent->tgid, + data.ppid = task->real_parent->tgid, data.exit_code 
= task->exit_code >> 8, data.sig_info = task->exit_code & 0xFF, bpf_get_current_comm(&data.task, sizeof(data.task)); @@ -151,7 +151,7 @@ def _print_header(): print("%-13s" % title, end="") if Global.args.label is not None: print("%-6s" % "LABEL", end="") - print("%-16s %-6s %-6s %-6s %-7s %-10s" % + print("%-16s %-7s %-7s %-7s %-7s %-10s" % ("PCOMM", "PID", "PPID", "TID", "AGE(s)", "EXIT_CODE")) buffer = None @@ -167,7 +167,7 @@ def _print_event(cpu, data, size): # callback label = Global.args.label if len(Global.args.label) else 'exit' print("%-6s" % label, end="") age = (e.exit_time - e.start_time) / 1e9 - print("%-16s %-6d %-6d %-6d %-7.2f " % + print("%-16s %-7d %-7d %-7d %-7.2f " % (e.task.decode(), e.pid, e.ppid, e.tid, age), end="") if e.sig_info == 0: print("0" if e.exit_code == 0 else "code %d" % e.exit_code) diff --git a/tools/syscount.py b/tools/syscount.py index c832c08601ea..9ae027431aa4 100755 --- a/tools/syscount.py +++ b/tools/syscount.py @@ -76,7 +76,7 @@ def handle_errno(errstr): if args.list: for grp in izip_longest(*(iter(sorted(syscalls.values())),) * 4): - print(" ".join(["%-20s" % s for s in grp if s is not None])) + print(" ".join(["%-22s" % s.decode() for s in grp if s is not None])) sys.exit(0) text = """ diff --git a/tools/tcpdrop.py b/tools/tcpdrop.py index d64b57320536..ffa044df766d 100755 --- a/tools/tcpdrop.py +++ b/tools/tcpdrop.py @@ -16,6 +16,7 @@ # Licensed under the Apache License, Version 2.0 (the "License") # # 30-May-2018 Brendan Gregg Created this. 
+# 15-Jun-2022 Rong Tao Add tracepoint:skb:kfree_skb from __future__ import print_function from bcc import BPF @@ -100,7 +101,7 @@ #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #endif -int trace_tcp_drop(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) +static int __trace_tcp_drop(void *ctx, struct sock *sk, struct sk_buff *skb) { if (sk == NULL) return 0; @@ -154,6 +155,29 @@ return 0; } + +int trace_tcp_drop(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) +{ + return __trace_tcp_drop(ctx, sk, skb); +} +""" + +bpf_kfree_skb_text = """ +#include + +TRACEPOINT_PROBE(skb, kfree_skb) { + struct sk_buff *skb = args->skbaddr; + struct sock *sk = skb->sk; + enum skb_drop_reason reason = args->reason; + + // SKB_NOT_DROPPED_YET, + // SKB_DROP_REASON_NOT_SPECIFIED, + if (reason > SKB_DROP_REASON_NOT_SPECIFIED) { + return __trace_tcp_drop(args, sk, skb); + } + + return 0; +} """ if debug or args.ebpf: @@ -194,13 +218,22 @@ def print_ipv6_event(cpu, data, size): print("\t%s" % sym) print("") +if BPF.tracepoint_exists("skb", "kfree_skb"): + if BPF.kernel_struct_has_field("trace_event_raw_kfree_skb", "reason") == 1: + bpf_text += bpf_kfree_skb_text + # initialize BPF b = BPF(text=bpf_text) + if b.get_kprobe_functions(b"tcp_drop"): b.attach_kprobe(event="tcp_drop", fn_name="trace_tcp_drop") +elif b.tracepoint_exists("skb", "kfree_skb"): + print("WARNING: tcp_drop() kernel function not found or traceable. " + "Use tracepoint:skb:kfree_skb instead.") else: - print("ERROR: tcp_drop() kernel function not found or traceable. " - "The kernel might be too old or the the function has been inlined.") + print("ERROR: tcp_drop() kernel function and tracepoint:skb:kfree_skb" + " not found or traceable. 
" + "The kernel might be too old or the function has been inlined.") exit() stack_traces = b.get_table("stack_traces") diff --git a/tools/tcplife.py b/tools/tcplife.py index 780385b4513b..8485a5f56948 100755 --- a/tools/tcplife.py +++ b/tools/tcplife.py @@ -25,7 +25,7 @@ from __future__ import print_function from bcc import BPF import argparse -from socket import inet_ntop, ntohs, AF_INET, AF_INET6 +from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack from time import strftime @@ -191,13 +191,13 @@ FILTER_PID // get throughput stats. see tcp_get_info(). - u64 rx_b = 0, tx_b = 0, sport = 0; + u64 rx_b = 0, tx_b = 0; struct tcp_sock *tp = (struct tcp_sock *)sk; rx_b = tp->bytes_received; tx_b = tp->bytes_acked; u16 family = sk->__sk_common.skc_family; - + FILTER_FAMILY if (family == AF_INET) { @@ -318,12 +318,12 @@ if (mep != 0) pid = mep->pid; FILTER_PID - + u16 family = args->family; FILTER_FAMILY // get throughput stats. see tcp_get_info(). - u64 rx_b = 0, tx_b = 0, sport = 0; + u64 rx_b = 0, tx_b = 0; struct tcp_sock *tp = (struct tcp_sock *)sk; rx_b = tp->bytes_received; tx_b = tp->bytes_acked; diff --git a/tools/xfsslower.py b/tools/xfsslower.py index ef79a8947cb8..ef4b6b5b01fc 100755 --- a/tools/xfsslower.py +++ b/tools/xfsslower.py @@ -186,8 +186,11 @@ // populate output struct u32 size = PT_REGS_RC(ctx); - struct data_t data = {.type = type, .size = size, .delta_us = delta_us, - .pid = pid}; + struct data_t data = {}; + data.type = type; + data.size = size; + data.delta_us = delta_us; + data.pid = pid; data.ts_us = ts / 1000; data.offset = valp->offset; bpf_get_current_comm(&data.task, sizeof(data.task));