diff --git a/docs/reference_guide.md b/docs/reference_guide.md index ff18ab93e676..0474f46849ff 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -1145,9 +1145,9 @@ Examples in situ: ### 13. BPF_XSKMAP -Syntax: ```BPF_XSKMAP(name, size)``` +Syntax: ```BPF_XSKMAP(name, size [, "/sys/fs/bpf/xyz"])``` -This creates a xsk map named ```name``` with ```size``` entries. Each entry represents one NIC's queue id. This map is only used in XDP to redirect packet to an AF_XDP socket. If the AF_XDP socket is binded to a queue which is different than the current packet's queue id, the packet will be dropped. For kernel v5.3 and latter, `lookup` method is available and can be used to check whether and AF_XDP socket is available for the current packet's queue id. More details at [AF_XDP](https://www.kernel.org/doc/html/latest/networking/af_xdp.html). +This creates a xsk map named ```name``` with ```size``` entries and pins it to the bpffs as a FILE. Each entry represents one NIC's queue id. This map is only used in XDP to redirect a packet to an AF_XDP socket. If the AF_XDP socket is bound to a queue which is different from the current packet's queue id, the packet will be dropped. For kernel v5.3 and later, the `lookup` method is available and can be used to check whether an AF_XDP socket is available for the current packet's queue id. More details at [AF_XDP](https://www.kernel.org/doc/html/latest/networking/af_xdp.html). 
For example: ```C diff --git a/libbpf-tools/.gitignore b/libbpf-tools/.gitignore index 251646397529..83f56063ae3f 100644 --- a/libbpf-tools/.gitignore +++ b/libbpf-tools/.gitignore @@ -45,6 +45,7 @@ /syscount /tcpconnect /tcpconnlat +/tcplife /tcprtt /tcpsynbl /vfsstat diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile index e753230cee89..e5aa35a2c48b 100644 --- a/libbpf-tools/Makefile +++ b/libbpf-tools/Makefile @@ -58,6 +58,7 @@ APPS = \ syscount \ tcpconnect \ tcpconnlat \ + tcplife \ tcprtt \ tcpsynbl \ vfsstat \ diff --git a/libbpf-tools/biopattern.bpf.c b/libbpf-tools/biopattern.bpf.c index 2f099be77ee8..334a175dcffb 100644 --- a/libbpf-tools/biopattern.bpf.c +++ b/libbpf-tools/biopattern.bpf.c @@ -5,6 +5,7 @@ #include #include "biopattern.h" #include "maps.bpf.h" +#include "core_fixes.bpf.h" const volatile bool filter_dev = false; const volatile __u32 targ_dev = 0; @@ -17,12 +18,24 @@ struct { } counters SEC(".maps"); SEC("tracepoint/block/block_rq_complete") -int handle__block_rq_complete(struct trace_event_raw_block_rq_complete *ctx) +int handle__block_rq_complete(void *args) { - sector_t sector = ctx->sector; struct counter *counterp, zero = {}; - u32 nr_sector = ctx->nr_sector; - u32 dev = ctx->dev; + sector_t sector; + u32 nr_sector; + u32 dev; + + if (has_block_rq_completion()) { + struct trace_event_raw_block_rq_completion___x *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } else { + struct trace_event_raw_block_rq_complete *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } if (filter_dev && targ_dev != dev) return 0; diff --git a/libbpf-tools/core_fixes.bpf.h b/libbpf-tools/core_fixes.bpf.h index 33a4f7f78311..3bbcbbaf4625 100644 --- a/libbpf-tools/core_fixes.bpf.h +++ b/libbpf-tools/core_fixes.bpf.h @@ -17,6 +17,15 @@ struct task_struct___x { unsigned int __state; } 
__attribute__((preserve_access_index)); +static __always_inline __s64 get_task_state(void *task) +{ + struct task_struct___x *t = task; + + if (bpf_core_field_exists(t->__state)) + return BPF_CORE_READ(t, __state); + return BPF_CORE_READ((struct task_struct *)task, state); +} + /** * commit 309dca309fc3 ("block: store a block_device pointer in struct bio") * adds a new member bi_bdev which is a pointer to struct block_device @@ -27,15 +36,6 @@ struct bio___x { struct block_device *bi_bdev; } __attribute__((preserve_access_index)); -static __always_inline __s64 get_task_state(void *task) -{ - struct task_struct___x *t = task; - - if (bpf_core_field_exists(t->__state)) - return BPF_CORE_READ(t, __state); - return BPF_CORE_READ((struct task_struct *)task, state); -} - static __always_inline struct gendisk *get_gendisk(void *bio) { struct bio___x *b = bio; @@ -45,4 +45,26 @@ static __always_inline struct gendisk *get_gendisk(void *bio) return BPF_CORE_READ((struct bio *)bio, bi_disk); } +/** + * commit d5869fdc189f ("block: introduce block_rq_error tracepoint") + * adds a new tracepoint block_rq_error and it shares the same arguments + * with tracepoint block_rq_complete. As a result, the kernel BTF now has + * a `struct trace_event_raw_block_rq_completion` instead of + * `struct trace_event_raw_block_rq_complete`. 
+ * see: + * https://github.com/torvalds/linux/commit/d5869fdc189f + */ +struct trace_event_raw_block_rq_completion___x { + dev_t dev; + sector_t sector; + unsigned int nr_sector; +} __attribute__((preserve_access_index)); + +static __always_inline bool has_block_rq_completion() +{ + if (bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x)) + return true; + return false; +} + #endif /* __CORE_FIXES_BPF_H */ diff --git a/libbpf-tools/filelife.c b/libbpf-tools/filelife.c index 07286ecf5752..5d0d5ecbb2f2 100644 --- a/libbpf-tools/filelife.c +++ b/libbpf-tools/filelife.c @@ -138,6 +138,9 @@ int main(int argc, char **argv) /* initialize global data (filtering options) */ obj->rodata->targ_tgid = env.pid; + if (!kprobe_exists("security_inode_create")) + bpf_program__set_autoload(obj->progs.security_inode_create, false); + err = filelife_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); diff --git a/libbpf-tools/llcstat.bpf.c b/libbpf-tools/llcstat.bpf.c index a36fc2dfbf9f..77fcf8306805 100644 --- a/libbpf-tools/llcstat.bpf.c +++ b/libbpf-tools/llcstat.bpf.c @@ -3,36 +3,43 @@ #include #include #include +#include "maps.bpf.h" #include "llcstat.h" #define MAX_ENTRIES 10240 +const volatile bool targ_per_thread = false; + struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, MAX_ENTRIES); - __type(key, u64); - __type(value, struct info); + __type(key, struct key_info); + __type(value, struct value_info); } infos SEC(".maps"); static __always_inline int trace_event(__u64 sample_period, bool miss) { - u64 pid = bpf_get_current_pid_tgid(); - u32 cpu = bpf_get_smp_processor_id(); - struct info *infop, info = {}; - u64 key = pid << 32 | cpu; - - infop = bpf_map_lookup_elem(&infos, &key); - if (!infop) { - bpf_get_current_comm(info.comm, sizeof(info.comm)); - infop = &info; - } + struct key_info key = {}; + struct value_info *infop, zero = {}; + + u64 pid_tgid = bpf_get_current_pid_tgid(); + key.cpu = 
bpf_get_smp_processor_id(); + key.pid = pid_tgid >> 32; + if (targ_per_thread) + key.tid = (u32)pid_tgid; + else + key.tid = key.pid; + + infop = bpf_map_lookup_or_try_init(&infos, &key, &zero); + if (!infop) + return 0; if (miss) infop->miss += sample_period; else infop->ref += sample_period; - if (infop == &info) - bpf_map_update_elem(&infos, &key, infop, 0); + bpf_get_current_comm(infop->comm, sizeof(infop->comm)); + return 0; } diff --git a/libbpf-tools/llcstat.c b/libbpf-tools/llcstat.c index bc13e7f133a6..30be26c5e67f 100644 --- a/libbpf-tools/llcstat.c +++ b/libbpf-tools/llcstat.c @@ -3,6 +3,7 @@ // // Based on llcstat(8) from BCC by Teng Qin. // 29-Sep-2020 Wenbo Zhang Created this. +// 20-Jun-2022 YeZhengMao Added tid info. #include #include #include @@ -21,6 +22,7 @@ struct env { int sample_period; time_t duration; bool verbose; + bool per_thread; } env = { .sample_period = 100, .duration = 10, @@ -40,6 +42,8 @@ static const struct argp_option opts[] = { { "sample_period", 'c', "SAMPLE_PERIOD", 0, "Sample one in this many " "number of cache reference / miss events" }, { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { "tid", 't', NULL, 0, + "Summarize cache references and misses by PID/TID" }, { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, {}, }; @@ -55,6 +59,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'v': env.verbose = true; break; + case 't': + env.per_thread = true; + break; case 'c': errno = 0; env.sample_period = strtol(arg, NULL, 10); @@ -131,10 +138,10 @@ static void sig_handler(int sig) static void print_map(struct bpf_map *map) { __u64 total_ref = 0, total_miss = 0, total_hit, hit; - __u64 lookup_key = -1, next_key; + __u32 pid, cpu, tid; + struct key_info lookup_key = { .cpu = -1 }, next_key; int err, fd = bpf_map__fd(map); - struct info info; - __u32 pid, cpu; + struct value_info info; while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { err = bpf_map_lookup_elem(fd, &next_key, 
&info); @@ -143,11 +150,16 @@ static void print_map(struct bpf_map *map) return; } hit = info.ref > info.miss ? info.ref - info.miss : 0; - pid = next_key >> 32; - cpu = next_key; - printf("%-8u %-16s %-4u %12llu %12llu %6.2f%%\n", pid, info.comm, - cpu, info.ref, info.miss, info.ref > 0 ? - hit * 1.0 / info.ref * 100 : 0); + cpu = next_key.cpu; + pid = next_key.pid; + tid = next_key.tid; + printf("%-8u ", pid); + if (env.per_thread) { + printf("%-8u ", tid); + } + printf("%-16s %-4u %12llu %12llu %6.2f%%\n", + info.comm, cpu, info.ref, info.miss, + info.ref > 0 ? hit * 1.0 / info.ref * 100 : 0); total_miss += info.miss; total_ref += info.ref; lookup_key = next_key; @@ -157,7 +169,7 @@ static void print_map(struct bpf_map *map) total_ref, total_miss, total_ref > 0 ? total_hit * 1.0 / total_ref * 100 : 0); - lookup_key = -1; + lookup_key.cpu = -1; while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { err = bpf_map_delete_elem(fd, &next_key); if (err < 0) { @@ -212,6 +224,8 @@ int main(int argc, char **argv) goto cleanup; } + obj->rodata->targ_per_thread = env.per_thread; + err = llcstat_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); @@ -233,8 +247,12 @@ int main(int argc, char **argv) sleep(env.duration); - printf("%-8s %-16s %-4s %12s %12s %7s\n", - "PID", "NAME", "CPU", "REFERENCE", "MISS", "HIT%"); + printf("%-8s ", "PID"); + if (env.per_thread) { + printf("%-8s ", "TID"); + } + printf("%-16s %-4s %12s %12s %7s\n", + "NAME", "CPU", "REFERENCE", "MISS", "HIT%"); print_map(obj->maps.infos); diff --git a/libbpf-tools/llcstat.h b/libbpf-tools/llcstat.h index 8123cd7d90bf..83a50db8cc90 100644 --- a/libbpf-tools/llcstat.h +++ b/libbpf-tools/llcstat.h @@ -4,10 +4,16 @@ #define TASK_COMM_LEN 16 -struct info { +struct value_info { __u64 ref; __u64 miss; char comm[TASK_COMM_LEN]; }; +struct key_info { + __u32 cpu; + __u32 pid; + __u32 tid; +}; + #endif /* __LLCSTAT_H */ diff --git a/libbpf-tools/syscount.bpf.c 
b/libbpf-tools/syscount.bpf.c index 6209feeaa023..38f8f978343d 100644 --- a/libbpf-tools/syscount.bpf.c +++ b/libbpf-tools/syscount.bpf.c @@ -78,7 +78,7 @@ int sys_exit(struct trace_event_raw_sys_exit *args) static const struct data_t zero; pid_t pid = id >> 32; struct data_t *val; - u64 *start_ts; + u64 *start_ts, lat = 0; u32 tid = id; u32 key; @@ -97,6 +97,7 @@ int sys_exit(struct trace_event_raw_sys_exit *args) start_ts = bpf_map_lookup_elem(&start, &tid); if (!start_ts) return 0; + lat = bpf_ktime_get_ns() - *start_ts; } key = (count_by_process) ? pid : args->id; @@ -106,7 +107,7 @@ int sys_exit(struct trace_event_raw_sys_exit *args) if (count_by_process) save_proc_name(val); if (measure_latency) - __sync_fetch_and_add(&val->total_ns, bpf_ktime_get_ns() - *start_ts); + __sync_fetch_and_add(&val->total_ns, lat); } return 0; } diff --git a/libbpf-tools/tcpconnect.bpf.c b/libbpf-tools/tcpconnect.bpf.c index a13d48c239f2..c57faa0263ce 100644 --- a/libbpf-tools/tcpconnect.bpf.c +++ b/libbpf-tools/tcpconnect.bpf.c @@ -55,7 +55,7 @@ static __always_inline bool filter_port(__u16 port) if (filter_ports_len == 0) return false; - for (i = 0; i < filter_ports_len; i++) { + for (i = 0; i < filter_ports_len && i < MAX_PORTS; i++) { if (port == filter_ports[i]) return false; } diff --git a/libbpf-tools/tcpconnlat.bpf.c b/libbpf-tools/tcpconnlat.bpf.c index 56d374144b0f..b44abb293fa4 100644 --- a/libbpf-tools/tcpconnlat.bpf.c +++ b/libbpf-tools/tcpconnlat.bpf.c @@ -31,7 +31,7 @@ struct { __uint(value_size, sizeof(u32)); } events SEC(".maps"); -static __always_inline int trace_connect(struct sock *sk) +static int trace_connect(struct sock *sk) { u32 tgid = bpf_get_current_pid_tgid() >> 32; struct piddata piddata = {}; @@ -46,27 +46,14 @@ static __always_inline int trace_connect(struct sock *sk) return 0; } -SEC("fentry/tcp_v4_connect") -int BPF_PROG(tcp_v4_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("kprobe/tcp_v6_connect") -int 
BPF_KPROBE(tcp_v6_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("fentry/tcp_rcv_state_process") -int BPF_PROG(tcp_rcv_state_process, struct sock *sk) +static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) { struct piddata *piddatap; struct event event = {}; s64 delta; u64 ts; - if (sk->__sk_common.skc_state != TCP_SYN_SENT) + if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT) return 0; piddatap = bpf_map_lookup_elem(&start, &sk); @@ -85,12 +72,12 @@ int BPF_PROG(tcp_rcv_state_process, struct sock *sk) sizeof(event.comm)); event.ts_us = ts / 1000; event.tgid = piddatap->tgid; - event.lport = sk->__sk_common.skc_num; - event.dport = sk->__sk_common.skc_dport; - event.af = sk->__sk_common.skc_family; + event.lport = BPF_CORE_READ(sk, __sk_common.skc_num); + event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + event.af = BPF_CORE_READ(sk, __sk_common.skc_family); if (event.af == AF_INET) { - event.saddr_v4 = sk->__sk_common.skc_rcv_saddr; - event.daddr_v4 = sk->__sk_common.skc_daddr; + event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr); } else { BPF_CORE_READ_INTO(&event.saddr_v6, sk, __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); @@ -105,4 +92,40 @@ int BPF_PROG(tcp_rcv_state_process, struct sock *sk) return 0; } +SEC("kprobe/tcp_v4_connect") +int BPF_KPROBE(tcp_v4_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_v6_connect") +int BPF_KPROBE(tcp_v6_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_rcv_state_process") +int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk) +{ + return handle_tcp_rcv_state_process(ctx, sk); +} + +SEC("fentry/tcp_v4_connect") +int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("fentry/tcp_v6_connect") +int BPF_PROG(fentry_tcp_v6_connect, struct sock *sk) +{ + return trace_connect(sk); +} + 
+SEC("fentry/tcp_rcv_state_process") +int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk) +{ + return handle_tcp_rcv_state_process(ctx, sk); +} + char LICENSE[] SEC("license") = "GPL"; diff --git a/libbpf-tools/tcpconnlat.c b/libbpf-tools/tcpconnlat.c index 8eae76aeadab..c07aa8afc252 100644 --- a/libbpf-tools/tcpconnlat.c +++ b/libbpf-tools/tcpconnlat.c @@ -182,6 +182,19 @@ int main(int argc, char **argv) obj->rodata->targ_min_us = env.min_us; obj->rodata->targ_tgid = env.pid; + if (fentry_can_attach("tcp_v4_connect", NULL)) { + bpf_program__set_attach_target(obj->progs.fentry_tcp_v4_connect, 0, "tcp_v4_connect"); + bpf_program__set_attach_target(obj->progs.fentry_tcp_v6_connect, 0, "tcp_v6_connect"); + bpf_program__set_attach_target(obj->progs.fentry_tcp_rcv_state_process, 0, "tcp_rcv_state_process"); + bpf_program__set_autoload(obj->progs.tcp_v4_connect, false); + bpf_program__set_autoload(obj->progs.tcp_v6_connect, false); + bpf_program__set_autoload(obj->progs.tcp_rcv_state_process, false); + } else { + bpf_program__set_autoload(obj->progs.fentry_tcp_v4_connect, false); + bpf_program__set_autoload(obj->progs.fentry_tcp_v6_connect, false); + bpf_program__set_autoload(obj->progs.fentry_tcp_rcv_state_process, false); + } + err = tcpconnlat_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); diff --git a/libbpf-tools/tcplife.bpf.c b/libbpf-tools/tcplife.bpf.c new file mode 100644 index 000000000000..a05d1396ecaa --- /dev/null +++ b/libbpf-tools/tcplife.bpf.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Hengqi Chen */ +#include +#include +#include +#include +#include "tcplife.h" + +#define MAX_ENTRIES 10240 +#define AF_INET 2 +#define AF_INET6 10 + +const volatile bool filter_sport = false; +const volatile bool filter_dport = false; +const volatile __u16 target_sports[MAX_PORTS] = {}; +const volatile __u16 target_dports[MAX_PORTS] = {}; +const volatile pid_t target_pid = 0; +const volatile 
__u16 target_family = 0; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct sock *); + __type(value, __u64); +} birth SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct sock *); + __type(value, struct ident); +} idents SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} events SEC(".maps"); + +SEC("tracepoint/sock/inet_sock_set_state") +int inet_sock_set_state(struct trace_event_raw_inet_sock_set_state *args) +{ + __u64 ts, *start, delta_us, rx_b, tx_b; + struct ident ident = {}, *identp; + __u16 sport, dport, family; + struct event event = {}; + struct tcp_sock *tp; + struct sock *sk; + bool found; + __u32 pid; + int i; + + if (BPF_CORE_READ(args, protocol) != IPPROTO_TCP) + return 0; + + family = BPF_CORE_READ(args, family); + if (target_family && family != target_family) + return 0; + + sport = BPF_CORE_READ(args, sport); + if (filter_sport) { + found = false; + for (i = 0; i < MAX_PORTS; i++) { + if (!target_sports[i]) + return 0; + if (sport != target_sports[i]) + continue; + found = true; + break; + } + if (!found) + return 0; + } + + dport = BPF_CORE_READ(args, dport); + if (filter_dport) { + found = false; + for (i = 0; i < MAX_PORTS; i++) { + if (!target_dports[i]) + return 0; + if (dport != target_dports[i]) + continue; + found = true; + break; + } + if (!found) + return 0; + } + + sk = (struct sock *)BPF_CORE_READ(args, skaddr); + if (BPF_CORE_READ(args, newstate) < TCP_FIN_WAIT1) { + ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&birth, &sk, &ts, BPF_ANY); + } + + if (BPF_CORE_READ(args, newstate) == TCP_SYN_SENT || BPF_CORE_READ(args, newstate) == TCP_LAST_ACK) { + pid = bpf_get_current_pid_tgid() >> 32; + if (target_pid && pid != target_pid) + return 0; + ident.pid = pid; + bpf_get_current_comm(ident.comm, sizeof(ident.comm)); + 
bpf_map_update_elem(&idents, &sk, &ident, BPF_ANY); + } + + if (BPF_CORE_READ(args, newstate) != TCP_CLOSE) + return 0; + + start = bpf_map_lookup_elem(&birth, &sk); + if (!start) { + bpf_map_delete_elem(&idents, &sk); + return 0; + } + ts = bpf_ktime_get_ns(); + delta_us = (ts - *start) / 1000; + + identp = bpf_map_lookup_elem(&idents, &sk); + pid = identp ? identp->pid : bpf_get_current_pid_tgid() >> 32; + if (target_pid && pid != target_pid) + goto cleanup; + + tp = (struct tcp_sock *)sk; + rx_b = BPF_CORE_READ(tp, bytes_received); + tx_b = BPF_CORE_READ(tp, bytes_acked); + + event.ts_us = ts / 1000; + event.span_us = delta_us; + event.rx_b = rx_b; + event.tx_b = tx_b; + event.pid = pid; + event.sport = sport; + event.dport = dport; + event.family = family; + if (!identp) + bpf_get_current_comm(event.comm, sizeof(event.comm)); + else + bpf_probe_read_kernel(event.comm, sizeof(event.comm), (void *)identp->comm); + if (family == AF_INET) { + bpf_probe_read_kernel(&event.saddr, sizeof(args->saddr), BPF_CORE_READ(args, saddr)); + bpf_probe_read_kernel(&event.daddr, sizeof(args->daddr), BPF_CORE_READ(args, daddr)); + } else { /* AF_INET6 */ + bpf_probe_read_kernel(&event.saddr, sizeof(args->saddr_v6), BPF_CORE_READ(args, saddr_v6)); + bpf_probe_read_kernel(&event.daddr, sizeof(args->daddr_v6), BPF_CORE_READ(args, daddr_v6)); + } + bpf_perf_event_output(args, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); + +cleanup: + bpf_map_delete_elem(&birth, &sk); + bpf_map_delete_elem(&idents, &sk); + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/libbpf-tools/tcplife.c b/libbpf-tools/tcplife.c new file mode 100644 index 000000000000..b31109d8834d --- /dev/null +++ b/libbpf-tools/tcplife.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * tcplife Trace the lifespan of TCP sessions and summarize. + * + * Copyright (c) 2022 Hengqi Chen + * + * Based on tcplife(8) from BCC by Brendan Gregg. + * 02-Jun-2022 Hengqi Chen Created this. 
+ */ +#include +#include +#include +#include +#include +#include + +#include "btf_helpers.h" +#include "tcplife.h" +#include "tcplife.skel.h" + +#define PERF_BUFFER_PAGES 16 +#define PERF_POLL_TIMEOUT_MS 100 + +static volatile sig_atomic_t exiting = 0; + +static pid_t target_pid = 0; +static short target_family = 0; +static char *target_sports = NULL; +static char *target_dports = NULL; +static int column_width = 15; +static bool emit_timestamp = false; +static bool verbose = false; + +const char *argp_program_version = "tcplife 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; +const char argp_program_doc[] = +"Trace the lifespan of TCP sessions and summarize.\n" +"\n" +"USAGE: tcplife [-h] [-p PID] [-4] [-6] [-L] [-D] [-T] [-w]\n" +"\n" +"EXAMPLES:\n" +" tcplife -p 1215 # only trace PID 1215\n" +" tcplife -p 1215 -4 # trace IPv4 only\n"; + +static const struct argp_option opts[] = { + { "pid", 'p', "PID", 0, "Process ID to trace" }, + { "ipv4", '4', NULL, 0, "Trace IPv4 only" }, + { "ipv6", '6', NULL, 0, "Trace IPv6 only" }, + { "wide", 'w', NULL, 0, "Wide column output (fits IPv6 addresses)" }, + { "time", 'T', NULL, 0, "Include timestamp on output" }, + { "localport", 'L', "LOCALPORT", 0, "Comma-separated list of local ports to trace." }, + { "remoteport", 'D', "REMOTEPORT", 0, "Comma-separated list of remote ports to trace." 
}, + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long n; + + switch (key) { + case 'p': + errno = 0; + n = strtol(arg, NULL, 10); + if (errno || n <= 0) { + fprintf(stderr, "Invalid PID: %s\n", arg); + argp_usage(state); + } + target_pid = n; + break; + case '4': + target_family = AF_INET; + break; + case '6': + target_family = AF_INET6; + break; + case 'w': + column_width = 26; + break; + case 'L': + target_sports = strdup(arg); + break; + case 'D': + target_dports = strdup(arg); + break; + case 'T': + emit_timestamp = true; + break; + case 'v': + verbose = true; + break; + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sig_int(int signo) +{ + exiting = 1; +} + +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + char ts[32], saddr[48], daddr[48]; + struct event *e = data; + struct tm *tm; + time_t t; + + if (emit_timestamp) { + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%8s ", ts); + } + + inet_ntop(e->family, &e->saddr, saddr, sizeof(saddr)); + inet_ntop(e->family, &e->daddr, daddr, sizeof(daddr)); + + printf("%-7d %-16s %-*s %-5d %-*s %-5d %-6.2f %-6.2f %-.2f\n", + e->pid, e->comm, column_width, saddr, e->sport, column_width, daddr, e->dport, + (double)e->tx_b / 1024, (double)e->rx_b / 1024, (double)e->span_us / 1000); +} + +static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu); +} + +int main(int argc, char **argv) +{ + LIBBPF_OPTS(bpf_object_open_opts, open_opts); + 
static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + struct tcplife_bpf *obj; + struct perf_buffer *pb = NULL; + short port_num; + char *port; + int err, i; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + + err = ensure_core_btf(&open_opts); + if (err) { + fprintf(stderr, "failed to fetch necessary BTF for CO-RE: %s\n", strerror(-err)); + return 1; + } + + obj = tcplife_bpf__open_opts(&open_opts); + if (!obj) { + fprintf(stderr, "failed to open BPF object\n"); + return 1; + } + + obj->rodata->target_pid = target_pid; + obj->rodata->target_family = target_family; + + if (target_sports) { + i = 0; + port = strtok(target_sports, ","); + while (port && i < MAX_PORTS) { + port_num = strtol(port, NULL, 10); + obj->rodata->target_sports[i++] = port_num; + port = strtok(NULL, ","); + } + } + + if (target_dports) { + i = 0; + port = strtok(target_dports, ","); + while (port && i < MAX_PORTS) { + port_num = strtol(port, NULL, 10); + obj->rodata->target_dports[i++] = port_num; + port = strtok(NULL, ","); + } + } + + err = tcplife_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF object: %d\n", err); + goto cleanup; + } + + err = tcplife_bpf__attach(obj); + if (err) { + fprintf(stderr, "failed to attach BPF object: %d\n", err); + goto cleanup; + } + + pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, + handle_event, handle_lost_events, NULL, NULL); + if (!pb) { + err = -errno; + fprintf(stderr, "failed to open perf buffer: %d\n", err); + goto cleanup; + } + + if (signal(SIGINT, sig_int) == SIG_ERR) { + fprintf(stderr, "can't set signal handler: %s\n", strerror(errno)); + err = 1; + goto cleanup; + } + + if (emit_timestamp) + printf("%-8s ", "TIME(s)"); + printf("%-7s %-16s %-*s %-5s %-*s %-5s %-6s %-6s %-s\n", + "PID", "COMM", column_width, "LADDR", "LPORT", 
column_width, "RADDR", "RPORT", + "TX_KB", "RX_KB", "MS"); + + while (!exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; + } + /* reset err to return 0 if exiting */ + err = 0; + } + +cleanup: + perf_buffer__free(pb); + tcplife_bpf__destroy(obj); + cleanup_core_btf(&open_opts); + return err != 0; +} diff --git a/libbpf-tools/tcplife.h b/libbpf-tools/tcplife.h new file mode 100644 index 000000000000..6e92352f533a --- /dev/null +++ b/libbpf-tools/tcplife.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Hengqi Chen */ +#ifndef __TCPLIFE_H +#define __TCPLIFE_H + +#define MAX_PORTS 1024 +#define TASK_COMM_LEN 16 + +struct ident { + __u32 pid; + char comm[TASK_COMM_LEN]; +}; + +struct event { + unsigned __int128 saddr; + unsigned __int128 daddr; + __u64 ts_us; + __u64 span_us; + __u64 rx_b; + __u64 tx_b; + __u32 pid; + __u16 sport; + __u16 dport; + __u16 family; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __TCPLIFE_H */ diff --git a/libbpf-tools/trace_helpers.c b/libbpf-tools/trace_helpers.c index 9165be429084..e04028e1c50a 100644 --- a/libbpf-tools/trace_helpers.c +++ b/libbpf-tools/trace_helpers.c @@ -993,16 +993,25 @@ bool is_kernel_module(const char *name) static bool fentry_try_attach(int id) { - struct bpf_insn insns[] = { { .code = BPF_JMP | BPF_EXIT } }; - LIBBPF_OPTS(bpf_prog_load_opts, opts); int prog_fd, attach_fd; - - opts.expected_attach_type = BPF_TRACE_FENTRY; - opts.attach_btf_id = id, - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, "test", NULL, insns, 1, &opts); - if (prog_fd < 0) + char error[4096]; + struct bpf_insn insns[] = { + { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 }, + { .code = BPF_JMP | BPF_EXIT }, + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .expected_attach_type = BPF_TRACE_FENTRY, + .attach_btf_id = id, + .log_buf = error, + .log_size = 
sizeof(error), + ); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, "test", "GPL", insns, + sizeof(insns) / sizeof(struct bpf_insn), &opts); + if (prog_fd < 0) { + fprintf(stderr, "failed to try attaching to fentry: %s\n", error); return false; + } attach_fd = bpf_raw_tracepoint_open(NULL, prog_fd); if (attach_fd >= 0) diff --git a/man/man8/biosnoop.8 b/man/man8/biosnoop.8 index 9a0ae2649680..24f19edff200 100644 --- a/man/man8/biosnoop.8 +++ b/man/man8/biosnoop.8 @@ -2,7 +2,7 @@ .SH NAME biosnoop \- Trace block device I/O and print details incl. issuing PID. .SH SYNOPSIS -.B biosnoop [\-hQ] +.B biosnoop [\-h] [\-Q] [\-d DISK] .SH DESCRIPTION This tools traces block device I/O (disk I/O), and prints a one-line summary for each I/O showing various details. These include the latency from the time of @@ -29,6 +29,9 @@ Print usage message. .TP \-Q Include a column showing the time spent queued in the OS. +.TP +\-d DISK +Trace this disk only. .SH EXAMPLES .TP Trace all block device I/O and print a summary line per I/O: @@ -82,6 +85,6 @@ Linux .SH STABILITY Unstable - in development. .SH AUTHOR -Brendan Gregg +Brendan Gregg, Rocky Xing .SH SEE ALSO disksnoop(8), iostat(1) diff --git a/man/man8/llcstat.8 b/man/man8/llcstat.8 index 36dbed7dedc1..5a28d3384c51 100644 --- a/man/man8/llcstat.8 +++ b/man/man8/llcstat.8 @@ -28,6 +28,9 @@ Print usage message. \-c SAMPLE_PERIOD Sample one in this many cache reference and cache miss events. .TP +\-t +Summarize cache references and misses by PID/TID +.TP duration Duration to trace, in seconds. 
.SH EXAMPLES diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc index 12c8250b2910..b1aae89ed8fd 100644 --- a/src/cc/bcc_syms.cc +++ b/src/cc/bcc_syms.cc @@ -289,11 +289,8 @@ bool ProcSyms::Module::contains(uint64_t addr, uint64_t &offset) const { for (const auto &range : ranges_) { if (addr >= range.start && addr < range.end) { if (type_ == ModuleType::SO || type_ == ModuleType::VDSO) { - // Offset within the mmap - offset = addr - range.start + range.file_offset; - - // Offset within the ELF for SO symbol lookup - offset += (elf_so_addr_ - elf_so_offset_); + offset = __so_calc_mod_offset(range.start, range.file_offset, + elf_so_addr_, elf_so_offset_, addr); } else { offset = addr; } @@ -619,9 +616,26 @@ int _bcc_syms_find_module(mod_info *info, int enter_ns, void *p) { return -1; } +uint64_t __so_calc_global_addr(uint64_t mod_start_addr, + uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, uint64_t offset) { + return offset + (mod_start_addr - mod_file_offset) - + (elf_sec_start_addr - elf_sec_file_offset); +} + +uint64_t __so_calc_mod_offset(uint64_t mod_start_addr, uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, + uint64_t global_addr) { + return global_addr - (mod_start_addr - mod_file_offset) + + (elf_sec_start_addr - elf_sec_file_offset); +} + int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address, uint8_t inode_match_only, uint64_t *global) { struct stat s; + uint64_t elf_so_addr, elf_so_offset; if (stat(module, &s)) return -1; @@ -632,7 +646,11 @@ int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address, mod.start == 0x0) return -1; - *global = mod.start - mod.file_offset + address; + if (bcc_elf_get_text_scn_info(module, &elf_so_addr, &elf_so_offset) < 0) + return -1; + + *global = __so_calc_global_addr(mod.start, mod.file_offset, elf_so_addr, + elf_so_offset, address); return 0; } diff --git a/src/cc/bcc_syms.h 
b/src/cc/bcc_syms.h index 80627debead9..eb1e4ead4b8a 100644 --- a/src/cc/bcc_syms.h +++ b/src/cc/bcc_syms.h @@ -102,6 +102,30 @@ int bcc_resolve_symname(const char *module, const char *symname, struct bcc_symbol_option* option, struct bcc_symbol *sym); +/* Calculate the global address for 'offset' in a shared object loaded into + * a process + * + * Need to know (start_addr, file_offset) pairs for the /proc/PID/maps module + * entry containing the offset and the elf section containing the module's + * .text + */ +uint64_t __so_calc_global_addr(uint64_t mod_start_addr, + uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, uint64_t offset); + +/* Given a global address which falls within a shared object's mapping in a + * process, calculate the corresponding 'offset' in the .so + * + * Need to know (start_addr, file_offset) pairs for the /proc/PID/maps module + * entry containing the offset and the elf section containing the module's + * .text + */ +uint64_t __so_calc_mod_offset(uint64_t mod_start_addr, uint64_t mod_file_offset, + uint64_t elf_sec_start_addr, + uint64_t elf_sec_file_offset, + uint64_t global_addr); + #ifdef __cplusplus } #endif diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index e2de995fc25f..82dc0fe13aec 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -390,7 +390,7 @@ struct _name##_table_t _name = { .max_entries = (_max_entries) } #define BPF_CPUMAP(_name, _max_entries) \ BPF_XDP_REDIRECT_MAP("cpumap", u32, _name, _max_entries) -#define BPF_XSKMAP(_name, _max_entries) \ +#define _BPF_XSKMAP(_name, _max_entries, _pinned) \ struct _name##_table_t { \ u32 key; \ int leaf; \ @@ -399,8 +399,12 @@ struct _name##_table_t { \ u64 (*redirect_map) (int, int); \ u32 max_entries; \ }; \ -__attribute__((section("maps/xskmap"))) \ +__attribute__((section("maps/xskmap" _pinned))) \ struct _name##_table_t _name = { .max_entries = (_max_entries) } +#define BPF_XSKMAP2(_name, _max_entries) 
_BPF_XSKMAP(_name, _max_entries, "") +#define BPF_XSKMAP3(_name, _max_entries, _pinned) _BPF_XSKMAP(_name, _max_entries, ":" _pinned) +#define BPF_XSKMAPX(_1, _2, _3, NAME, ...) NAME +#define BPF_XSKMAP(...) BPF_XSKMAPX(__VA_ARGS__, BPF_XSKMAP3, BPF_XSKMAP2)(__VA_ARGS__) #define BPF_ARRAY_OF_MAPS(_name, _inner_map_name, _max_entries) \ BPF_TABLE("array_of_maps$" _inner_map_name, int, int, _name, _max_entries) diff --git a/tests/cc/test_c_api.cc b/tests/cc/test_c_api.cc index eb56dc08e7a9..510ccda942e8 100644 --- a/tests/cc/test_c_api.cc +++ b/tests/cc/test_c_api.cc @@ -600,6 +600,59 @@ TEST_CASE("resolve global addr in libc in this process", "[c_api][!mayfail]") { REQUIRE(global_addr == (search.start + local_addr - search.file_offset)); } +/* Consider the following scenario: we have some process that maps in a shared library [1] with a + * USDT probe [2]. The shared library's .text section doesn't have matching address and file off + * [3]. Since the location address in [2] is an offset relative to the base address of whatever.so + * in whatever process is mapping it, we need to convert the location address 0x77b8c to a global + * address in the process' address space in order to attach to the USDT. 
+ * + * The formula for this (__so_calc_global_addr) is + * global_addr = offset + (mod_start_addr - mod_file_offset) + * - (elf_sec_start_addr - elf_sec_file_offset) + * + * Which for our concrete example is + * global_addr = 0x77b8c + (0x7f6cda31e000 - 0x72000) - (0x73c90 - 0x72c90) + * global_addr = 0x7f6cda322b8c + * + * [1 - output from `cat /proc/PID/maps`] + * 7f6cda2ab000-7f6cda31e000 r--p 00000000 00:2d 5370022276 /whatever.so + * 7f6cda31e000-7f6cda434000 r-xp 00072000 00:2d 5370022276 /whatever.so + * 7f6cda434000-7f6cda43d000 r--p 00187000 00:2d 5370022276 /whatever.so + * 7f6cda43d000-7f6cda43f000 rw-p 0018f000 00:2d 5370022276 /whatever.so + * + * [2 - output from `readelf -n /whatever.so`] + * stapsdt 0x00000038 NT_STAPSDT (SystemTap probe descriptors) + * Provider: test + * Name: test_probe + * Location: 0x0000000000077b8c, Base: 0x0000000000000000, Semaphore: 0x0000000000000000 + * Arguments: -8@$5 + * + * [3 - output from `readelf -W --sections /whatever.so`] + * [Nr] Name Type Address Off Size ES Flg Lk Inf Al + * [16] .text PROGBITS 0000000000073c90 072c90 1132dc 00 AX 0 0 16 + */ +TEST_CASE("conversion of module offset to/from global_addr", "[c_api]") { + uint64_t global_addr, offset, calc_offset, mod_start_addr, mod_file_offset; + uint64_t elf_sec_start_addr, elf_sec_file_offset; + + /* Initialize per example in comment above */ + offset = 0x77b8c; + mod_start_addr = 0x7f6cda31e000; + mod_file_offset = 0x00072000; + elf_sec_start_addr = 0x73c90; + elf_sec_file_offset = 0x72c90; + global_addr = __so_calc_global_addr(mod_start_addr, mod_file_offset, + elf_sec_start_addr, elf_sec_file_offset, + offset); + REQUIRE(global_addr == 0x7f6cda322b8c); + + /* Reverse operation (global_addr -> offset) should yield original offset */ + calc_offset = __so_calc_mod_offset(mod_start_addr, mod_file_offset, + elf_sec_start_addr, elf_sec_file_offset, + global_addr); + REQUIRE(calc_offset == offset); +} + TEST_CASE("get online CPUs", "[c_api]") { std::vector cpus 
= ebpf::get_online_cpus(); int num_cpus = sysconf(_SC_NPROCESSORS_ONLN); diff --git a/tools/biosnoop.py b/tools/biosnoop.py index 3f3ebd2e4f50..1028aa76ed56 100755 --- a/tools/biosnoop.py +++ b/tools/biosnoop.py @@ -12,16 +12,18 @@ # # 16-Sep-2015 Brendan Gregg Created this. # 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT +# 21-Jun-2022 Rocky Xing Added disk filter support. from __future__ import print_function from bcc import BPF -import re import argparse +import os # arguments examples = """examples: ./biosnoop # trace all block I/O ./biosnoop -Q # include OS queued time + ./biolatency -d sdc # trace sdc only """ parser = argparse.ArgumentParser( description="Trace block I/O", @@ -29,6 +31,8 @@ epilog=examples) parser.add_argument("-Q", "--queue", action="store_true", help="include OS queued time") +parser.add_argument("-d", "--disk", type=str, + help="Trace this disk only") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() @@ -70,6 +74,8 @@ // cache PID and comm by-req int trace_pid_start(struct pt_regs *ctx, struct request *req) { + DISK_FILTER + struct val_t val = {}; u64 ts; @@ -86,6 +92,8 @@ // time block I/O int trace_req_start(struct pt_regs *ctx, struct request *req) { + DISK_FILTER + struct start_req_t start_req = { .ts = bpf_ktime_get_ns(), .data_len = req->__data_len @@ -160,6 +168,34 @@ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk') else: bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk') + +if args.disk is not None: + disk_path = os.path.join('/dev', args.disk) + if not os.path.exists(disk_path): + print("no such disk '%s'" % args.disk) + exit(1) + + stat_info = os.stat(disk_path) + major = os.major(stat_info.st_rdev) + minor = os.minor(stat_info.st_rdev) + + disk_field_str = "" + if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1: + disk_field_str = 'req->rq_disk' + else: + disk_field_str = 'req->q->disk' + + disk_filter_str = """ + struct gendisk *disk = %s; + if 
(!(disk->major == %d && disk->first_minor == %d)) { + return 0; + } + """ % (disk_field_str, major, minor) + + bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str) +else: + bpf_text = bpf_text.replace('DISK_FILTER', '') + if debug or args.ebpf: print(bpf_text) if args.ebpf: diff --git a/tools/biosnoop_example.txt b/tools/biosnoop_example.txt index d8be0624c94e..38b0ca3431f9 100644 --- a/tools/biosnoop_example.txt +++ b/tools/biosnoop_example.txt @@ -64,14 +64,16 @@ TIME(s) COMM PID DISK T SECTOR BYTES QUE(ms) LAT(ms) USAGE message: -usage: biosnoop.py [-h] [-Q] +usage: biosnoop.py [-h] [-Q] [-d DISK] Trace block I/O optional arguments: - -h, --help show this help message and exit - -Q, --queue include OS queued time + -h, --help show this help message and exit + -Q, --queue include OS queued time + -d DISK, --disk DISK Trace this disk only examples: ./biosnoop # trace all block I/O ./biosnoop -Q # include OS queued time + ./biolatency -d sdc # trace sdc only diff --git a/tools/llcstat.py b/tools/llcstat.py index 4f1ba2f9a88a..ec7f4c364dc5 100755 --- a/tools/llcstat.py +++ b/tools/llcstat.py @@ -15,6 +15,7 @@ # Licensed under the Apache License, Version 2.0 (the "License") # # 19-Oct-2016 Teng Qin Created this. +# 20-Jun-2022 YeZhengMao Added tid info. 
from __future__ import print_function import argparse @@ -30,6 +31,10 @@ help="Sample one in this many number of cache reference / miss events") parser.add_argument( "duration", nargs="?", default=10, help="Duration, in seconds, to run") +parser.add_argument( + "-t", "--tid", action="store_true", + help="Summarize cache references and misses by PID/TID" +) parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() @@ -41,7 +46,8 @@ struct key_t { int cpu; - int pid; + u32 pid; + u32 tid; char name[TASK_COMM_LEN]; }; @@ -49,8 +55,10 @@ BPF_HASH(miss_count, struct key_t); static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + u64 pid_tgid = bpf_get_current_pid_tgid(); key->cpu = bpf_get_smp_processor_id(); - key->pid = bpf_get_current_pid_tgid() >> 32; + key->pid = pid_tgid >> 32; + key->tid = GET_TID ? (u32)pid_tgid : key->pid; bpf_get_current_comm(&(key->name), sizeof(key->name)); } @@ -73,6 +81,8 @@ } """ +bpf_text = bpf_text.replace("GET_TID", "1" if args.tid else "0") + if args.ebpf: print(bpf_text) exit() @@ -98,22 +108,42 @@ miss_count = {} for (k, v) in b.get_table('miss_count').items(): - miss_count[(k.pid, k.cpu, k.name)] = v.value + if args.tid: + miss_count[(k.pid, k.tid, k.cpu, k.name)] = v.value + else: + miss_count[(k.pid, k.cpu, k.name)] = v.value + +header_text = 'PID ' +format_text = '{:<8d} ' +if args.tid: + header_text += 'TID ' + format_text += '{:<8d} ' + +header_text += 'NAME CPU REFERENCE MISS HIT%' +format_text += '{:<16s} {:<4d} {:>12d} {:>12d} {:>6.2f}%' -print('PID NAME CPU REFERENCE MISS HIT%') +print(header_text) tot_ref = 0 tot_miss = 0 for (k, v) in b.get_table('ref_count').items(): try: - miss = miss_count[(k.pid, k.cpu, k.name)] + if args.tid: + miss = miss_count[(k.pid, k.tid, k.cpu, k.name)] + else: + miss = miss_count[(k.pid, k.cpu, k.name)] except KeyError: miss = 0 tot_ref += v.value tot_miss += miss # This happens on some PIDs due to missed counts caused by 
sampling hit = (v.value - miss) if (v.value >= miss) else 0 - print('{:<8d} {:<16s} {:<4d} {:>12d} {:>12d} {:>6.2f}%'.format( - k.pid, k.name.decode('utf-8', 'replace'), k.cpu, v.value, miss, - (float(hit) / float(v.value)) * 100.0)) + if args.tid: + print(format_text.format( + k.pid, k.tid, k.name.decode('utf-8', 'replace'), k.cpu, v.value, miss, + (float(hit) / float(v.value)) * 100.0)) + else: + print(format_text.format( + k.pid, k.name.decode('utf-8', 'replace'), k.cpu, v.value, miss, + (float(hit) / float(v.value)) * 100.0)) print('Total References: {} Total Misses: {} Hit Rate: {:.2f}%'.format( tot_ref, tot_miss, (float(tot_ref - tot_miss) / float(tot_ref)) * 100.0)) diff --git a/tools/llcstat_example.txt b/tools/llcstat_example.txt index ef2aec10f6f6..a7c1a78d694c 100644 --- a/tools/llcstat_example.txt +++ b/tools/llcstat_example.txt @@ -38,6 +38,21 @@ some degree by chance. Overall it should make sense. But for low counts, you might find a case where -- by chance -- a process has been tallied with more misses than references, which would seem impossible. +# ./llcstat.py 10 -t +Running for 10 seconds or hit Ctrl-C to end. +PID TID NAME CPU REFERENCE MISS HIT% +170843 170845 docker 12 2700 1200 55.56% +298670 298670 kworker/15:0 15 500 0 100.00% +170254 170254 kworker/11:1 11 2500 400 84.00% +1046952 1046953 git 0 2600 1100 57.69% +170843 170849 docker 15 1000 400 60.00% +1027373 1027382 node 8 3500 2500 28.57% +0 0 swapper/7 7 173000 4200 97.57% +1028217 1028217 node 14 15600 22400 0.00% +[...] +Total References: 7139900 Total Misses: 1413900 Hit Rate: 80.20% + +This shows each TID`s cache hit rate during the 10 seconds run period. 
USAGE message: @@ -54,3 +69,4 @@ positional arguments: -c SAMPLE_PERIOD, --sample_period SAMPLE_PERIOD Sample one in this many number of cache reference and miss events + -t, --tid Summarize cache references and misses by PID/TID diff --git a/tools/profile.py b/tools/profile.py index 47d2adf297fe..43afacc5fdba 100755 --- a/tools/profile.py +++ b/tools/profile.py @@ -335,21 +335,21 @@ def aksym(addr): # print folded stack output user_stack = list(user_stack) kernel_stack = list(kernel_stack) - line = [k.name] + line = [k.name.decode('utf-8', 'replace')] # if we failed to get the stack is, such as due to no space (-ENOMEM) or # hash collision (-EEXIST), we still print a placeholder for consistency if not args.kernel_stacks_only: if stack_id_err(k.user_stack_id): - line.append(b"[Missed User Stack]") + line.append("[Missed User Stack]") else: - line.extend([b.sym(addr, k.pid) for addr in reversed(user_stack)]) + line.extend([b.sym(addr, k.pid).decode('utf-8', 'replace') for addr in reversed(user_stack)]) if not args.user_stacks_only: - line.extend([b"-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else []) + line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else []) if stack_id_err(k.kernel_stack_id): - line.append(b"[Missed Kernel Stack]") + line.append("[Missed Kernel Stack]") else: - line.extend([aksym(addr) for addr in reversed(kernel_stack)]) - print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value)) + line.extend([aksym(addr).decode('utf-8', 'replace') for addr in reversed(kernel_stack)]) + print("%s %d" % (";".join(line), v.value)) else: # print default multi-line stack output if not args.user_stacks_only: @@ -357,7 +357,7 @@ def aksym(addr): print(" [Missed Kernel Stack]") else: for addr in kernel_stack: - print(" %s" % aksym(addr)) + print(" %s" % aksym(addr).decode('utf-8', 'replace')) if not args.kernel_stacks_only: if need_delimiter and k.user_stack_id >= 0 and 
k.kernel_stack_id >= 0: print(" --") diff --git a/tools/stackcount.py b/tools/stackcount.py index 8b7ca0087838..cea0e9e2785c 100755 --- a/tools/stackcount.py +++ b/tools/stackcount.py @@ -292,18 +292,18 @@ def _print_kframe(self, addr): if self.args.verbose: print("%-16x " % addr, end="") if self.args.offset: - print("%s" % self.probe.bpf.ksym(addr, show_offset=True)) + print("%s" % self.probe.bpf.ksym(addr, show_offset=True).decode()) else: - print("%s" % self.probe.bpf.ksym(addr)) + print("%s" % self.probe.bpf.ksym(addr).decode()) def _print_uframe(self, addr, pid): print(" ", end="") if self.args.verbose: print("%-16x " % addr, end="") if self.args.offset: - print("%s" % self.probe.bpf.sym(addr, pid, show_offset=True)) + print("%s" % self.probe.bpf.sym(addr, pid, show_offset=True).decode()) else: - print("%s" % self.probe.bpf.sym(addr, pid)) + print("%s" % self.probe.bpf.sym(addr, pid).decode()) @staticmethod def _signal_ignore(signal, frame):