diff --git a/man/man8/tcpretrans.8 b/man/man8/tcpretrans.8 index ea00ac7fa483..0b643d1180c6 100644 --- a/man/man8/tcpretrans.8 +++ b/man/man8/tcpretrans.8 @@ -2,7 +2,7 @@ .SH NAME tcpretrans \- Trace or count TCP retransmits and TLPs. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B tcpretrans [\-h] [\-l] [\-c] [\-4 | \-6] +.B tcpretrans [\-h] [\-s] [\-l] [\-c] [\-4 | \-6] .SH DESCRIPTION This traces TCP retransmits, showing address, port, and TCP state information, and sometimes the PID (although usually not, since retransmits are usually @@ -10,7 +10,7 @@ sent by the kernel on timeouts). To keep overhead very low, only the TCP retransmit functions are traced. This does not trace every packet (like tcpdump(8) or a packet sniffer). Optionally, it can count retransmits over a user signalled interval to spot potentially dropping network paths the -flows are traversing. +flows are traversing. This uses dynamic tracing of the kernel tcp_retransmit_skb() and tcp_send_loss_probe() functions, and will need to be updated to @@ -24,6 +24,9 @@ CONFIG_BPF and bcc. \-h Print usage message. .TP +\-s +Display TCP sequence numbers. +.TP \-l Include tail loss probe attempts (in some cases the kernel may not complete the TLP send). @@ -83,6 +86,9 @@ Remote port. STATE TCP session state. .TP +SEQ +TCP sequence. +.TP RETRANSMITS Accumulated occurred retransmits since start. .SH OVERHEAD diff --git a/tools/tcpretrans.py b/tools/tcpretrans.py index 8e6247f7bc8d..79b481bbebe8 100755 --- a/tools/tcpretrans.py +++ b/tools/tcpretrans.py @@ -34,6 +34,8 @@ description="Trace TCP retransmits", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) +parser.add_argument("-s", "--sequence", action="store_true", + help="display TCP sequence numbers") parser.add_argument("-l", "--lossprobe", action="store_true", help="include tail loss probe attempts") parser.add_argument("-c", "--count", action="store_true", @@ -52,6 +54,7 @@ bpf_text = """ #include #include +#include #include #define RETRANSMIT 1 @@ -61,6 +64,7 @@ struct ipv4_data_t { u32 pid; u64 ip; + u32 seq; u32 saddr; u32 daddr; u16 lport; @@ -72,6 +76,7 @@ struct ipv6_data_t { u32 pid; + u32 seq; u64 ip; unsigned __int128 saddr; unsigned __int128 daddr; @@ -101,8 +106,11 @@ """ bpf_text_kprobe = """ -static int trace_event(struct pt_regs *ctx, struct sock *skp, int type) +static int trace_event(struct pt_regs *ctx, struct sock *skp, struct sk_buff *skb, int type) { + struct tcp_skb_cb *tcb; + u32 seq; + if (skp == NULL) return 0; u32 pid = bpf_get_current_pid_tgid() >> 32; @@ -113,8 +121,15 @@ u16 dport = skp->__sk_common.skc_dport; char state = skp->__sk_common.skc_state; + seq = 0; + if (skb) { + /* macro TCP_SKB_CB from net/tcp.h */ + tcb = ((struct tcp_skb_cb *)&((skb)->cb[0])); + seq = tcb->seq; + } + FILTER_FAMILY - + if (family == AF_INET) { IPV4_INIT IPV4_CORE @@ -129,9 +144,9 @@ """ bpf_text_kprobe_retransmit = """ -int trace_retransmit(struct pt_regs *ctx, struct sock *sk) +int trace_retransmit(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) { - trace_event(ctx, sk, RETRANSMIT); + trace_event(ctx, sk, skb, RETRANSMIT); return 0; } """ @@ -139,7 +154,7 @@ bpf_text_kprobe_tlp = """ int trace_tlp(struct pt_regs *ctx, struct sock *sk) { - trace_event(ctx, sk, TLP); + trace_event(ctx, sk, NULL, TLP); return 0; } """ @@ -147,15 +162,26 @@ bpf_text_tracepoint = """ TRACEPOINT_PROBE(tcp, tcp_retransmit_skb) { + struct tcp_skb_cb *tcb; + u32 seq; + u32 pid = bpf_get_current_pid_tgid() >> 32; const struct sock *skp = (const struct sock *)args->skaddr; + const struct sk_buff *skb = (const struct sk_buff *)args->skbaddr; u16 lport = args->sport; u16 dport = args->dport; char state = skp->__sk_common.skc_state; u16 family = skp->__sk_common.skc_family; - FILTER_FAMILY - + seq = 0; + if (skb) { + /* macro TCP_SKB_CB from net/tcp.h */ + tcb = ((struct tcp_skb_cb *)&((skb)->cb[0])); + seq = tcb->seq; + } + + FILTER_FAMILY + if (family == AF_INET) { IPV4_CODE } else if (family == AF_INET6) { @@ -179,6 +205,7 @@ struct ipv4_data_t data4 = {}; data4.pid = pid; data4.ip = 4; + data4.seq = seq; data4.type = type; data4.saddr = skp->__sk_common.skc_rcv_saddr; data4.daddr = skp->__sk_common.skc_daddr; @@ -202,6 +229,7 @@ struct ipv6_data_t data6 = {}; data6.pid = pid; data6.ip = 6; + data6.seq = seq; data6.type = type; bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr), skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); @@ -230,6 +258,7 @@ data4.dport = dport; data4.type = RETRANSMIT; data4.ip = 4; + data4.seq = seq; data4.state = state; __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr)); __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr)); @@ -252,6 +281,7 @@ data6.dport = dport; data6.type = RETRANSMIT; data6.ip = 6; + data6.seq = seq; data6.state = state; __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr)); __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr)); @@ -325,21 +355,29 @@ # process event def print_ipv4_event(cpu, data, size): event = b["ipv4_events"].event(data) - print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % ( + print("%-8s %-6d %-2d %-20s %1s> %-20s" % ( strftime("%H:%M:%S"), event.pid, event.ip, "%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.lport), type[event.type], - "%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport), - tcpstate[event.state])) + "%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport)), + end='') + if args.sequence: + print(" %-12s %s" % (tcpstate[event.state], event.seq)) + else: + print(" %s" % (tcpstate[event.state])) def print_ipv6_event(cpu, data, size): event = b["ipv6_events"].event(data) - print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % ( + print("%-8s %-6d %-2d %-20s %1s> %-20s" % ( strftime("%H:%M:%S"), event.pid, event.ip, "%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.lport), type[event.type], - "%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport), - tcpstate[event.state])) + "%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport)), + end='') + if args.sequence: + print(" %-12s %s" % (tcpstate[event.state], event.seq)) + else: + print(" %s" % (tcpstate[event.state])) def depict_cnt(counts_tab, l3prot='ipv4'): for k, v in sorted(counts_tab.items(), key=lambda counts: counts[1].value): @@ -377,8 +415,12 @@ def depict_cnt(counts_tab, l3prot='ipv4'): # read events else: # header - print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP", - "LADDR:LPORT", "T", "RADDR:RPORT", "STATE")) + print("%-8s %-6s %-2s %-20s %1s> %-20s" % ("TIME", "PID", "IP", + "LADDR:LPORT", "T", "RADDR:RPORT"), end='') + if args.sequence: + print(" %-12s %-10s" % ("STATE", "SEQ")) + else: + print(" %-4s" % ("STATE")) b["ipv4_events"].open_perf_buffer(print_ipv4_event) b["ipv6_events"].open_perf_buffer(print_ipv6_event) while 1: diff --git a/tools/tcpretrans_example.txt b/tools/tcpretrans_example.txt index 77fc12f8d2a7..aa698b9c71dc 100644 --- a/tools/tcpretrans_example.txt +++ b/tools/tcpretrans_example.txt @@ -4,7 +4,7 @@ Demonstrations of tcpretrans, the Linux eBPF/bcc version. This tool traces the kernel TCP retransmit function to show details of these retransmits. For example: -# ./tcpretrans +# ./tcpretrans TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE 01:55:05 0 4 10.153.223.157:22 R> 69.53.245.40:34619 ESTABLISHED 01:55:05 0 4 10.153.223.157:22 R> 69.53.245.40:34619 ESTABLISHED @@ -45,29 +45,39 @@ See the "L>" in the "T>" column. These are attempts: the kernel probably sent a TLP, but in some cases it might not have been ultimately sent. To spot heavily retransmitting flows quickly one can use the -c flag. It will -count occurring retransmits per flow. +count occurring retransmits per flow. # ./tcpretrans.py -c Tracing retransmits ... Hit Ctrl-C to end ^C LADDR:LPORT RADDR:RPORT RETRANSMITS 192.168.10.50:60366 <-> 172.217.21.194:443 700 -192.168.10.50:666 <-> 172.213.11.195:443 345 +192.168.10.50:666 <-> 172.213.11.195:443 345 192.168.10.50:366 <-> 172.212.22.194:443 211 [...] This can ease to quickly isolate congested or otherwise awry network paths responsible for clamping tcp performance. +TCP sequence numbers can be included via -s, except in count mode. These numbers +are useful for identifying specific retransmissions in large packet caputes. +Note, lossprobe -l output will display 0 for the sequence number for L type. + +# ./tcpretrans.py -s +TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE SEQ +18:03:46 0 4 192.168.10.50:41976 R> 172.217.21.194:443 SYN_SENT 2879306108 +18:03:49 0 4 192.168.10.50:41976 R> 172.217.21.194:443 SYN_SENT 2879306108 + USAGE message: # ./tcpretrans -h -usage: tcpretrans [-h] [-l] [-4 | -6] +usage: tcpretrans.py [-h] [-s] [-l] [-c] [-4 | -6] Trace TCP retransmits optional arguments: -h, --help show this help message and exit + -s, --sequence display TCP sequence numbers -l, --lossprobe include tail loss probe attempts -c, --count count occurred retransmits per flow -4, --ipv4 trace IPv4 family only