Skip to content

Commit

Permalink
tools/tcpretrans: add optional tcp seq output
Browse files Browse the repository at this point in the history
This commit adds the ability to print out tcp sequence numbers while
running the tool in normal mode by reading the appropriate fields from
skb. skb is not readily available for TLP, thus the output for that mode
is set to 0.

Signed-off-by: Michael Gugino <[email protected]>
  • Loading branch information
michaelgugino authored and yonghong-song committed Sep 2, 2021
1 parent b00e6b4 commit 7abd77a
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 21 deletions.
10 changes: 8 additions & 2 deletions man/man8/tcpretrans.8
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
.SH NAME
tcpretrans \- Trace or count TCP retransmits and TLPs. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B tcpretrans [\-h] [\-l] [\-c] [\-4 | \-6]
.B tcpretrans [\-h] [\-s] [\-l] [\-c] [\-4 | \-6]
.SH DESCRIPTION
This traces TCP retransmits, showing address, port, and TCP state information,
and sometimes the PID (although usually not, since retransmits are usually
sent by the kernel on timeouts). To keep overhead very low, only
the TCP retransmit functions are traced. This does not trace every packet
(like tcpdump(8) or a packet sniffer). Optionally, it can count retransmits
over a user signalled interval to spot potentially dropping network paths the
flows are traversing.
flows are traversing.

This uses dynamic tracing of the kernel tcp_retransmit_skb() and
tcp_send_loss_probe() functions, and will need to be updated to
Expand All @@ -24,6 +24,9 @@ CONFIG_BPF and bcc.
\-h
Print usage message.
.TP
\-s
Display TCP sequence numbers.
.TP
\-l
Include tail loss probe attempts (in some cases the kernel may not
complete the TLP send).
Expand Down Expand Up @@ -83,6 +86,9 @@ Remote port.
STATE
TCP session state.
.TP
SEQ
TCP sequence number of the retransmitted segment (shown only with \-s; always 0
for tail loss probes).
.TP
RETRANSMITS
Number of retransmits accumulated since start.
.SH OVERHEAD
Expand Down
72 changes: 57 additions & 15 deletions tools/tcpretrans.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
description="Trace TCP retransmits",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-s", "--sequence", action="store_true",
help="display TCP sequence numbers")
parser.add_argument("-l", "--lossprobe", action="store_true",
help="include tail loss probe attempts")
parser.add_argument("-c", "--count", action="store_true",
Expand All @@ -52,6 +54,7 @@
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <bcc/proto.h>
#define RETRANSMIT 1
Expand All @@ -61,6 +64,7 @@
struct ipv4_data_t {
u32 pid;
u64 ip;
u32 seq;
u32 saddr;
u32 daddr;
u16 lport;
Expand All @@ -72,6 +76,7 @@
struct ipv6_data_t {
u32 pid;
u32 seq;
u64 ip;
unsigned __int128 saddr;
unsigned __int128 daddr;
Expand Down Expand Up @@ -101,8 +106,11 @@
"""

bpf_text_kprobe = """
static int trace_event(struct pt_regs *ctx, struct sock *skp, int type)
static int trace_event(struct pt_regs *ctx, struct sock *skp, struct sk_buff *skb, int type)
{
struct tcp_skb_cb *tcb;
u32 seq;
if (skp == NULL)
return 0;
u32 pid = bpf_get_current_pid_tgid() >> 32;
Expand All @@ -113,8 +121,15 @@
u16 dport = skp->__sk_common.skc_dport;
char state = skp->__sk_common.skc_state;
seq = 0;
if (skb) {
/* macro TCP_SKB_CB from net/tcp.h */
tcb = ((struct tcp_skb_cb *)&((skb)->cb[0]));
seq = tcb->seq;
}
FILTER_FAMILY
if (family == AF_INET) {
IPV4_INIT
IPV4_CORE
Expand All @@ -129,33 +144,44 @@
"""

bpf_text_kprobe_retransmit = """
int trace_retransmit(struct pt_regs *ctx, struct sock *sk)
int trace_retransmit(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb)
{
trace_event(ctx, sk, RETRANSMIT);
trace_event(ctx, sk, skb, RETRANSMIT);
return 0;
}
"""

bpf_text_kprobe_tlp = """
int trace_tlp(struct pt_regs *ctx, struct sock *sk)
{
trace_event(ctx, sk, TLP);
trace_event(ctx, sk, NULL, TLP);
return 0;
}
"""

bpf_text_tracepoint = """
TRACEPOINT_PROBE(tcp, tcp_retransmit_skb)
{
struct tcp_skb_cb *tcb;
u32 seq;
u32 pid = bpf_get_current_pid_tgid() >> 32;
const struct sock *skp = (const struct sock *)args->skaddr;
const struct sk_buff *skb = (const struct sk_buff *)args->skbaddr;
u16 lport = args->sport;
u16 dport = args->dport;
char state = skp->__sk_common.skc_state;
u16 family = skp->__sk_common.skc_family;
FILTER_FAMILY
seq = 0;
if (skb) {
/* macro TCP_SKB_CB from net/tcp.h */
tcb = ((struct tcp_skb_cb *)&((skb)->cb[0]));
seq = tcb->seq;
}
FILTER_FAMILY
if (family == AF_INET) {
IPV4_CODE
} else if (family == AF_INET6) {
Expand All @@ -179,6 +205,7 @@
struct ipv4_data_t data4 = {};
data4.pid = pid;
data4.ip = 4;
data4.seq = seq;
data4.type = type;
data4.saddr = skp->__sk_common.skc_rcv_saddr;
data4.daddr = skp->__sk_common.skc_daddr;
Expand All @@ -202,6 +229,7 @@
struct ipv6_data_t data6 = {};
data6.pid = pid;
data6.ip = 6;
data6.seq = seq;
data6.type = type;
bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
Expand Down Expand Up @@ -230,6 +258,7 @@
data4.dport = dport;
data4.type = RETRANSMIT;
data4.ip = 4;
data4.seq = seq;
data4.state = state;
__builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
__builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
Expand All @@ -252,6 +281,7 @@
data6.dport = dport;
data6.type = RETRANSMIT;
data6.ip = 6;
data6.seq = seq;
data6.state = state;
__builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
__builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
Expand Down Expand Up @@ -325,21 +355,29 @@
# process event
def print_ipv4_event(cpu, data, size):
event = b["ipv4_events"].event(data)
print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % (
print("%-8s %-6d %-2d %-20s %1s> %-20s" % (
strftime("%H:%M:%S"), event.pid, event.ip,
"%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.lport),
type[event.type],
"%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport),
tcpstate[event.state]))
"%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport)),
end='')
if args.sequence:
print(" %-12s %s" % (tcpstate[event.state], event.seq))
else:
print(" %s" % (tcpstate[event.state]))

def print_ipv6_event(cpu, data, size):
event = b["ipv6_events"].event(data)
print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % (
print("%-8s %-6d %-2d %-20s %1s> %-20s" % (
strftime("%H:%M:%S"), event.pid, event.ip,
"%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.lport),
type[event.type],
"%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport),
tcpstate[event.state]))
"%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport)),
end='')
if args.sequence:
print(" %-12s %s" % (tcpstate[event.state], event.seq))
else:
print(" %s" % (tcpstate[event.state]))

def depict_cnt(counts_tab, l3prot='ipv4'):
for k, v in sorted(counts_tab.items(), key=lambda counts: counts[1].value):
Expand Down Expand Up @@ -377,8 +415,12 @@ def depict_cnt(counts_tab, l3prot='ipv4'):
# read events
else:
# header
print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP",
"LADDR:LPORT", "T", "RADDR:RPORT", "STATE"))
print("%-8s %-6s %-2s %-20s %1s> %-20s" % ("TIME", "PID", "IP",
"LADDR:LPORT", "T", "RADDR:RPORT"), end='')
if args.sequence:
print(" %-12s %-10s" % ("STATE", "SEQ"))
else:
print(" %-4s" % ("STATE"))
b["ipv4_events"].open_perf_buffer(print_ipv4_event)
b["ipv6_events"].open_perf_buffer(print_ipv6_event)
while 1:
Expand Down
18 changes: 14 additions & 4 deletions tools/tcpretrans_example.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Demonstrations of tcpretrans, the Linux eBPF/bcc version.
This tool traces the kernel TCP retransmit function to show details of these
retransmits. For example:

# ./tcpretrans
# ./tcpretrans
TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE
01:55:05 0 4 10.153.223.157:22 R> 69.53.245.40:34619 ESTABLISHED
01:55:05 0 4 10.153.223.157:22 R> 69.53.245.40:34619 ESTABLISHED
Expand Down Expand Up @@ -45,29 +45,39 @@ See the "L>" in the "T>" column. These are attempts: the kernel probably
sent a TLP, but in some cases it might not have been ultimately sent.

To spot heavily retransmitting flows quickly one can use the -c flag. It will
count occurring retransmits per flow.
count occurring retransmits per flow.

# ./tcpretrans.py -c
Tracing retransmits ... Hit Ctrl-C to end
^C
LADDR:LPORT RADDR:RPORT RETRANSMITS
192.168.10.50:60366 <-> 172.217.21.194:443 700
192.168.10.50:666 <-> 172.213.11.195:443 345
192.168.10.50:666 <-> 172.213.11.195:443 345
192.168.10.50:366 <-> 172.212.22.194:443 211
[...]

This makes it easier to quickly isolate congested or otherwise awry network
paths responsible for clamping TCP performance.

TCP sequence numbers can be included via -s, except in count mode. These numbers
are useful for identifying specific retransmissions in large packet captures.
Note that when -l is also used, tail loss probe events (type L) display 0 for
the sequence number.

# ./tcpretrans.py -s
TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE SEQ
18:03:46 0 4 192.168.10.50:41976 R> 172.217.21.194:443 SYN_SENT 2879306108
18:03:49 0 4 192.168.10.50:41976 R> 172.217.21.194:443 SYN_SENT 2879306108

USAGE message:

# ./tcpretrans -h
usage: tcpretrans [-h] [-l] [-4 | -6]
usage: tcpretrans.py [-h] [-s] [-l] [-c] [-4 | -6]

Trace TCP retransmits

optional arguments:
-h, --help show this help message and exit
-s, --sequence display TCP sequence numbers
-l, --lossprobe include tail loss probe attempts
-c, --count count occurred retransmits per flow
-4, --ipv4 trace IPv4 family only
Expand Down

0 comments on commit 7abd77a

Please sign in to comment.