Skip to content

Commit

Permalink
tcpretrans: extend for counting
Browse files Browse the repository at this point in the history
Feature to quickly discern heavily retransmitting flows and
therefore spotting culprit network paths.

Signed-off-by: Matthias Tafelmeier <[email protected]>
  • Loading branch information
cherusk committed Dec 14, 2017
1 parent e5db52b commit 1e9467f
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 28 deletions.
12 changes: 10 additions & 2 deletions man/man8/tcpretrans.8
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
.TH tcpaccept 8 "2016-02-14" "USER COMMANDS"
.SH NAME
tcpaccept \- Trace TCP retransmits and TLPs. Uses Linux eBPF/bcc.
tcpaccept \- Trace or count TCP retransmits and TLPs. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B tcpaccept [\-h] [\-l]
.SH DESCRIPTION
This traces TCP retransmits, showing address, port, and TCP state information,
and sometimes the PID (although usually not, since retransmits are usually
sent by the kernel on timeouts). To keep overhead very low, only
the TCP retransmit functions are traced. This does not trace every packet
(like tcpdump(8) or a packet sniffer).
(like tcpdump(8) or a packet sniffer). Optionally, it can count retransmits
over a user signalled interval to spot potentially dropping network paths the
flows are traversing.

This uses dynamic tracing of the kernel tcp_retransmit_skb() and
tcp_send_loss_probe() functions, and will need to be updated to
Expand All @@ -25,6 +27,9 @@ Print usage message.
\-l
Include tail loss probe attempts (in some cases the kernel may not
complete the TLP send).
.TP
\-c
Count occurring retransmits per flow.
.SH EXAMPLES
.TP
Trace TCP retransmits:
Expand Down Expand Up @@ -63,6 +68,9 @@ Remote port.
.TP
STATE
TCP session state.
.TP
RETRANSMITS
Accumulated occurred retransmits since start.
.SH OVERHEAD
Should be negligible: TCP retransmit events should be low (<1000/s), and the
low overhead this tool adds to each event should make the cost negligible.
Expand Down
144 changes: 118 additions & 26 deletions tools/tcpretrans.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 14-Feb-2016 Brendan Gregg Created this.
# 03-Nov-2017 Matthias Tafelmeier Extended this.

from __future__ import print_function
from bcc import BPF
Expand All @@ -21,6 +22,7 @@
from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack
import ctypes as ct
from time import sleep

# arguments
examples = """examples:
Expand All @@ -33,6 +35,8 @@
epilog=examples)
parser.add_argument("-l", "--lossprobe", action="store_true",
help="include tail loss probe attempts")
parser.add_argument("-c", "--count", action="store_true",
help="count occurred retransmits per flow")
args = parser.parse_args()

# define BPF program
Expand Down Expand Up @@ -70,6 +74,23 @@
};
BPF_PERF_OUTPUT(ipv6_events);
// separate flow keys per address family
struct ipv4_flow_key_t {
u32 saddr;
u32 daddr;
u16 lport;
u16 dport;
};
BPF_HASH(ipv4_count, struct ipv4_flow_key_t);
struct ipv6_flow_key_t {
unsigned __int128 saddr;
unsigned __int128 daddr;
u16 lport;
u16 dport;
};
BPF_HASH(ipv6_count, struct ipv6_flow_key_t);
static int trace_event(struct pt_regs *ctx, struct sock *skp, int type)
{
if (skp == NULL)
Expand All @@ -83,26 +104,11 @@
char state = skp->__sk_common.skc_state;
if (family == AF_INET) {
struct ipv4_data_t data4 = {.pid = pid, .ip = 4, .type = type};
data4.saddr = skp->__sk_common.skc_rcv_saddr;
data4.daddr = skp->__sk_common.skc_daddr;
// lport is host order
data4.lport = lport;
data4.dport = ntohs(dport);
data4.state = state;
ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
IPV4_INIT
IPV4_CORE
} else if (family == AF_INET6) {
struct ipv6_data_t data6 = {.pid = pid, .ip = 6, .type = type};
bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
// lport is host order
data6.lport = lport;
data6.dport = ntohs(dport);
data6.state = state;
ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
IPV6_INIT
IPV6_CORE
}
// else drop
Expand All @@ -122,6 +128,66 @@
}
"""

struct_init = { 'ipv4':
{ 'count' :
"""
struct ipv4_flow_key_t flow_key = {};
flow_key.saddr = skp->__sk_common.skc_rcv_saddr;
flow_key.daddr = skp->__sk_common.skc_daddr;
// lport is host order
flow_key.lport = lport;
flow_key.dport = ntohs(dport);""",
'trace' :
"""
struct ipv4_data_t data4 = {.pid = pid, .ip = 4, .type = type};
data4.saddr = skp->__sk_common.skc_rcv_saddr;
data4.daddr = skp->__sk_common.skc_daddr;
// lport is host order
data4.lport = lport;
data4.dport = ntohs(dport);
data4.state = state; """
},
'ipv6':
{ 'count' :
"""
struct ipv6_flow_key_t flow_key = {};
bpf_probe_read(&flow_key.saddr, sizeof(flow_key.saddr),
skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read(&flow_key.daddr, sizeof(flow_key.daddr),
skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
// lport is host order
flow_key.lport = lport;
flow_key.dport = ntohs(dport);""",
'trace' : """
struct ipv6_data_t data6 = {.pid = pid, .ip = 6, .type = type};
bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
// lport is host order
data6.lport = lport;
data6.dport = ntohs(dport);
data6.state = state;"""
}
}

count_core_base = """
u64 zero = 0, *val;
val = COUNT_STRUCT.lookup_or_init(&flow_key, &zero);
(*val)++;
"""

if args.count:
bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['count'])
bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['count'])
bpf_text = bpf_text.replace("IPV4_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv4_count'))
bpf_text = bpf_text.replace("IPV6_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv6_count'))
else:
bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['trace'])
bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['trace'])
bpf_text = bpf_text.replace("IPV4_CORE", "ipv4_events.perf_submit(ctx, &data4, sizeof(data4));")
bpf_text = bpf_text.replace("IPV6_CORE", "ipv6_events.perf_submit(ctx, &data6, sizeof(data6));")

# event data
class Data_ipv4(ct.Structure):
_fields_ = [
Expand Down Expand Up @@ -186,18 +252,44 @@ def print_ipv6_event(cpu, data, size):
"%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport),
tcpstate[event.state]))

def depict_cnt(counts_tab, l3prot='ipv4'):
for k, v in sorted(counts_tab.items(), key=lambda counts: counts[1].value):
depict_key = ""
ep_fmt = "[%s]#%d"
if l3prot == 'ipv4':
depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET, pack('I', k.saddr)), k.lport),
ep_fmt % (inet_ntop(AF_INET, pack('I', k.daddr)), k.dport))
else:
depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET6, k.saddr), k.lport),
ep_fmt % (inet_ntop(AF_INET6, k.daddr), k.dport))

print ("%s %10d" % (depict_key, v.value))

# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event="tcp_retransmit_skb", fn_name="trace_retransmit")
if args.lossprobe:
b.attach_kprobe(event="tcp_send_loss_probe", fn_name="trace_tlp")

# header
print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP",
"LADDR:LPORT", "T", "RADDR:RPORT", "STATE"))
print("Tracing retransmits ... Hit Ctrl-C to end")
if args.count:
try:
while 1:
sleep(99999999)
except BaseException:
pass

# header
print("\n%-25s %-25s %-10s" % (
"LADDR:LPORT", "RADDR:RPORT", "RETRANSMITS"))
depict_cnt(b.get_table("ipv4_count"))
depict_cnt(b.get_table("ipv6_count"), l3prot='ipv6')
# read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event)
b["ipv6_events"].open_perf_buffer(print_ipv6_event)
while 1:
b.kprobe_poll()
else:
# header
print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP",
"LADDR:LPORT", "T", "RADDR:RPORT", "STATE"))
b["ipv4_events"].open_perf_buffer(print_ipv4_event)
b["ipv6_events"].open_perf_buffer(print_ipv6_event)
while 1:
b.kprobe_poll()
15 changes: 15 additions & 0 deletions tools/tcpretrans_example.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE
See the "L>" in the "T>" column. These are attempts: the kernel probably
sent a TLP, but in some cases it might not have been ultimately sent.

To spot heavily retransmitting flows quickly one can use the -c flag. It will
count occurring retransmits per flow.

# ./tcpretrans.py -c
Tracing retransmits ... Hit Ctrl-C to end
^C
LADDR:LPORT RADDR:RPORT RETRANSMITS
192.168.10.50:60366 <-> 172.217.21.194:443 700
192.168.10.50:666 <-> 172.213.11.195:443 345
192.168.10.50:366 <-> 172.212.22.194:443 211
[...]

This can ease to quickly isolate congested or otherwise awry network paths
responsible for clamping tcp performance.

USAGE message:

Expand All @@ -55,6 +69,7 @@ Trace TCP retransmits
optional arguments:
-h, --help show this help message and exit
-l, --lossprobe include tail loss probe attempts
-c, --count count occurred retransmits per flow

examples:
./tcpretrans # trace TCP retransmits
Expand Down

0 comments on commit 1e9467f

Please sign in to comment.