From bbd9acd1d919549ba3aac20cbcea0a820494610f Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Tue, 20 Mar 2018 18:35:12 -0700 Subject: [PATCH] add the tcpstates tool --- README.md | 1 + man/man8/tcpstates.8 | 128 ++++++++++++++ tools/tcpstates.py | 333 ++++++++++++++++++++++++++++++++++++ tools/tcpstates_example.txt | 52 ++++++ 4 files changed, 514 insertions(+) create mode 100644 man/man8/tcpstates.8 create mode 100755 tools/tcpstates.py create mode 100644 tools/tcpstates_example.txt diff --git a/README.md b/README.md index 0a29feae0412..ed89d0327eec 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,7 @@ pair of .c and .py files, and some are directories of files. - tools/[tcpconnlat](tools/tcpconnlat.py): Trace TCP active connection latency (connect()). [Examples](tools/tcpconnlat_example.txt). - tools/[tcplife](tools/tcplife.py): Trace TCP sessions and summarize lifespan. [Examples](tools/tcplife_example.txt). - tools/[tcpretrans](tools/tcpretrans.py): Trace TCP retransmits and TLPs. [Examples](tools/tcpretrans_example.txt). +- tools/[tcpstates](tools/tcpstates.py): Trace TCP session state changes with durations. [Examples](tools/tcpstates_example.txt). - tools/[tcpsubnet](tools/tcpsubnet.py): Summarize and aggregate TCP send by subnet. [Examples](tools/tcpsubnet_example.txt). - tools/[tcptop](tools/tcptop.py): Summarize TCP send/recv throughput by host. Top for TCP. [Examples](tools/tcptop_example.txt). - tools/[tcptracer](tools/tcptracer.py): Trace TCP established connections (connect(), accept(), close()). [Examples](tools/tcptracer_example.txt). diff --git a/man/man8/tcpstates.8 b/man/man8/tcpstates.8 new file mode 100644 index 000000000000..b31fd649b178 --- /dev/null +++ b/man/man8/tcpstates.8 @@ -0,0 +1,128 @@ +.TH tcpstates 8 "2018-03-20" "USER COMMANDS" +.SH NAME +tcpstates \- Trace TCP session state changes with durations. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B tcpstates [\-h] [\-T] [\-t] [\-w] [\-s] [\-D PORTS] [\-L PORTS] +.SH DESCRIPTION +This tool traces TCP session state changes while tracing, and prints details +including the duration in each state. This can help explain the latency of +TCP connections: whether the time is spent in the ESTABLISHED state (data +transfer), or initialization state (SYN_SENT), etc. + +This tool works using the sock:inet_sock_set_state tracepoint, which was +added to Linux 4.16. Linux 4.16 also included extra state transitions so that +all TCP transitions could be observed by this tracepoint. + +Only TCP state changes are traced, so it is expected that the +overhead of this tool is much lower than typical send/receive tracing. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc, and the sock:inet_sock_set_state tracepoint. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-s +Comma separated values output (parseable). +.TP +\-t +Include a timestamp column (seconds). +.TP +\-T +Include a time column (HH:MM:SS). +.TP +\-w +Wide column output (fits IPv6 addresses). +.TP +\-L PORTS +Comma-separated list of local ports to trace (filtered in-kernel). +.TP +\-D PORTS +Comma-separated list of destination ports to trace (filtered in-kernel). +.SH EXAMPLES +.TP +Trace all TCP sessions, and show all state changes: +# +.B tcpstates +.TP +Include a timestamp column, and wide column output: +# +.B tcpstates \-tw +.TP +Trace connections to local ports 80 and 81 only: +# +.B tcpstates \-L 80,81 +.TP +Trace connections to remote port 80 only: +# +.B tcpstates \-D 80 +.SH FIELDS +.TP +TIME +Time of the change, in HH:MM:SS format. +.TP +TIME(s) +Time of the change, in seconds. +.TP +C-PID +The current on-CPU process ID. This may show the process that owns the TCP +session if the state change executes in synchronous process context, else it +is likely to show the kernel (asynchronous state change). +.TP +C-COMM +The current on-CPU process name. This may show the process that owns the TCP +session if the state change executes in synchronous process context, else it +is likely to show the kernel (asynchronous state change). +.TP +IP +IP address family (4 or 6) +.TP +LADDR +Local IP address. +.TP +DADDR +Remote IP address. +.TP +LPORT +Local port. +.TP +DPORT +Destination port. +.TP +OLDSTATE +Previous TCP state. +.TP +NEWSTATE +New TCP state. +.TP +MS +Duration of this state. +.SH OVERHEAD +This traces the kernel TCP set state function, which should be called much +less often than send/receive tracing, and therefore have lower overhead. The +overhead of the tool is relative to the rate of new TCP sessions: if this is +high, over 10,000 per second, then there may be noticeable overhead just to +print out 10k lines of formatted output per second. + +You can find out the rate of new TCP sessions using "sar \-n TCP 1", and +adding the active/s and passive/s columns. + +As always, test and understand this tools overhead for your types of +workloads before production use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +tcpaccept(8), tcpconnect(8), tcptop(8), tcplife(8) diff --git a/tools/tcpstates.py b/tools/tcpstates.py new file mode 100755 index 000000000000..0c48f51f87c2 --- /dev/null +++ b/tools/tcpstates.py @@ -0,0 +1,333 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# tcpstates Trace the TCP session state changes with durations. +# For Linux, uses BCC, BPF. Embedded C. +# +# USAGE: tcpstates [-h] [-C] [-S] [interval [count]] +# +# This uses the sock:inet_sock_set_state tracepoint, added to Linux 4.16. +# Linux 4.16 also adds more state transitions so that they can be traced. +# +# Copyright 2018 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 20-Mar-2018 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +import argparse +from socket import inet_ntop, AF_INET, AF_INET6 +from struct import pack +import ctypes as ct +from time import strftime + +# arguments +examples = """examples: + ./tcpstates # trace all TCP state changes + ./tcpstates -t # include timestamp column + ./tcpstates -T # include time column (HH:MM:SS) + ./tcpstates -w # wider colums (fit IPv6) + ./tcpstates -stT # csv output, with times & timestamps + ./tcpstates -L 80 # only trace local port 80 + ./tcpstates -L 80,81 # only trace local ports 80 and 81 + ./tcpstates -D 80 # only trace remote port 80 +""" +parser = argparse.ArgumentParser( + description="Trace TCP session state changes and durations", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--time", action="store_true", + help="include time column on output (HH:MM:SS)") +parser.add_argument("-t", "--timestamp", action="store_true", + help="include timestamp on output (seconds)") +parser.add_argument("-w", "--wide", action="store_true", + help="wide column output (fits IPv6 addresses)") +parser.add_argument("-s", "--csv", action="store_true", + help="comma separated values output") +parser.add_argument("-L", "--localport", + help="comma-separated list of local ports to trace.") +parser.add_argument("-D", "--remoteport", + help="comma-separated list of remote ports to trace.") +parser.add_argument("--ebpf", action="store_true", + help=argparse.SUPPRESS) +args = parser.parse_args() +debug = 0 + +# define BPF program +bpf_text = """ +#include +#define KBUILD_MODNAME "foo" +#include +#include +#include + +BPF_HASH(last, struct sock *, u64); + +// separate data structs for ipv4 and ipv6 +struct ipv4_data_t { + // XXX: switch some to u32's when supported + u64 ts_us; + u64 skaddr; + u64 pid; + u64 saddr; + u64 daddr; + u64 ports; + u64 span_us; + u64 oldstate; + u64 newstate; + char task[TASK_COMM_LEN]; +}; +BPF_PERF_OUTPUT(ipv4_events); + +struct ipv6_data_t { + u64 ts_us; + u64 skaddr; + u64 pid; + unsigned __int128 saddr; + unsigned __int128 daddr; + u64 ports; + u64 span_us; + u64 oldstate; + u64 newstate; + char task[TASK_COMM_LEN]; +}; +BPF_PERF_OUTPUT(ipv6_events); + +struct id_t { + u32 pid; + char task[TASK_COMM_LEN]; +}; + +TRACEPOINT_PROBE(sock, inet_sock_set_state) +{ + if (args->protocol != IPPROTO_TCP) + return 0; + + u32 pid = bpf_get_current_pid_tgid() >> 32; + // sk is used as a UUID + struct sock *sk = (struct sock *)args->skaddr; + + // lport is either used in a filter here, or later + u16 lport = args->sport; + FILTER_LPORT + + // dport is either used in a filter here, or later + u16 dport = args->dport; + FILTER_DPORT + + // calculate delta + u64 *tsp, delta_us; + tsp = last.lookup(&sk); + if (tsp == 0) + delta_us = 0; + else + delta_us = (bpf_ktime_get_ns() - *tsp) / 1000; + + if (args->family == AF_INET) { + struct ipv4_data_t data4 = { + .span_us = delta_us, + .oldstate = args->oldstate, .newstate = args->newstate}; + data4.skaddr = (u64)args->skaddr; + data4.ts_us = bpf_ktime_get_ns() / 1000; + bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr); + bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr); + // a workaround until data4 compiles with separate lport/dport + data4.ports = dport + ((0ULL + lport) << 32); + data4.pid = pid; + + bpf_get_current_comm(&data4.task, sizeof(data4.task)); + ipv4_events.perf_submit(args, &data4, sizeof(data4)); + + } else /* 6 */ { + struct ipv6_data_t data6 = { + .span_us = delta_us, + .oldstate = args->oldstate, .newstate = args->newstate}; + data6.skaddr = (u64)args->skaddr; + data6.ts_us = bpf_ktime_get_ns() / 1000; + bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6); + bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6); + // a workaround until data6 compiles with separate lport/dport + data6.ports = dport + ((0ULL + lport) << 32); + data6.pid = pid; + bpf_get_current_comm(&data6.task, sizeof(data6.task)); + ipv6_events.perf_submit(args, &data6, sizeof(data6)); + } + + u64 ts = bpf_ktime_get_ns(); + last.update(&sk, &ts); + + return 0; +} +""" + +if (not BPF.tracepoint_exists("sock", "inet_sock_set_state")): + print("ERROR: tracepoint sock:inet_sock_set_state missing " + "(added in Linux 4.16). Exiting") + exit() + +# code substitutions +if args.remoteport: + dports = [int(dport) for dport in args.remoteport.split(',')] + dports_if = ' && '.join(['dport != %d' % dport for dport in dports]) + bpf_text = bpf_text.replace('FILTER_DPORT', + 'if (%s) { last.delete(&sk); return 0; }' % dports_if) +if args.localport: + lports = [int(lport) for lport in args.localport.split(',')] + lports_if = ' && '.join(['lport != %d' % lport for lport in lports]) + bpf_text = bpf_text.replace('FILTER_LPORT', + 'if (%s) { last.delete(&sk); return 0; }' % lports_if) +bpf_text = bpf_text.replace('FILTER_DPORT', '') +bpf_text = bpf_text.replace('FILTER_LPORT', '') + +if debug or args.ebpf: + print(bpf_text) + if args.ebpf: + exit() + +# event data +TASK_COMM_LEN = 16 # linux/sched.h + +class Data_ipv4(ct.Structure): + _fields_ = [ + ("ts_us", ct.c_ulonglong), + ("skaddr", ct.c_ulonglong), + ("pid", ct.c_ulonglong), + ("saddr", ct.c_ulonglong), + ("daddr", ct.c_ulonglong), + ("ports", ct.c_ulonglong), + ("span_us", ct.c_ulonglong), + ("oldstate", ct.c_ulonglong), + ("newstate", ct.c_ulonglong), + ("task", ct.c_char * TASK_COMM_LEN) + ] + +class Data_ipv6(ct.Structure): + _fields_ = [ + ("ts_us", ct.c_ulonglong), + ("skaddr", ct.c_ulonglong), + ("pid", ct.c_ulonglong), + ("saddr", (ct.c_ulonglong * 2)), + ("daddr", (ct.c_ulonglong * 2)), + ("ports", ct.c_ulonglong), + ("span_us", ct.c_ulonglong), + ("oldstate", ct.c_ulonglong), + ("newstate", ct.c_ulonglong), + ("task", ct.c_char * TASK_COMM_LEN) + ] + +# +# Setup output formats +# +# Don't change the default output (next 2 lines): this fits in 80 chars. I +# know it doesn't have NS or UIDs etc. I know. If you really, really, really +# need to add columns, columns that solve real actual problems, I'd start by +# adding an extended mode (-x) to included those columns. +# +header_string = "%-16s %-5s %-10.10s %s%-15s %-5s %-15s %-5s %-11s -> %-11s %s" +format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " + + "-> %-11s %.3f") +if args.wide: + header_string = ("%-16s %-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %-11s " + + "-> %-11s %s") + format_string = ("%-16x %-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %-11s " + + "-> %-11s %.3f") +if args.csv: + header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s" + format_string = "%x,%d,%s,%s,%s,%s,%s,%d,%s,%s,%.3f" + +def tcpstate2str(state): + # from include/net/tcp_states.h: + tcpstate = { + 1: "ESTABLISHED", + 2: "SYN_SENT", + 3: "SYN_RECV", + 4: "FIN_WAIT1", + 5: "FIN_WAIT2", + 6: "TIME_WAIT", + 7: "CLOSE", + 8: "CLOSE_WAIT", + 9: "LAST_ACK", + 10: "LISTEN", + 11: "CLOSING", + 12: "NEW_SYN_RECV", + } + + if state in tcpstate: + return tcpstate[state] + else: + return str(state) + +# process event +def print_ipv4_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + global start_ts + if args.time: + if args.csv: + print("%s," % strftime("%H:%M:%S"), end="") + else: + print("%-8s " % strftime("%H:%M:%S"), end="") + if args.timestamp: + if start_ts == 0: + start_ts = event.ts_us + delta_s = (float(event.ts_us) - start_ts) / 1000000 + if args.csv: + print("%.6f," % delta_s, end="") + else: + print("%-9.6f " % delta_s, end="") + print(format_string % (event.skaddr, event.pid, event.task.decode(), + "4" if args.wide or args.csv else "", + inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32, + inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff, + tcpstate2str(event.oldstate), tcpstate2str(event.newstate), + float(event.span_us) / 1000)) + +def print_ipv6_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + global start_ts + if args.time: + if args.csv: + print("%s," % strftime("%H:%M:%S"), end="") + else: + print("%-8s " % strftime("%H:%M:%S"), end="") + if args.timestamp: + if start_ts == 0: + start_ts = event.ts_us + delta_s = (float(event.ts_us) - start_ts) / 1000000 + if args.csv: + print("%.6f," % delta_s, end="") + else: + print("%-9.6f " % delta_s, end="") + print(format_string % (event.skaddr, event.pid, event.task.decode(), + "6" if args.wide or args.csv else "", + inet_ntop(AF_INET6, event.saddr), event.ports >> 32, + inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff, + tcpstate2str(event.oldstate), tcpstate2str(event.newstate), + float(event.span_us) / 1000)) + +# initialize BPF +b = BPF(text=bpf_text) + +# header +if args.time: + if args.csv: + print("%s," % ("TIME"), end="") + else: + print("%-8s " % ("TIME"), end="") +if args.timestamp: + if args.csv: + print("%s," % ("TIME(s)"), end="") + else: + print("%-9s " % ("TIME(s)"), end="") +print(header_string % ("SKADDR", "C-PID", "C-COMM", + "IP" if args.wide or args.csv else "", + "LADDR", "LPORT", "RADDR", "RPORT", + "OLDSTATE", "NEWSTATE", "MS")) + +start_ts = 0 + +# read events +b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64) +b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64) +while 1: + b.perf_buffer_poll() diff --git a/tools/tcpstates_example.txt b/tools/tcpstates_example.txt new file mode 100644 index 000000000000..aca857af4e78 --- /dev/null +++ b/tools/tcpstates_example.txt @@ -0,0 +1,52 @@ +Demonstrations of tcpstates, the Linux BPF/bcc version. + + +tcpstates prints TCP state change information, including the duration in each +state as milliseconds. For example, a single TCP session: + +# tcpstates +SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS +ffff9fd7e8192000 22384 curl 100.66.100.185 0 52.33.159.26 80 CLOSE -> SYN_SENT 0.000 +ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 SYN_SENT -> ESTABLISHED 1.373 +ffff9fd7e8192000 22384 curl 100.66.100.185 63446 52.33.159.26 80 ESTABLISHED -> FIN_WAIT1 176.042 +ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT1 -> FIN_WAIT2 0.536 +ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT2 -> CLOSE 0.006 +^C + +This showed that the most time was spent in the ESTABLISHED state (which then +transitioned to FIN_WAIT1), which was 176.042 milliseconds. + +The first column is the socked address, as the output may include lines from +different sessions interleaved. The next two columns show the current on-CPU +process ID and command name: these may show the process that owns the TCP +session, depending on whether the state change executes synchronously in +process context. If that's not the case, they may show kernel details. + + +USAGE: + +# tcpstates -h +usage: tcpstates [-h] [-T] [-t] [-w] [-s] [-L LOCALPORT] [-D REMOTEPORT] + +Trace TCP session state changes and durations + +optional arguments: + -h, --help show this help message and exit + -T, --time include time column on output (HH:MM:SS) + -t, --timestamp include timestamp on output (seconds) + -w, --wide wide column output (fits IPv6 addresses) + -s, --csv comma separated values output + -L LOCALPORT, --localport LOCALPORT + comma-separated list of local ports to trace. + -D REMOTEPORT, --remoteport REMOTEPORT + comma-separated list of remote ports to trace. + +examples: + ./tcpstates # trace all TCP state changes + ./tcpstates -t # include timestamp column + ./tcpstates -T # include time column (HH:MM:SS) + ./tcpstates -w # wider colums (fit IPv6) + ./tcpstates -stT # csv output, with times & timestamps + ./tcpstates -L 80 # only trace local port 80 + ./tcpstates -L 80,81 # only trace local ports 80 and 81 + ./tcpstates -D 80 # only trace remote port 80