diff --git a/README.md b/README.md
index 1e4f76a954e2..29e3aad64afc 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,7 @@ Examples:
 - tools/[tcpconnect](tools/tcpconnect.py): Trace TCP active connections (connect()). [Examples](tools/tcpconnect_example.txt).
 - tools/[tcpconnlat](tools/tcpconnlat.py): Trace TCP active connection latency (connect()). [Examples](tools/tcpconnlat_example.txt).
 - tools/[tcpretrans](tools/tcpretrans.py): Trace TCP retransmits and TLPs. [Examples](tools/tcpretrans_example.txt).
+- tools/[tcptop](tools/tcptop.py): Summarize TCP send/recv throughput by host. Top for TCP. [Examples](tools/tcptop_example.txt).
 - tools/[tplist](tools/tplist.py): Display kernel tracepoints or USDT probes and their formats. [Examples](tools/tplist_example.txt).
 - tools/[trace](tools/trace.py): Trace arbitrary functions, with filters. [Examples](tools/trace_example.txt)
 - tools/[vfscount](tools/vfscount.py) tools/[vfscount.c](tools/vfscount.c): Count VFS calls. [Examples](tools/vfscount_example.txt).
diff --git a/man/man8/tcptop.8 b/man/man8/tcptop.8
new file mode 100644
index 000000000000..672e8edddbb7
--- /dev/null
+++ b/man/man8/tcptop.8
@@ -0,0 +1,112 @@
+.TH tcptop 8 "2016-09-13" "USER COMMANDS"
+.SH NAME
+tcptop \- Summarize TCP send/recv throughput by host. Top for TCP.
+.SH SYNOPSIS
+.B tcptop [\-h] [\-C] [\-S] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This is top for TCP sessions.
+
+This summarizes TCP send/receive Kbytes by host, and prints a summary that
+refreshes, along with other system-wide metrics.
+
+This uses dynamic tracing of kernel TCP send/receive functions, and will
+need to be updated to match kernel changes.
+
+The traced TCP functions are usually called at a lower rate than
+per-packet functions, and therefore have lower overhead. The traced data is
+summarized in-kernel using a BPF map to further reduce overhead. At very high
+TCP event rates, the overhead may still be measurable. See the OVERHEAD
+section for more details.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print USAGE message.
+.TP
+\-C
+Don't clear the screen.
+.TP
+\-S
+Don't print the system summary line (load averages).
+.TP
+\-p PID
+Trace this PID only.
+.TP
+interval
+Interval between updates, seconds (default 1).
+.TP
+count
+Number of interval summaries (default is many).
+.SH EXAMPLES
+.TP
+Summarize TCP throughput by active sessions, 1 second refresh:
+#
+.B tcptop
+.TP
+Don't clear the screen (rolling output), and 5 second summaries:
+#
+.B tcptop \-C 5
+.TP
+Trace PID 181 only, and don't clear the screen:
+#
+.B tcptop \-Cp 181
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Process ID.
+.TP
+COMM
+Process name.
+.TP
+LADDR
+Local address (IPv4), and TCP port
+.TP
+RADDR
+Remote address (IPv4), and TCP port
+.TP
+LADDR6
+Source address (IPv6), and TCP port
+.TP
+RADDR6
+Destination address (IPv6), and TCP port
+.TP
+RX_KB
+Received Kbytes
+.TP
+TX_KB
+Transmitted Kbytes
+.SH OVERHEAD
+This traces all send/receives in TCP, high in the TCP/IP stack (close to the
+application), which are usually called at a lower rate than per-packet
+functions, lowering overhead. It also summarizes data in-kernel to further
+reduce overhead. These techniques help, but there may still be measurable
+overhead at high send/receive rates, eg, ~13% of one CPU at 100k events/sec.
+Use funccount to count the kprobes in the tool to find out this rate, as the
+overhead is relative to the rate.
+Some sample production servers tested found total TCP event rates of 4k to
+15k per second, and the CPU overhead at these rates ranged from 0.5% to 2.0%
+of one CPU. If your send/receive rate is low (eg, <1000/sec) then the
+overhead is expected to be negligible; test in a lab environment first.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH INSPIRATION
+top(1) by William LeFebvre
+.SH SEE ALSO
+tcpconnect(8), tcpaccept(8)
diff --git a/tools/tcptop.py b/tools/tcptop.py
new file mode 100755
index 000000000000..a6154c66625f
--- /dev/null
+++ b/tools/tcptop.py
@@ -0,0 +1,287 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcptop    Summarize TCP send/recv throughput by host.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcptop [-h] [-C] [-S] [-p PID] [interval [count]]
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# WARNING: This traces all send/receives at the TCP level, and while it
+# summarizes data in-kernel to reduce overhead, there may still be some
+# overhead at high TCP send/receive rates (eg, ~13% of one CPU at 100k TCP
+# events/sec. This is not the same as packet rate: funccount can be used to
+# count the kprobes below to find out the TCP rate). Test in a lab environment
+# first. If your send/receive rate is low (eg, <1k/sec) then the overhead is
+# expected to be negligible.
+#
+# ToDo: Fit output to screen size (top X only) in default (not -C) mode.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 02-Sep-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+from time import sleep, strftime
+from subprocess import call
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./tcptop           # trace TCP send/recv by host
+    ./tcptop -C        # don't clear the screen
+    ./tcptop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize TCP send/recv throughput by host",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-S", "--nosummary", action="store_true",
+    help="skip system summary line")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?", default=1,
+    help="output interval, in seconds (default 1)")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+if args.interval and int(args.interval) == 0:
+    print("ERROR: interval 0. Exiting.")
+    exit()
+debug = 0
+
+# linux stats
+loadavg = "/proc/loadavg"
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+struct ipv4_key_t {
+    u32 pid;
+    u32 saddr;
+    u32 daddr;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv4_send_bytes, struct ipv4_key_t);
+BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t);
+
+struct ipv6_key_t {
+    u32 pid;
+    // workaround until unsigned __int128 support:
+    u64 saddr0;
+    u64 saddr1;
+    u64 daddr0;
+    u64 daddr1;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv6_send_bytes, struct ipv6_key_t);
+BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t);
+
+int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
+    struct msghdr *msg, size_t size)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+    u16 dport = 0, family = sk->__sk_common.skc_family;
+    u64 *val, zero = 0;
+
+    if (family == AF_INET) {
+        struct ipv4_key_t ipv4_key = {.pid = pid};
+        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
+        ipv4_key.daddr = sk->__sk_common.skc_daddr;
+        ipv4_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv4_key.dport = ntohs(dport);
+        val = ipv4_send_bytes.lookup_or_init(&ipv4_key, &zero);
+        (*val) += size;
+
+    } else if (family == AF_INET6) {
+        struct ipv6_key_t ipv6_key = {.pid = pid};
+
+        bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0),
+            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]);
+        bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1),
+            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]);
+        bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0),
+            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]);
+        bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1),
+            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]);
+        ipv6_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv6_key.dport = ntohs(dport);
+        val = ipv6_send_bytes.lookup_or_init(&ipv6_key, &zero);
+        (*val) += size;
+    }
+    // else drop
+
+    return 0;
+}
+
+/*
+ * tcp_recvmsg() would be obvious to trace, but is less suitable because:
+ * - we'd need to trace both entry and return, to have both sock and size
+ * - misses tcp_read_sock() traffic
+ * we'd much prefer tracepoints once they are available.
+ */
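+/*
+ * tcp_cleanup_rbuf() is traced instead: it runs as the application consumes
+ * data from the receive queue, and its "copied" argument is the number of
+ * bytes consumed, which is summed below as received throughput.
+ */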
+ */ +int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied) +{ + u32 pid = bpf_get_current_pid_tgid(); + FILTER + u16 dport = 0, family = sk->__sk_common.skc_family; + u64 *val, zero = 0; + + if (family == AF_INET) { + struct ipv4_key_t ipv4_key = {.pid = pid}; + ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr; + ipv4_key.daddr = sk->__sk_common.skc_daddr; + ipv4_key.lport = sk->__sk_common.skc_num; + dport = sk->__sk_common.skc_dport; + ipv4_key.dport = ntohs(dport); + val = ipv4_recv_bytes.lookup_or_init(&ipv4_key, &zero); + (*val) += copied; + + } else if (family == AF_INET6) { + struct ipv6_key_t ipv6_key = {.pid = pid}; + bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0), + &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]); + bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1), + &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]); + bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0), + &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]); + bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1), + &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]); + ipv6_key.lport = sk->__sk_common.skc_num; + dport = sk->__sk_common.skc_dport; + ipv6_key.dport = ntohs(dport); + val = ipv6_recv_bytes.lookup_or_init(&ipv6_key, &zero); + (*val) += copied; + } + // else drop + + return 0; +} +""" + +# code substitutions +if args.pid: + bpf_text = bpf_text.replace('FILTER', + 'if (pid != %s) { return 0; }' % args.pid) +else: + bpf_text = bpf_text.replace('FILTER', '') +if debug: + print(bpf_text) + +def pid_to_comm(pid): + try: + comm = open("/proc/%d/comm" % pid, "r").read().rstrip() + return comm + except IOError: + return str(pid) + +# initialize BPF +b = BPF(text=bpf_text) + +ipv4_send_bytes = b["ipv4_send_bytes"] +ipv4_recv_bytes = b["ipv4_recv_bytes"] +ipv6_send_bytes = b["ipv6_send_bytes"] +ipv6_recv_bytes = b["ipv6_recv_bytes"] + +print('Tracing... Output every %s secs. Hit Ctrl-C to end' % args.interval) + +# output +exiting = 0 +while (1): + try: + if args.interval: + sleep(int(args.interval)) + else: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + # header + if args.noclear: + print() + else: + call("clear") + if not args.nosummary: + with open(loadavg) as stats: + print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read())) + + # IPv4: build dict of all seen keys + keys = ipv4_recv_bytes + for k, v in ipv4_send_bytes.items(): + if k not in keys: + keys[k] = v + + if keys: + print("%-6s %-12s %-21s %-21s %6s %6s" % ("PID", "COMM", + "LADDR", "RADDR", "RX_KB", "TX_KB")) + + # output + for k, v in reversed(sorted(keys.items(), key=lambda keys: keys[1].value)): + send_kbytes = 0 + if k in ipv4_send_bytes: + send_kbytes = int(ipv4_send_bytes[k].value / 1024) + recv_kbytes = 0 + if k in ipv4_recv_bytes: + recv_kbytes = int(ipv4_recv_bytes[k].value / 1024) + + print("%-6d %-12.12s %-21s %-21s %6d %6d" % (k.pid, + pid_to_comm(k.pid), + inet_ntop(AF_INET, pack("I", k.saddr)) + ":" + str(k.lport), + inet_ntop(AF_INET, pack("I", k.daddr)) + ":" + str(k.dport), + recv_kbytes, send_kbytes)) + + ipv4_send_bytes.clear() + ipv4_recv_bytes.clear() + + # IPv6: build dict of all seen keys + keys = ipv6_recv_bytes + for k, v in ipv6_send_bytes.items(): + if k not in keys: + keys[k] = v + + if keys: + # more than 80 chars, sadly. 
+
+The overhead is relative to TCP event rate (the rate of tcp_sendmsg() and
+tcp_recvmsg() or tcp_cleanup_rbuf()). Due to buffering, this should be lower
+than the packet rate. You can measure the rate of these using funccount.
+Some sample production servers tested found total rates of 4k to 15k per
+second. The CPU overhead at these rates ranged from 0.5% to 2.0% of one CPU.
+Maybe your workloads have higher rates and therefore higher overhead, or
+lower rates.
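+
+For example, the send-side event rate can be counted with funccount (also from
+bcc); counting tcp_cleanup_rbuf the same way gives the receive-side rate:
+
+# funccount 'tcp_sendmsg'
+
+Hit Ctrl-C after several seconds and divide the reported count by the elapsed
+time to estimate the event rate that tcptop would have to process.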
+
+
+I much prefer not clearing the screen, so that historic output is in the
+scroll-back buffer, and patterns or intermittent issues can be better seen.
+You can do this with -C:
+
+# tcptop -C
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+20:27:12 loadavg: 0.08 0.02 0.17 2/367 17342
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17287  17287        100.66.3.172:22       100.127.69.165:57585       3      1
+17286  sshd         100.66.3.172:22       100.127.69.165:57585       0      1
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:13 loadavg: 0.08 0.02 0.17 1/367 17342
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17286  sshd         100.66.3.172:22       100.127.69.165:57585       1   7761
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:14 loadavg: 0.08 0.02 0.17 2/365 17347
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17286  17286        100.66.3.172:22       100.127.69.165:57585       1   2501
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:15 loadavg: 0.07 0.02 0.17 2/367 17403
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17349  17349        100.66.3.172:22       100.127.69.165:10161       3      1
+17348  sshd         100.66.3.172:22       100.127.69.165:10161       0      1
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:16 loadavg: 0.07 0.02 0.17 1/367 17403
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17348  sshd         100.66.3.172:22       100.127.69.165:10161    3333      0
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:17 loadavg: 0.07 0.02 0.17 2/366 17409
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17348  17348        100.66.3.172:22       100.127.69.165:10161    6909      2
+
+You can disable the loadavg summary line with -S if needed.
+
+
+USAGE:
+
+# tcptop -h
+usage: tcptop.py [-h] [-C] [-S] [-p PID] [interval] [count]
+
+Summarize TCP send/recv throughput by host
+
+positional arguments:
+  interval           output interval, in seconds (default 1)
+  count              number of outputs
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -C, --noclear      don't clear the screen
+  -S, --nosummary    skip system summary line
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./tcptop           # trace TCP send/recv by host
+    ./tcptop -C        # don't clear the screen
+    ./tcptop -p 181    # only trace PID 181