diff --git a/README.md b/README.md
index 1e4f76a954e2..29e3aad64afc 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,7 @@ Examples:
 - tools/[tcpconnect](tools/tcpconnect.py): Trace TCP active connections (connect()). [Examples](tools/tcpconnect_example.txt).
 - tools/[tcpconnlat](tools/tcpconnlat.py): Trace TCP active connection latency (connect()). [Examples](tools/tcpconnlat_example.txt).
 - tools/[tcpretrans](tools/tcpretrans.py): Trace TCP retransmits and TLPs. [Examples](tools/tcpretrans_example.txt).
+- tools/[tcptop](tools/tcptop.py): Summarize TCP send/recv throughput by host. Top for TCP. [Examples](tools/tcptop_example.txt).
 - tools/[tplist](tools/tplist.py): Display kernel tracepoints or USDT probes and their formats. [Examples](tools/tplist_example.txt).
 - tools/[trace](tools/trace.py): Trace arbitrary functions, with filters. [Examples](tools/trace_example.txt)
 - tools/[vfscount](tools/vfscount.py) tools/[vfscount.c](tools/vfscount.c): Count VFS calls. [Examples](tools/vfscount_example.txt).
diff --git a/man/man8/tcptop.8 b/man/man8/tcptop.8
new file mode 100644
index 000000000000..672e8edddbb7
--- /dev/null
+++ b/man/man8/tcptop.8
@@ -0,0 +1,112 @@
+.TH tcptop 8 "2016-09-13" "USER COMMANDS"
+.SH NAME
+tcptop \- Summarize TCP send/recv throughput by host. Top for TCP.
+.SH SYNOPSIS
+.B tcptop [\-h] [\-C] [\-S] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This is top for TCP sessions.
+
+This summarizes TCP send/receive Kbytes by host, and prints a summary that
+refreshes, along with other system-wide metrics.
+
+This uses dynamic tracing of kernel TCP send/receive functions, and will
+need to be updated to match kernel changes.
+
+The traced TCP functions are usually called at a lower rate than
+per-packet functions, and therefore have lower overhead. The traced data is
+summarized in-kernel using a BPF map to further reduce overhead. At very high
+TCP event rates, the overhead may still be measurable. See the OVERHEAD
+section for more details.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print USAGE message.
+.TP
+\-C
+Don't clear the screen.
+.TP
+\-S
+Don't print the system summary line (load averages).
+.TP
+\-p PID
+Trace this PID only.
+.TP
+interval
+Interval between updates, seconds (default 1).
+.TP
+count
+Number of interval summaries (default is many).
+.SH EXAMPLES
+.TP
+Summarize TCP throughput by active sessions, 1 second refresh:
+#
+.B tcptop
+.TP
+Don't clear the screen (rolling output), and 5 second summaries:
+#
+.B tcptop \-C 5
+.TP
+Trace PID 181 only, and don't clear the screen:
+#
+.B tcptop \-Cp 181
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Process ID.
+.TP
+COMM
+Process name.
+.TP
+LADDR
+Local address (IPv4), and TCP port
+.TP
+RADDR
+Remote address (IPv4), and TCP port
+.TP
+LADDR6
+Source address (IPv6), and TCP port
+.TP
+RADDR6
+Destination address (IPv6), and TCP port
+.TP
+RX_KB
+Received Kbytes
+.TP
+TX_KB
+Transmitted Kbytes
+.SH OVERHEAD
+This traces all send/receives in TCP, high in the TCP/IP stack (close to the
+application), which are usually called at a lower rate than per-packet
+functions, lowering overhead. It also summarizes data in-kernel to further
+reduce overhead. These techniques help, but there may still be measurable
+overhead at high send/receive rates, eg, ~13% of one CPU at 100k events/sec.
+Use funccount to count the kprobes in the tool to find out this rate, as the
+overhead is relative to the rate.
+Some sample production servers tested found total TCP event rates of 4k to
+15k per second, and the CPU overhead at these rates ranged from 0.5% to 2.0%
+of one CPU. If your send/receive rate is low (eg, <1000/sec) then the
+overhead is expected to be negligible; test in a lab environment first.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH INSPIRATION
+top(1) by William LeFebvre
+.SH SEE ALSO
+tcpconnect(8), tcpaccept(8)
diff --git a/tools/tcptop.py b/tools/tcptop.py
new file mode 100755
index 000000000000..a6154c66625f
--- /dev/null
+++ b/tools/tcptop.py
@@ -0,0 +1,287 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcptop    Summarize TCP send/recv throughput by host.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcptop [-h] [-C] [-S] [-p PID] [interval [count]]
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# WARNING: This traces all send/receives at the TCP level, and while it
+# summarizes data in-kernel to reduce overhead, there may still be some
+# overhead at high TCP send/receive rates (eg, ~13% of one CPU at 100k TCP
+# events/sec. This is not the same as packet rate: funccount can be used to
+# count the kprobes below to find out the TCP rate). Test in a lab environment
+# first. If your send/receive rate is low (eg, <1k/sec) then the overhead is
+# expected to be negligible.
+#
+# ToDo: Fit output to screen size (top X only) in default (not -C) mode.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 02-Sep-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+from time import sleep, strftime
+from subprocess import call
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./tcptop           # trace TCP send/recv by host
+    ./tcptop -C        # don't clear the screen
+    ./tcptop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize TCP send/recv throughput by host",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-S", "--nosummary", action="store_true",
+    help="skip system summary line")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?", default=1,
+    help="output interval, in seconds (default 1)")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+if args.interval and int(args.interval) == 0:
+    print("ERROR: interval 0. Exiting.")
+    exit()
+debug = 0
+
+# linux stats
+loadavg = "/proc/loadavg"
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+struct ipv4_key_t {
+    u32 pid;
+    u32 saddr;
+    u32 daddr;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv4_send_bytes, struct ipv4_key_t);
+BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t);
+
+struct ipv6_key_t {
+    u32 pid;
+    // workaround until unsigned __int128 support:
+    u64 saddr0;
+    u64 saddr1;
+    u64 daddr0;
+    u64 daddr1;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv6_send_bytes, struct ipv6_key_t);
+BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t);
+
+int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
+    struct msghdr *msg, size_t size)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+    u16 dport = 0, family = sk->__sk_common.skc_family;
+    u64 *val, zero = 0;
+
+    if (family == AF_INET) {
+        struct ipv4_key_t ipv4_key = {.pid = pid};
+        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
+        ipv4_key.daddr = sk->__sk_common.skc_daddr;
+        ipv4_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv4_key.dport = ntohs(dport);
+        val = ipv4_send_bytes.lookup_or_init(&ipv4_key, &zero);
+        (*val) += size;
+
+    } else if (family == AF_INET6) {
+        struct ipv6_key_t ipv6_key = {.pid = pid};
+
+        bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0),
+            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]);
+        bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1),
+            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]);
+        bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0),
+            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]);
+        bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1),
+            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]);
+        ipv6_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv6_key.dport = ntohs(dport);
+        val = ipv6_send_bytes.lookup_or_init(&ipv6_key, &zero);
+        (*val) += size;
+    }
+    // else drop
+
+    return 0;
+}
+
+/*
+ * tcp_recvmsg() would be obvious to trace, but is less suitable because:
+ * - we'd need to trace both entry and return, to have both sock and size
+ * - misses tcp_read_sock() traffic
+ * we'd much prefer tracepoints once they are available.
+ */
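+/*
+ * tcp_cleanup_rbuf() is traced instead: it runs as the application consumes
+ * data from the receive queue, and its "copied" argument is the number of
+ * bytes consumed, which is summed below as received throughput.
+ */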
+ */ +int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied) +{ + u32 pid = bpf_get_current_pid_tgid(); + FILTER + u16 dport = 0, family = sk->__sk_common.skc_family; + u64 *val, zero = 0; + + if (family == AF_INET) { + struct ipv4_key_t ipv4_key = {.pid = pid}; + ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr; + ipv4_key.daddr = sk->__sk_common.skc_daddr; + ipv4_key.lport = sk->__sk_common.skc_num; + dport = sk->__sk_common.skc_dport; + ipv4_key.dport = ntohs(dport); + val = ipv4_recv_bytes.lookup_or_init(&ipv4_key, &zero); + (*val) += copied; + + } else if (family == AF_INET6) { + struct ipv6_key_t ipv6_key = {.pid = pid}; + bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0), + &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]); + bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1), + &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]); + bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0), + &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]); + bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1), + &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]); + ipv6_key.lport = sk->__sk_common.skc_num; + dport = sk->__sk_common.skc_dport; + ipv6_key.dport = ntohs(dport); + val = ipv6_recv_bytes.lookup_or_init(&ipv6_key, &zero); + (*val) += copied; + } + // else drop + + return 0; +} +""" + +# code substitutions +if args.pid: + bpf_text = bpf_text.replace('FILTER', + 'if (pid != %s) { return 0; }' % args.pid) +else: + bpf_text = bpf_text.replace('FILTER', '') +if debug: + print(bpf_text) + +def pid_to_comm(pid): + try: + comm = open("/proc/%d/comm" % pid, "r").read().rstrip() + return comm + except IOError: + return str(pid) + +# initialize BPF +b = BPF(text=bpf_text) + +ipv4_send_bytes = b["ipv4_send_bytes"] +ipv4_recv_bytes = b["ipv4_recv_bytes"] +ipv6_send_bytes = b["ipv6_send_bytes"] +ipv6_recv_bytes = b["ipv6_recv_bytes"] + +print('Tracing... Output every %s secs. Hit Ctrl-C to end' % args.interval) + +# output +exiting = 0 +while (1): + try: + if args.interval: + sleep(int(args.interval)) + else: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + # header + if args.noclear: + print() + else: + call("clear") + if not args.nosummary: + with open(loadavg) as stats: + print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read())) + + # IPv4: build dict of all seen keys + keys = ipv4_recv_bytes + for k, v in ipv4_send_bytes.items(): + if k not in keys: + keys[k] = v + + if keys: + print("%-6s %-12s %-21s %-21s %6s %6s" % ("PID", "COMM", + "LADDR", "RADDR", "RX_KB", "TX_KB")) + + # output + for k, v in reversed(sorted(keys.items(), key=lambda keys: keys[1].value)): + send_kbytes = 0 + if k in ipv4_send_bytes: + send_kbytes = int(ipv4_send_bytes[k].value / 1024) + recv_kbytes = 0 + if k in ipv4_recv_bytes: + recv_kbytes = int(ipv4_recv_bytes[k].value / 1024) + + print("%-6d %-12.12s %-21s %-21s %6d %6d" % (k.pid, + pid_to_comm(k.pid), + inet_ntop(AF_INET, pack("I", k.saddr)) + ":" + str(k.lport), + inet_ntop(AF_INET, pack("I", k.daddr)) + ":" + str(k.dport), + recv_kbytes, send_kbytes)) + + ipv4_send_bytes.clear() + ipv4_recv_bytes.clear() + + # IPv6: build dict of all seen keys + keys = ipv6_recv_bytes + for k, v in ipv6_send_bytes.items(): + if k not in keys: + keys[k] = v + + if keys: + # more than 80 chars, sadly. 
+
+The overhead is relative to TCP event rate (the rate of tcp_sendmsg() and
+tcp_recvmsg() or tcp_cleanup_rbuf()). Due to buffering, this should be lower
+than the packet rate. You can measure the rate of these using funccount.
+Some sample production servers tested found total rates of 4k to 15k per
+second. The CPU overhead at these rates ranged from 0.5% to 2.0% of one CPU.
+Maybe your workloads have higher rates and therefore higher overhead, or
+lower rates.
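+
+For example, the send-side event rate can be counted with funccount (also from
+bcc); counting tcp_cleanup_rbuf the same way gives the receive-side rate:
+
+# funccount 'tcp_sendmsg'
+
+Hit Ctrl-C after several seconds and divide the reported count by the elapsed
+time to estimate the event rate that tcptop would have to process.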
+
+
+I much prefer not clearing the screen, so that historic output is in the
+scroll-back buffer, and patterns or intermittent issues can be better seen.
+You can do this with -C:
+
+# tcptop -C
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+20:27:12 loadavg: 0.08 0.02 0.17 2/367 17342
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17287  17287        100.66.3.172:22       100.127.69.165:57585       3      1
+17286  sshd         100.66.3.172:22       100.127.69.165:57585       0      1
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:13 loadavg: 0.08 0.02 0.17 1/367 17342
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17286  sshd         100.66.3.172:22       100.127.69.165:57585       1   7761
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:14 loadavg: 0.08 0.02 0.17 2/365 17347
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17286  17286        100.66.3.172:22       100.127.69.165:57585       1   2501
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:15 loadavg: 0.07 0.02 0.17 2/367 17403
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17349  17349        100.66.3.172:22       100.127.69.165:10161       3      1
+17348  sshd         100.66.3.172:22       100.127.69.165:10161       0      1
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:16 loadavg: 0.07 0.02 0.17 1/367 17403
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17348  sshd         100.66.3.172:22       100.127.69.165:10161    3333      0
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:17 loadavg: 0.07 0.02 0.17 2/366 17409
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17348  17348        100.66.3.172:22       100.127.69.165:10161    6909      2
+
+You can disable the loadavg summary line with -S if needed.
+
+
+USAGE:
+
+# tcptop -h
+usage: tcptop.py [-h] [-C] [-S] [-p PID] [interval] [count]
+
+Summarize TCP send/recv throughput by host
+
+positional arguments:
+  interval           output interval, in seconds (default 1)
+  count              number of outputs
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -C, --noclear      don't clear the screen
+  -S, --nosummary    skip system summary line
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./tcptop           # trace TCP send/recv by host
+    ./tcptop -C        # don't clear the screen
+    ./tcptop -p 181    # only trace PID 181