diff --git a/README.md b/README.md index 7d88ee12a491..a7b3a4b8a30f 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ Tools: - tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt). - tools/[funccount](tools/funccount.py): Count kernel function calls. [Examples](tools/funccount_example.txt). - tools/[funclatency](tools/funclatency.py): Time kernel functions and show their latency distribution. [Examples](tools/funclatency_example.txt). +- tools/[gethostlatency](tools/gethostlatency.py): Show latency for getaddrinfo/gethostbyname[2] calls. [Examples](tools/gethostlatency_example.txt). - tools/[hardirqs](tools/hardirqs.py): Measure hard IRQ (hard interrupt) event time. [Examples](tools/hardirqs_example.txt). - tools/[killsnoop](tools/killsnoop.py): Trace signals issued by the kill() syscall. [Examples](tools/killsnoop_example.txt). - tools/[offcputime](tools/offcputime.py): Summarize off-CPU time by kernel stack trace. [Examples](tools/offcputime_example.txt). diff --git a/man/man8/gethostlatency.8 b/man/man8/gethostlatency.8 new file mode 100644 index 000000000000..b428983e2275 --- /dev/null +++ b/man/man8/gethostlatency.8 @@ -0,0 +1,55 @@ +.TH gethostlatency 8 "2016-01-28" "USER COMMANDS" +.SH NAME +gethostlatency \- Show latency for getaddrinfo/gethostbyname[2] calls. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B gethostlatency +.SH DESCRIPTION +This traces and prints when getaddrinfo(), gethostbyname(), and gethostbyname2() +are called, system wide, and shows the responsible PID and command name, +latency of the call (duration) in milliseconds, and the host string. + +This tool can be useful for identifying DNS latency, by identifying which +remote host name lookups were slow, and by how much. + +This tool currently uses dynamic tracing of user-level functions and registers, +and may need modifications to match your software and processor architecture. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH EXAMPLES +.TP +Trace host lookups (getaddrinfo/gethostbyname[2]) system wide: +# +.B gethostlatency +.SH FIELDS +.TP +TIME +Time of the command (HH:MM:SS). +.TP +PID +Process ID of the client performing the call. +.TP +COMM +Process (command) name of the client performing the call. +.TP +HOST +Host name string: the target of the lookup. +.SH OVERHEAD +The rate of lookups should be relatively low, so the overhead is not expected +to be a problem. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +tcpdump(8) diff --git a/tools/gethostlatency.py b/tools/gethostlatency.py new file mode 100755 index 000000000000..7d32cb82d4af --- /dev/null +++ b/tools/gethostlatency.py @@ -0,0 +1,77 @@ +#!/usr/bin/python +# +# gethostlatency Show latency for getaddrinfo/gethostbyname[2] calls. +# For Linux, uses BCC, eBPF. Embedded C. +# +# This can be useful for identifying DNS latency, by identifying which +# remote host name lookups were slow, and by how much. +# +# This uses dynamic tracing of user-level functions and registers, and may +# need modifications to match your software and processor architecture. +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 28-Jan-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import strftime + +# load BPF program +bpf_text = """ +#include + +struct val_t { + char host[80]; + u64 ts; +}; +BPF_HASH(start, u32, struct val_t); + +int do_entry(struct pt_regs *ctx) { + if (!ctx->di) + return 0; + struct val_t val = {}; + u32 pid = bpf_get_current_pid_tgid(); + bpf_probe_read(&val.host, sizeof(val.host), (void *)ctx->di); + val.ts = bpf_ktime_get_ns(); + start.update(&pid, &val); + return 0; +} + +int do_return(struct pt_regs *ctx) { + struct val_t *valp; + u64 delta; + u32 pid = bpf_get_current_pid_tgid(); + + valp = start.lookup(&pid); + if (valp == 0) + return 0; // missed start + + delta = (bpf_ktime_get_ns() - valp->ts) / 1000; + bpf_trace_printk("%d %s\\n", delta, valp->host); + start.delete(&pid); + return 0; +} +""" +b = BPF(text=bpf_text) +b.attach_uprobe(name="c", sym="getaddrinfo", fn_name="do_entry") +b.attach_uprobe(name="c", sym="gethostbyname", fn_name="do_entry") +b.attach_uprobe(name="c", sym="gethostbyname2", fn_name="do_entry") +b.attach_uretprobe(name="c", sym="getaddrinfo", fn_name="do_return") +b.attach_uretprobe(name="c", sym="gethostbyname", fn_name="do_return") +b.attach_uretprobe(name="c", sym="gethostbyname2", fn_name="do_return") + +# header +print("%-9s %-6s %-12s %6s %s" % ("TIME", "PID", "COMM", "LATms", "HOST")) + +# format output +while 1: + try: + (task, pid, cpu, flags, ts, msg) = b.trace_fields() + except ValueError: + continue + (delta, host) = msg.split(" ") + deltams = int(delta) / 1000 + print("%-9s %-6d %-12.12s %6.2f %s" % (strftime("%H:%M:%S"), pid, task, + deltams, host)) diff --git a/tools/gethostlatency_example.txt b/tools/gethostlatency_example.txt new file mode 100644 index 000000000000..429574fc288c --- /dev/null +++ b/tools/gethostlatency_example.txt @@ -0,0 +1,21 @@ +Demonstrations of gethostlatency, the Linux eBPF/bcc version. + + +This traces host name lookup calls (getaddrinfo(), gethostbyname(), and +gethostbyname2()), and shows the PID and command performing the lookup, the +latency (duration) of the call in milliseconds, and the host string: + +# ./gethostlatency +TIME PID COMM LATms HOST +06:10:24 28011 wget 90.00 www.iovisor.org +06:10:28 28127 wget 0.00 www.iovisor.org +06:10:41 28404 wget 9.00 www.netflix.com +06:10:48 28544 curl 35.00 www.netflix.com.au +06:11:10 29054 curl 31.00 www.plumgrid.com +06:11:16 29195 curl 3.00 www.facebook.com +06:11:25 29404 curl 72.00 foo +06:11:28 29475 curl 1.00 foo + +In this example, the first call to lookup "www.iovisor.org" took 90 ms, and +the second took 0 ms (cached). The slowest call in this example was to "foo", +which was an unsuccessful lookup.