diff --git a/README.md b/README.md index a917643c7d2d..0de68ed59b3f 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ Tools: - tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt). - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt). - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt). +- tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt). - tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt). - tools/[filelife](tools/filelife.py): Trace the lifespan of short-lived files. [Examples](tools/filelife_example.txt). - tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt). diff --git a/man/man8/dcsnoop.8 b/man/man8/dcsnoop.8 new file mode 100644 index 000000000000..6ccb3b65daa1 --- /dev/null +++ b/man/man8/dcsnoop.8 @@ -0,0 +1,77 @@ +.TH dcsnoop 8 "2016-02-10" "USER COMMANDS" +.SH NAME +dcsnoop \- Trace directory entry cache (dcache) lookups. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B dcsnoop [\-h] [\-a] +.SH DESCRIPTION +By default, this traces every failed dcache lookup (cache miss), and shows the +process performing the lookup and the filename requested. A \-a option can be +used to show all lookups, not just failed ones. + +The output of this tool can be verbose, and is intended for further +investigations of dcache performance beyond dcstat(8), which prints +per-second summaries. + +This uses kernel dynamic tracing of the d_lookup() function, and will need +and will need updating to match any changes to this function. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-a +Trace references, not just failed lookups. +.SH EXAMPLES +.TP +Trace failed dcache lookups: +# +.B dcsnoop +.TP +Trace all dcache lookups: +# +.B dcsnoop \-a +.SH FIELDS +.TP +TIME(s) +Time of lookup, in seconds. +.TP +PID +Process ID. +.TP +COMM +Process name. +.TP +T +Type: R == reference (only visible with \-a), M == miss. A miss will print two +lines, one for the reference, and one for the miss. +.TP +FILE +The file name component that was being looked up. This contains trailing +pathname components (after '/'), which will be the subject of subsequent +lookups. +.SH OVERHEAD +File name lookups can be frequent (depending on the workload), and this tool +prints a line for each failed lookup, and with \-a, each reference as well. The +output may be verbose, and the incurred overhead, while optimized to some +extent, may still be from noticeable to significant. This is only really +intended for deeper investigations beyond dcstat(8), when absolutely necessary. +Measure and quantify the overhead in a test environment before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +dcstat(1) diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py new file mode 100755 index 000000000000..956af6dd6fc0 --- /dev/null +++ b/tools/dcsnoop.py @@ -0,0 +1,117 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# dcsnoop Trace directory entry cache (dcache) lookups. +# For Linux, uses BCC, eBPF. Embedded C. +# +# USAGE: dcsnoop [-h] [-a] +# +# By default, this traces every failed dcache lookup, and shows the process +# performing the lookup and the filename requested. A -a option can be used +# to show all lookups, not just failed ones. +# +# This uses kernel dynamic tracing of the d_lookup() function, and will need +# to be modified to match kernel changes. +# +# Also see dcstat(8), for per-second summaries. +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 09-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +import argparse +import re + +# arguments +examples = """examples: + ./dcsnoop # trace failed dcache lookups + ./dcsnoop -a # trace all dcache lookups +""" +parser = argparse.ArgumentParser( + description="Trace directory entry cache (dcache) lookups", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-a", "--all", action="store_true", + help="trace all lookups (default is fails only)") +args = parser.parse_args() + +# define BPF program +bpf_text = """ +#include +#include +#include + +#define MAX_FILE_LEN 64 + +struct entry_t { + char name[MAX_FILE_LEN]; +}; + +BPF_HASH(entrybypid, u32, struct entry_t); + +/* from fs/namei.c: */ +struct nameidata { + struct path path; + struct qstr last; + // [...] +}; + +int trace_fast(struct pt_regs *ctx, struct nameidata *nd, struct path *path) +{ + bpf_trace_printk("R %s\\n", nd->last.name); + return 1; +} + +int kprobe__d_lookup(struct pt_regs *ctx, const struct dentry *parent, + const struct qstr *name) +{ + u32 pid = bpf_get_current_pid_tgid(); + struct entry_t entry = {}; + if (name->name) { + bpf_probe_read(&entry.name, sizeof(entry.name), (void *)name->name); + } + entrybypid.update(&pid, &entry); + return 0; +} + +int kretprobe__d_lookup(struct pt_regs *ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + struct entry_t *ep; + ep = entrybypid.lookup(&pid); + if (ep == 0) { + return 0; // missed entry + } + if (ctx->ax == 0) { + bpf_trace_printk("M %s\\n", ep->name); + } + entrybypid.delete(&pid); + return 0; +} +""" + +# initialize BPF +b = BPF(text=bpf_text) +if args.all: + b.attach_kprobe(event="lookup_fast", fn_name="trace_fast") + +# header +print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE")) + +start_ts = 0 + +# format output +while 1: + (task, pid, cpu, flags, ts, msg) = b.trace_fields() + try: + (type, file) = msg.split(" ", 1) + except ValueError: + continue + + if start_ts == 0: + start_ts = ts + + print("%-11.6f %-6s %-16s %1s %s" % (ts - start_ts, pid, task, type, file)) diff --git a/tools/dcsnoop_example.txt b/tools/dcsnoop_example.txt new file mode 100644 index 000000000000..2184db0e1ff1 --- /dev/null +++ b/tools/dcsnoop_example.txt @@ -0,0 +1,96 @@ +Demonstrations of dcsnoop, the Linux eBPF/bcc version. + + +dcsnoop traces directory entry cache (dcache) lookups, and can be used for +further investigation beyond dcstat(8). The output is likely verbose, as +dcache lookups are likely frequent. By default, only failed lookups are shown. +For example: + +# ./dcsnoop.py +TIME(s) PID COMM T FILE +0.002837 1643 snmpd M net/dev +0.002852 1643 snmpd M 1643 +0.002856 1643 snmpd M net +0.002863 1643 snmpd M dev +0.002952 1643 snmpd M net/if_inet6 +0.002964 1643 snmpd M if_inet6 +0.003180 1643 snmpd M net/ipv4/neigh/eth0/retrans_time_ms +0.003192 1643 snmpd M ipv4/neigh/eth0/retrans_time_ms +0.003197 1643 snmpd M neigh/eth0/retrans_time_ms +0.003203 1643 snmpd M eth0/retrans_time_ms +0.003206 1643 snmpd M retrans_time_ms +0.003245 1643 snmpd M ipv6/neigh/eth0/retrans_time_ms +0.003249 1643 snmpd M neigh/eth0/retrans_time_ms +0.003252 1643 snmpd M eth0/retrans_time_ms +0.003255 1643 snmpd M retrans_time_ms +0.003287 1643 snmpd M conf/eth0/forwarding +0.003292 1643 snmpd M eth0/forwarding +0.003295 1643 snmpd M forwarding +0.003326 1643 snmpd M base_reachable_time_ms +[...] + +I ran a drop caches at the same time as executing this tool. The output shows +the processes, the type of event ("T" column: M == miss, R == reference), +and the filename for the dcache lookup. + +The way the dcache is currently implemented, each component of a path is +checked in turn. The first line, showing "net/dev" from snmp, will be a lookup +for "net" in a directory (that isn't shown here). If it finds "net", it will +then lookup "dev" inside net. You can see this sequence a little later, +starting at time 0.003180, where a pathname is being searched +directory by directory. + + +The -a option will show all lookups, although be warned, the output will be +very verbose. For example: + +# ./dcsnoop +TIME(s) PID COMM T FILE +0.000000 20279 dcsnoop.py M p_lookup_fast +0.000010 20279 dcsnoop.py M enable +0.000013 20279 dcsnoop.py M id +0.000015 20279 dcsnoop.py M filter +0.000017 20279 dcsnoop.py M trigger +0.000019 20279 dcsnoop.py M format +0.006148 20279 dcsnoop.py R sys/kernel/debug/tracing/trace_pipe +0.006158 20279 dcsnoop.py R kernel/debug/tracing/trace_pipe +0.006161 20279 dcsnoop.py R debug/tracing/trace_pipe +0.006164 20279 dcsnoop.py R tracing/trace_pipe +0.006166 20279 dcsnoop.py R trace_pipe +0.015900 1643 snmpd R proc/sys/net/ipv6/conf/lo/forwarding +0.015901 1643 snmpd R sys/net/ipv6/conf/lo/forwarding +0.015901 1643 snmpd R net/ipv6/conf/lo/forwarding +0.015902 1643 snmpd R ipv6/conf/lo/forwarding +0.015903 1643 snmpd R conf/lo/forwarding +0.015904 1643 snmpd R lo/forwarding +0.015905 1643 snmpd M lo/forwarding +0.015908 1643 snmpd R forwarding +0.015909 1643 snmpd M forwarding +0.015937 1643 snmpd R proc/sys/net/ipv6/neigh/lo/base_reachable_time_ms +0.015937 1643 snmpd R sys/net/ipv6/neigh/lo/base_reachable_time_ms +0.015938 1643 snmpd R net/ipv6/neigh/lo/base_reachable_time_ms +0.015939 1643 snmpd R ipv6/neigh/lo/base_reachable_time_ms +0.015940 1643 snmpd R neigh/lo/base_reachable_time_ms +0.015941 1643 snmpd R lo/base_reachable_time_ms +0.015941 1643 snmpd R base_reachable_time_ms +0.015943 1643 snmpd M base_reachable_time_ms +0.043569 1876 supervise M 20281 +0.043573 1886 supervise M 20280 +0.043582 1886 supervise R supervise/status.new +[...] + + +USAGE message: + +# ./dcsnoop.py -h +usage: dcsnoop.py [-h] [-a] + +Trace directory entry cache (dcache) lookups + +optional arguments: + -h, --help show this help message and exit + -a, --all trace all lookups (default is fails only) + +examples: + ./dcsnoop # trace failed dcache lookups + ./dcsnoop -a # trace all dcache lookups