Skip to content

Commit

Permalink
Print slow commit operations in nfsslower
Browse files Browse the repository at this point in the history
nfsslower does not currently print out slow commit operations, which
sometimes cause long delays in NFS writes.  This commit adds code to
trace slow commit operations.

NFS commits are quite slow and are not performed often, so the patch
shouldn't incur visible overhead.

Below is a sample output when writing a large file using dd command.
Most commits are issued from kworker threads, so the commit output may
be mostly dropped when filtered by PID.

TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
...
16:46:11 dd             1212   W 1048576 651264     46.58 testfile
16:46:11 dd             1212   W 1048576 653312     54.41 testfile
16:46:11 dd             1212   W 1048576 654336     18.96 testfile
16:46:11 dd             1212   W 1048576 655360     49.05 testfile
16:46:11 dd             1212   W 1048576 657408     82.96 testfile
16:46:11 dd             1212   W 1048576 659456    109.25 testfile
16:46:12 dd             1212   W 1048576 660480    163.55 testfile
16:46:12 dd             1212   W 1048576 662528    205.44 testfile
16:46:13 dd             1212   W 1048576 663552    751.02 testfile
16:46:37 kworker/u8:5   1207   C 0       0        27449.05 testfile
16:46:37 kworker/u8:5   1207   C 0       0        26725.16 testfile
16:46:37 kworker/u8:5   1207   C 0       0        27592.04 testfile
16:46:37 kworker/u8:4   1206   C 0       0        22188.75 testfile
16:46:37 kworker/u8:4   1206   C 0       0        26092.59 testfile
16:46:37 kworker/u8:4   1206   C 0       0        27268.90 testfile
16:46:37 kworker/u8:4   1206   C 0       0        22303.24 testfile
16:46:37 kworker/u8:4   1206   C 0       0        27081.34 testfile
16:46:37 dd             1212   W 1048576 664576   24337.80 testfile
16:46:38 dd             1212   W 1048576 958464     61.77 testfile
16:46:38 dd             1212   W 1048576 960512     56.60 testfile
16:46:38 dd             1212   W 1048576 963584     55.75 testfile
16:46:38 dd             1212   W 1048576 965632     54.84 testfile
...

Signed-off-by: Kosuke Tatsukawa <[email protected]>
  • Loading branch information
kosuke-tatsu authored and yonghong-song committed Dec 5, 2022
1 parent a858abf commit d49d9aa
Showing 1 changed file with 150 additions and 3 deletions.
153 changes: 150 additions & 3 deletions tools/nfsslower.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
# This script traces some common NFS operations: read, write, opens and
# getattr. It measures the time spent in these operations, and prints details
# for each that exceeded a threshold.
# The script also traces commit operations, which is specific to nfs and could
# be pretty slow.
#
# WARNING: This adds low-overhead instrumentation to these NFS operations,
# including reads and writes from the file system cache. Such reads and writes
Expand All @@ -25,6 +27,8 @@
# Currently there aren't any tracepoints available for nfs_read_file,
# nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit
# tracepoints but we chose to use kprobes for consistency
# Raw tracepoints are used to trace nfs:nfs_initiate_commit and
# nfs:nfs_commit_done.
#
# 31-Aug-2017 Samuel Nair created this. Should work with NFSv{3,4}

Expand All @@ -41,8 +45,8 @@
./nfsslower -p 121 # trace pid 121 only
"""
parser = argparse.ArgumentParser(
description="""Trace READ, WRITE, OPEN \
and GETATTR NFS calls slower than a threshold,\
description="""Trace READ, WRITE, OPEN, GETATTR \
and COMMIT NFS calls slower than a threshold,\
supports NFSv{3,4}""",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
Expand All @@ -66,11 +70,16 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/dcache.h>
#ifndef KBUILD_MODNAME
#define KBUILD_MODNAME "bcc"
#endif
#include <linux/nfs_fs.h>
#define TRACE_READ 0
#define TRACE_WRITE 1
#define TRACE_OPEN 2
#define TRACE_GETATTR 3
#define TRACE_COMMIT 4
struct val_t {
u64 ts;
Expand All @@ -79,6 +88,12 @@
struct dentry *d;
};
struct commit_t {
u64 ts;
u64 offset;
u64 count;
};
struct data_t {
// XXX: switch some to u32's when supported
u64 ts_us;
Expand All @@ -93,6 +108,7 @@
BPF_HASH(entryinfo, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_HASH(commitinfo, u64, struct commit_t);
int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb,
struct iov_iter *data)
Expand Down Expand Up @@ -227,7 +243,119 @@
return trace_exit(ctx, TRACE_GETATTR);
}
static int trace_initiate_commit(struct nfs_commit_data *cd)
{
u64 key = (u64)cd;
struct commit_t c = { 0 };
c.ts = bpf_ktime_get_ns();
bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset),
&cd->args.offset);
bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count);
commitinfo.update(&key, &c);
return 0;
}
"""

bpf_text_raw_tp = """
RAW_TRACEPOINT_PROBE(nfs_initiate_commit)
{
// TP_PROTO(const struct nfs_commit_data *data)
struct nfs_commit_data *cd = (struct nfs_commit_data *)ctx->args[0];
return trace_initiate_commit(cd);
}
RAW_TRACEPOINT_PROBE(nfs_commit_done)
{
// TP_PROTO(const struct rpc_task *task, const struct nfs_commit_data *data)
struct nfs_commit_data *cd = (struct nfs_commit_data *)ctx->args[1];
u64 key = (u64)cd;
struct commit_t *cp = commitinfo.lookup(&key);
if (cp) {
struct nfs_open_context *p;
struct dentry *de;
struct qstr qs;
u64 ts = bpf_ktime_get_ns();
u64 delta_us = (ts - cp->ts) / 1000;
u32 pid = bpf_get_current_pid_tgid() >> 32;
struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
.size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
.pid = pid};
commitinfo.delete(&key);
bpf_get_current_comm(&data.task, sizeof(data.task));
if(FILTER_PID)
return 0;
if (FILTER_US)
return 0;
bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
if (qs.len) {
bpf_probe_read_kernel(&data.file, sizeof(data.file),
(void *)qs.name);
events.perf_submit(ctx, &data, sizeof(data));
}
}
return 0;
}
"""

bpf_text_kprobe = """
int trace_nfs_initiate_commit(struct pt_regs *ctx, void *clnt, struct nfs_commit_data *cd)
{
return trace_initiate_commit(cd);
}
int trace_nfs_commit_done(struct pt_regs *ctx, void *task, void *calldata)
{
struct nfs_commit_data *cd = (struct nfs_commit_data *)calldata;
u64 key = (u64)cd;
struct commit_t *cp = commitinfo.lookup(&key);
if (cp) {
struct nfs_open_context *p;
struct dentry *de;
struct qstr qs;
u64 ts = bpf_ktime_get_ns();
u64 delta_us = (ts - cp->ts) / 1000;
u32 pid = bpf_get_current_pid_tgid() >> 32;
struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
.size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
.pid = pid};
commitinfo.delete(&key);
bpf_get_current_comm(&data.task, sizeof(data.task));
if(FILTER_PID)
return 0;
if (FILTER_US)
return 0;
bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
if (qs.len) {
bpf_probe_read_kernel(&data.file, sizeof(data.file),
(void *)qs.name);
events.perf_submit(ctx, &data, sizeof(data));
}
}
return 0;
}
"""
is_support_raw_tp = BPF.support_raw_tracepoint()
if is_support_raw_tp:
bpf_text += bpf_text_raw_tp
else:
bpf_text += bpf_text_kprobe

if min_ms == 0:
bpf_text = bpf_text.replace('FILTER_US', '0')
else:
Expand All @@ -253,6 +381,8 @@ def print_event(cpu, data, size):
type = 'O'
elif event.type == 3:
type = 'G'
elif event.type == 4:
type = 'C'

if(csv):
print("%d,%s,%d,%s,%d,%d,%d,%s" % (
Expand All @@ -273,7 +403,18 @@ def print_event(cpu, data, size):
# Currently specifically works for NFSv4, the other kprobes are generic
# so it should work with earlier NFS versions

b = BPF(text=bpf_text)
# The following warning is shown on kernels after linux-5.18 when using bcc.
# Add compile option to silence it.
# In file included from /virtual/main.c:7:
# In file included from include/linux/nfs_fs.h:31:
# In file included from include/linux/sunrpc/auth.h:13:
# In file included from include/linux/sunrpc/sched.h:19:
# include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of constant 4611686018427387903 with expression of type '__u32' (aka 'unsigned int') is always false [-Wtautological-constant-out-of-range-compare]
# if (len > SIZE_MAX / sizeof(*p))
# ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~
# 1 warning generated.
b = BPF(text=bpf_text,
cflags=["-Wno-tautological-constant-out-of-range-compare"])
b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry")
Expand All @@ -286,6 +427,12 @@ def print_event(cpu, data, size):
b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return")
b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")

if not is_support_raw_tp:
b.attach_kprobe(event="nfs_initiate_commit",
fn_name="trace_nfs_initiate_commit")
b.attach_kprobe(event="nfs_commit_done",
fn_name="trace_nfs_commit_done")

if(csv):
print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
else:
Expand Down

0 comments on commit d49d9aa

Please sign in to comment.