Finalized $entry, $latency, and $retval implementation including exam…

…ples and man
yezhengmao1 · Feb 13, 2016 · e350115 · e350115
1 parent 392d5c8
commit e350115
Show file tree

Hide file tree

Showing 3 changed files with 138 additions and 41 deletions.
diff --git a/man/man8/argdist.8 b/man/man8/argdist.8
@@ -2,7 +2,7 @@
 .SH NAME
 argdist \- Trace a function and display a histogram or frequency count of its parameter values. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-H specifier [specifier ...]] [-C specifier [specifier ...]]
+.B argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-v] [-T TOP] [-H specifier [specifier ...]] [-C specifier [specifier ...]]
 .SH DESCRIPTION
 argdist attaches to function entry and exit points, collects specified parameter
 values, and stores them in a histogram or a frequency collection that counts
@@ -30,6 +30,12 @@ Print the collected data every INTERVAL seconds. The default is 1 second.
 -n NUMBER
 Print the collected data COUNT times and then exit.
 .TP
+\-v
+Display the generated BPF program, for debugging purposes.
+.TP
+\-T TOP
+When collecting frequency counts, display only the top TOP entries.
+.TP
 -H SPECIFIER, -C SPECIFIER
 One or more probe specifications that instruct argdist which functions to
 probe, which parameters to collect, how to aggregate them, and whether to perform
@@ -47,8 +53,7 @@ count information, or aggregate the collected values into a histogram. Counting
 probes will collect the number of times every parameter value was observed,
 whereas histogram probes will collect the parameter values into a histogram.
 Only integral types can be used with histogram probes; there is no such limitation
-for counting probes. Return probes can only use the pseudo-variable $retval, which
-is an alias for the function's return value.
+for counting probes.
 .TP
 .B [library]
 Library containing the probe.
@@ -73,11 +78,20 @@ The expression to capture.
 These are the values that are assigned to the histogram or raw event collection.
 You may use the parameters directly, or valid C expressions that involve the
 parameters, such as "size % 10".
+Return probes can use the argument values received by the
+function when it was entered, through the $entry(paramname) special variable.
+Return probes can also access the function's return value in $retval, and the
+function's execution time in nanoseconds in $latency. Note that adding the
+$latency or $entry(paramname) variables to the expression will introduce an
+additional probe at the function's entry to collect this data, and therefore
+introduce additional overhead.
 .TP
 .B [filter]
 The filter applied to the captured data.
 Only parameter values that pass the filter will be collected. This is any valid
 C expression that refers to the parameter values, such as "fd == 1 && length > 16".
+The $entry, $retval, and $latency variables can be used here as well, in return
+probes.
 .TP
 .B [label]
 The label that will be displayed when printing the probed values. By default,
@@ -96,6 +110,10 @@ Snoop on all strings returned by gets():
 #
 .B argdist.py -C 'r:c:gets():char*:$retval'
 .TP
+Print a histogram of read sizes that were longer than 1ms:
+#
+.B argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000'
+.TP
 Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005:
 #
 .B argdist.py -p 1005 -C 'p:c:write(int fd):int:fd'
@@ -114,7 +132,7 @@ Count fork() calls in libc across all processes, grouped by pid:
 .TP
 Print histograms of sleep() and nanosleep() parameter values:
 #
-.B argdist.py -H 'p:c:sleep(u32 seconds):u32:seconds' -H 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
+.B argdist.py -H 'p:c:sleep(u32 seconds):u32:seconds' 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
 .TP
 Spy on writes to STDOUT performed by process 2780, up to a string size of 120 characters:
 #

diff --git a/tools/argdist.py b/tools/argdist.py
@@ -74,22 +74,30 @@ def _generate_entry(self):
  text = text.replace("PID_FILTER", pid_filter)
  collect = ""
  for pname in self.args_to_probe:
- collect += "%s.update(&pid, &%s);\n" % \
- (self.hashname_prefix + pname, pname)
+ param_hash = self.hashname_prefix + pname
+ if pname == "__latency":
+ collect += """
+u64 __time = bpf_ktime_get_ns();
+%s.update(&pid, &__time);
+""" % param_hash
+ else:
+ collect += "%s.update(&pid, &%s);\n" % \
+ (param_hash, pname)
  text = text.replace("COLLECT", collect)
  return text
 
  def _generate_entry_probe(self):
- # TODO $latency as a special keyword that should be traced
  # Any $entry(name) expressions result in saving that argument
  # when entering the function.
  self.args_to_probe = set()
  regex = r"\$entry\((\w+)\)"
- for arg in re.finditer(regex, self.expr or ""):
+ for arg in re.finditer(regex, self.expr):
  self.args_to_probe.add(arg.group(1))
- for arg in re.finditer(regex, self.filter or ""):
+ for arg in re.finditer(regex, self.filter):
  self.args_to_probe.add(arg.group(1))
-
+ if "$latency" in self.expr or "$latency" in self.filter:
+ self.args_to_probe.add("__latency")
+ self.param_types["__latency"] = "u64" # nanoseconds
  for pname in self.args_to_probe:
  if pname not in self.param_types:
  raise ValueError("$entry(%s): no such param" \
@@ -124,12 +132,15 @@ def _generate_retprobe_prefix(self):
 
  def _replace_entry_exprs(self):
  for pname, vname in self.param_val_names.items():
- entry_expr = "$entry(%s)" % pname
- val_expr = "*" + vname # dereference the pointer
+ if pname == "__latency":
+ entry_expr = "$latency"
+ val_expr = "(bpf_ktime_get_ns() - *%s)" % vname
+ else:
+ entry_expr = "$entry(%s)" % pname
+ val_expr = "(*%s)" % vname
  self.expr = self.expr.replace(entry_expr, val_expr)
- if self.filter is not None:
- self.filter = self.filter.replace(entry_expr,
- val_expr)
+ self.filter = self.filter.replace(entry_expr,
+ val_expr)
 
  def _attach_entry_probe(self):
  if self.is_user:
@@ -177,14 +188,14 @@ def __init__(self, type, specifier, pid):
  self.expr_type = \
  "u64" if not self.is_ret_probe else "int"
  self.expr = "1" if not self.is_ret_probe else "$retval"
- self.filter = None if len(parts) != 6 else parts[5]
+ self.filter = "" if len(parts) != 6 else parts[5]
  self._substitute_exprs()
 
  # Do we need to attach an entry probe so that we can collect an 
  # argument that is required for an exit (return) probe?
  self.entry_probe_required = self.is_ret_probe and \
- ("$entry" in self.expr or \
- "$entry" in (self.filter or ""))
+ ("$entry" in self.expr or "$entry" in self.filter or
+ "$latency" in self.expr or "$latency" in self.filter)
 
  self.pid = pid
  # Generating unique names for probes means we can attach
@@ -198,9 +209,8 @@ def __init__(self, type, specifier, pid):
  def _substitute_exprs(self):
  self.expr = self.expr.replace("$retval",
  "(%s)ctx->ax" % self.expr_type)
- if self.filter is not None:
- self.filter = self.filter.replace("$retval",
- "(%s)ctx->ax" % self.expr_type)
+ self.filter = self.filter.replace("$retval",
+ "(%s)ctx->ax" % self.expr_type)
  self.expr = self._substitute_aliases(self.expr)
  self.filter = self._substitute_aliases(self.filter)
 
@@ -264,7 +274,8 @@ def generate_text(self, string_size):
  (self.expr_type, self.expr)
  program = program.replace("DATA_DECL", decl)
  program = program.replace("KEY_EXPR", key_expr) 
- program = program.replace("FILTER", self.filter or "1") 
+ program = program.replace("FILTER",
+ "1" if len(self.filter) == 0 else self.filter) 
  program = program.replace("COLLECT", collect)
  program = program.replace("PREFIX", prefix)
  return program
@@ -293,9 +304,9 @@ def attach(self, bpf):
  self._attach_entry_probe()
 
  def display(self, top):
- print(self.label or self.raw_spec)
  data = self.bpf.get_table(self.probe_hash_name)
  if self.type == "freq":
+ print(self.label or self.raw_spec)
  print("\t%-10s %s" % ("COUNT", "EVENT"))
  data = sorted(data.items(), key=lambda kv: kv[1].value)
  if top is not None:
@@ -317,16 +328,17 @@ def display(self, top):
  print("\t%-10s %s" % \
  (str(value.value), key_str))
  elif self.type == "hist":
- label = self.expr if not self.is_default_expr \
- else "retval"
+ label = self.label or \
+ (self.expr if not self.is_default_expr \
+ else "retval")
  data.print_log2_hist(val_type=label)
 
 examples = """
 Probe specifier syntax:
  {p,r}:[library]:function(signature)[:type:expr[:filter]][;label]
 Where:
  p,r -- probe at function entry or at function exit
- in exit probes, only $retval is accessible
+ in exit probes: can use $retval, $entry(param), $latency
  library -- the library that contains the function
  (leave empty for kernel functions)
  function -- the function name to trace
@@ -348,6 +360,9 @@ def display(self, top):
 argdist.py -C 'r:c:gets():char*:$retval;snooped strings'
  Snoop on all strings returned by gets()
 
+argdist.py -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size);ns per byte'
+ Print a histogram of nanoseconds per byte from kmalloc allocations
+
 argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' -T 5
  Print frequency counts of how many times writes were issued to a
  particular file descriptor number, in process 1005, but only show
@@ -356,6 +371,12 @@ def display(self, top):
 argdist.py -p 1005 -H 'r:c:read()'
  Print a histogram of error codes returned by read() in process 1005
 
+argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+ Print frequency of reads by process where the latency was >0.1ms
+
+argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' 
+ Print a histogram of read sizes that were longer than 1ms
+
 argdist.py -H \\
  'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1'
  Print a histogram of buffer sizes passed to write() across all

diff --git a/tools/argdist_examples.txt b/tools/argdist_examples.txt
@@ -183,6 +183,54 @@ r:c:read()
  1024 -> 2047 : 0 | |
  2048 -> 4095 : 2 |** |
 
+In return probes, you can also trace the latency of the function (unless it is
+recursive) and the parameters it had on entry. For example, we can identify
+which processes are performing slow synchronous filesystem reads -- say,
+longer than 0.1ms (100,000ns):
+
+# ./argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+[01:08:48]
+r::__vfs_read():u32:$PID:$latency > 100000
+ COUNT EVENT
+ 1 bpf_get_current_pid_tgid() = 10457
+ 21 bpf_get_current_pid_tgid() = 2780
+[01:08:49]
+r::__vfs_read():u32:$PID:$latency > 100000
+ COUNT EVENT
+ 1 bpf_get_current_pid_tgid() = 10457
+ 21 bpf_get_current_pid_tgid() = 2780
+^C
+
+As you see, the $PID alias is expanded to the BPF function bpf_get_current_pid_tgid(),
+which returns the current process' pid. It looks like process 2780 performed
+21 slow reads.
+
+Occasionally, entry parameter values are also interesting. For example, you
+might be curious how long it takes malloc() to allocate memory -- nanoseconds
+per byte allocated. Let's go:
+
+# ./argdist.py -H 'r:c:malloc(size_t size):u64:$latency/$entry(size);ns per byte' -n 1 -i 10
+[01:11:13]
+ ns per byte : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 4 |***************** |
+ 4 -> 7 : 3 |************* |
+ 8 -> 15 : 2 |******** |
+ 16 -> 31 : 1 |**** |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 7 |******************************* |
+ 128 -> 255 : 1 |**** |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 1 |**** |
+ 1024 -> 2047 : 1 |**** |
+ 2048 -> 4095 : 9 |****************************************|
+ 4096 -> 8191 : 1 |**** | 
+
+It looks like a tri-modal distribution. Some allocations are extremely cheap,
+and take 2-15 nanoseconds per byte. Other allocations are slower, and take
+64-127 nanoseconds per byte. And some allocations are slower still, and take
+multiple microseconds per byte.
+
 Here's a final example that finds how many write() system calls are performed
 by each process on the system:
 
@@ -200,15 +248,13 @@ write by process
  23 bpf_get_current_pid_tgid() = 7615
  23 bpf_get_current_pid_tgid() = 2480
 
-As you see, the $PID alias is expanded to the BPF function bpf_get_current_pid_tgid(),
-which returns the current process' pid.
-
 
 USAGE message:
 
-usage: argdist.py [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT]
- [-H [HISTSPECIFIER [HISTSPECIFIER ...]]]
- [-C [COUNTSPECIFIER [COUNTSPECIFIER ...]]] [-v]
+# argdist.py -h
+usage: argdist.py [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-v]
+ [-T TOP] [-H [HISTSPECIFIER [HISTSPECIFIER ...]]]
+ [-C [COUNTSPECIFIER [COUNTSPECIFIER ...]]]
 
 Trace a function and display a summary of its parameter values.
 
@@ -221,19 +267,21 @@ optional arguments:
  output interval, in seconds
  -n COUNT, --number COUNT
  number of outputs
+ -v, --verbose print resulting BPF program code before executing
+ -T TOP, --top TOP number of top results to show (not applicable to
+ histograms)
  -H [HISTSPECIFIER [HISTSPECIFIER ...]], --histogram [HISTSPECIFIER [HISTSPECIFIER ...]]
  probe specifier to capture histogram of (see examples
  below)
  -C [COUNTSPECIFIER [COUNTSPECIFIER ...]], --count [COUNTSPECIFIER [COUNTSPECIFIER ...]]
  probe specifier to capture count of (see examples
  below)
- -v, --verbose print resulting BPF program code before executing
 
 Probe specifier syntax:
  {p,r}:[library]:function(signature)[:type:expr[:filter]][;label]
 Where:
  p,r -- probe at function entry or at function exit
- in exit probes, only $retval is accessible
+ in exit probes: can use $retval, $entry(param), $latency
  library -- the library that contains the function
  (leave empty for kernel functions)
  function -- the function name to trace
@@ -255,29 +303,39 @@ argdist.py -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16'
 argdist.py -C 'r:c:gets():char*:$retval;snooped strings'
  Snoop on all strings returned by gets()
 
-argdist.py -p 1005 -C 'p:c:write(int fd):int:fd'
+argdist.py -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size);ns per byte'
+ Print a histogram of nanoseconds per byte from kmalloc allocations
+
+argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' -T 5
  Print frequency counts of how many times writes were issued to a
- particular file descriptor number, in process 1005
+ particular file descriptor number, in process 1005, but only show
+ the top 5 busiest fds
 
 argdist.py -p 1005 -H 'r:c:read()'
  Print a histogram of error codes returned by read() in process 1005
 
+argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+ Print frequency of reads by process where the latency was >0.1ms
+
+argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' 
+ Print a histogram of read sizes that were longer than 1ms
+
 argdist.py -H \
  'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1'
  Print a histogram of buffer sizes passed to write() across all
  processes, where the file descriptor was 1 (STDOUT)
 
 argdist.py -C 'p:c:fork();fork calls'
  Count fork() calls in libc across all processes
- Can also use funccount.py, which is easier and more flexible
+ Can also use funccount.py, which is easier and more flexible 
 
-argdist.py \
- -H 'p:c:sleep(u32 seconds):u32:seconds' \
- -H 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
+argdist.py -H \
+ 'p:c:sleep(u32 seconds):u32:seconds' \
+ 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
  Print histograms of sleep() and nanosleep() parameter values
 
 argdist.py -p 2780 -z 120 \
  -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1'
  Spy on writes to STDOUT performed by process 2780, up to a string size
- of 120 characters
+ of 120 characters