From a1155d9e0204a6fa8bd48177b4b028e26cf02eb7 Mon Sep 17 00:00:00 2001
From: Abhinav Gupta
Date: Thu, 27 Jan 2022 08:54:04 -0800
Subject: [PATCH] Optimize AddCaller+AddStacktrace by sharing stacks (#1052)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(Supersedes #808.)

If we use the `AddCaller()` option for a logger together with
`AddStacktrace(level)`, which captures a stack trace for the specified
level or higher, Zap currently inspects the stack twice:

- `getCallerFrame` retrieves information about the immediate caller of
  the log entry
- `StackSkip` calls `takeStacktrace` to capture the entire stack trace
  and build a string from it

For the case where both the caller and the stack trace are requested,
we can avoid the redundant work by sharing information about the call
stack between the two code paths.

To accomplish this, the following high-level pieces were added:

    type stacktrace
    captureStacktrace(skip int, depth stacktraceDepth) *stacktrace
    func (*stacktrace) Next() (runtime.Frame, bool)

    type stackFormatter
    func newStackFormatter(*buffer.Buffer)
    func (*stackFormatter) FormatStack(*stacktrace)
    func (*stackFormatter) FormatFrame(runtime.Frame)

`captureStacktrace` captures stack traces (either just one frame or the
entire stack) that can then be inspected manually (to extract caller
information) and formatted into a string representation either one
frame at a time or wholesale with `stackFormatter`.

This allows us to reorganize the logic that applies the AddCaller and
AddStacktrace options so that it can:

- capture as much of the stack as it needs
- extract caller information from the first frame if we need it
- format that frame and the remaining frames into a stack trace

---

Benchmarking:
I ran the new benchmark included in this PR, as well as the existing
stack trace/caller-based benchmarks, 10 times for 10 seconds each.

```
% go test -run '^$' -bench 'Caller|Stack' -benchmem -benchtime 10s -count 10
[...]
name                       old time/op    new time/op    delta
StackField-96                 834ns ± 8%     819ns ±14%     ~     (p=0.971 n=10+10)
AddCallerHook-96              466ns ± 7%     480ns ± 1%     ~     (p=0.118 n=10+6)
AddCallerAndStacktrace-96    1.16µs ± 4%    0.78µs ± 8%  -32.60%  (p=0.000 n=9+10)
TakeStacktrace-96            2.66µs ± 2%    2.74µs ± 2%   +3.01%  (p=0.000 n=9+10)

name                       old alloc/op   new alloc/op   delta
StackField-96                  542B ± 0%      544B ± 0%   +0.33%  (p=0.000 n=10+10)
AddCallerHook-96               254B ± 0%      248B ± 0%   -2.36%  (p=0.000 n=10+10)
AddCallerAndStacktrace-96      652B ± 0%      429B ± 0%  -34.20%  (p=0.000 n=9+6)
TakeStacktrace-96              420B ± 0%      421B ± 0%   +0.24%  (p=0.000 n=10+10)

name                       old allocs/op  new allocs/op  delta
StackField-96                  3.00 ± 0%      3.00 ± 0%     ~     (all equal)
AddCallerHook-96               3.00 ± 0%      2.00 ± 0%  -33.33%  (p=0.000 n=10+10)
AddCallerAndStacktrace-96      5.00 ± 0%      3.00 ± 0%  -40.00%  (p=0.000 n=10+10)
TakeStacktrace-96              2.00 ± 0%      2.00 ± 0%     ~     (all equal)
```

Summary:
- AddCaller+AddStacktrace needs 2 fewer allocations and is ~30% faster
- AddCaller costs one allocation less
- takeStacktrace costs ~3% more CPU

I think the overall improvement is worth the slightly more expensive
stack traces.
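For orientation, here is a rough sketch of how the new pieces compose when
both a caller and a stack trace are wanted. It mirrors the reworked logic in
`Logger.check` below, but the wrapper function `callerAndStack` and its exact
signature are invented for this illustration and are not part of the change:

```go
// Sketch only: a hypothetical helper showing the intended call pattern,
// capturing the stack once and reusing its first frame as the caller.
func callerAndStack(skip int) (caller runtime.Frame, trace string) {
	stack := captureStacktrace(skip+1, stacktraceFull)
	defer stack.Free()

	// The first frame doubles as the caller.
	frame, more := stack.Next()
	caller = frame

	buf := bufferpool.Get()
	defer buf.Free()

	stackfmt := newStackFormatter(buf)
	stackfmt.FormatFrame(frame) // format the frame we already consumed
	if more {
		stackfmt.FormatStack(stack) // then the remaining frames
	}
	return caller, buf.String()
}
```

Capturing once and formatting lazily is what enables the shared fast path:
the caller needs only the first frame, while the stack trace formatter can
keep consuming the same frame iterator.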
Refs GO-1181 --- logger.go | 67 ++++++++++------- logger_bench_test.go | 18 +++++ stacktrace.go | 175 ++++++++++++++++++++++++++++++++----------- stacktrace_test.go | 32 ++++++++ 4 files changed, 224 insertions(+), 68 deletions(-) diff --git a/logger.go b/logger.go index c5948a6cf..087c74222 100644 --- a/logger.go +++ b/logger.go @@ -24,9 +24,9 @@ import ( "fmt" "io/ioutil" "os" - "runtime" "strings" + "go.uber.org/zap/internal/bufferpool" "go.uber.org/zap/zapcore" ) @@ -259,8 +259,10 @@ func (log *Logger) clone() *Logger { } func (log *Logger) check(lvl zapcore.Level, msg string) *zapcore.CheckedEntry { - // check must always be called directly by a method in the Logger interface - // (e.g., Check, Info, Fatal). + // Logger.check must always be called directly by a method in the + // Logger interface (e.g., Check, Info, Fatal). + // This skips Logger.check and the Info/Fatal/Check/etc. method that + // called it. const callerSkipOffset = 2 // Check the level first to reduce the cost of disabled log calls. @@ -307,42 +309,55 @@ func (log *Logger) check(lvl zapcore.Level, msg string) *zapcore.CheckedEntry { // Thread the error output through to the CheckedEntry. ce.ErrorOutput = log.errorOutput - if log.addCaller { - frame, defined := getCallerFrame(log.callerSkip + callerSkipOffset) - if !defined { + + addStack := log.addStack.Enabled(ce.Level) + if !log.addCaller && !addStack { + return ce + } + + // Adding the caller or stack trace requires capturing the callers of + // this function. We'll share information between these two. + stackDepth := stacktraceFirst + if addStack { + stackDepth = stacktraceFull + } + stack := captureStacktrace(log.callerSkip+callerSkipOffset, stackDepth) + defer stack.Free() + + if stack.Count() == 0 { + if log.addCaller { fmt.Fprintf(log.errorOutput, "%v Logger.check error: failed to get caller\n", ent.Time.UTC()) log.errorOutput.Sync() } + return ce + } + frame, more := stack.Next() + + if log.addCaller { ce.Caller = zapcore.EntryCaller{ - Defined: defined, + Defined: frame.PC != 0, PC: frame.PC, File: frame.File, Line: frame.Line, Function: frame.Function, } } - if log.addStack.Enabled(ce.Level) { - ce.Stack = StackSkip("", log.callerSkip+callerSkipOffset).String - } - return ce -} + if addStack { + buffer := bufferpool.Get() + defer buffer.Free() -// getCallerFrame gets caller frame. The argument skip is the number of stack -// frames to ascend, with 0 identifying the caller of getCallerFrame. The -// boolean ok is false if it was not possible to recover the information. -// -// Note: This implementation is similar to runtime.Caller, but it returns the whole frame. -func getCallerFrame(skip int) (frame runtime.Frame, ok bool) { - const skipOffset = 2 // skip getCallerFrame and Callers - - pc := make([]uintptr, 1) - numFrames := runtime.Callers(skip+skipOffset, pc) - if numFrames < 1 { - return + stackfmt := newStackFormatter(buffer) + + // We've already extracted the first frame, so format that + // separately and defer to stackfmt for the rest. 
+		stackfmt.FormatFrame(frame)
+		if more {
+			stackfmt.FormatStack(stack)
+		}
+		ce.Stack = buffer.String()
 	}
-	frame, _ = runtime.CallersFrames(pc).Next()
-	return frame, frame.PC != 0
+	return ce
 }
diff --git a/logger_bench_test.go b/logger_bench_test.go
index 71c6b5911..41e661577 100644
--- a/logger_bench_test.go
+++ b/logger_bench_test.go
@@ -179,6 +179,24 @@ func BenchmarkAddCallerHook(b *testing.B) {
 	})
 }
 
+func BenchmarkAddCallerAndStacktrace(b *testing.B) {
+	logger := New(
+		zapcore.NewCore(
+			zapcore.NewJSONEncoder(NewProductionConfig().EncoderConfig),
+			&ztest.Discarder{},
+			InfoLevel,
+		),
+		AddCaller(),
+		AddStacktrace(WarnLevel),
+	)
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			logger.Warn("Caller and stacktrace.")
+		}
+	})
+}
+
 func Benchmark10Fields(b *testing.B) {
 	withBenchedLogger(b, func(log *Logger) {
 		log.Info("Ten fields, passed at the log site.",
diff --git a/stacktrace.go b/stacktrace.go
index 0cf8c1ddf..3d187fa56 100644
--- a/stacktrace.go
+++ b/stacktrace.go
@@ -24,62 +24,153 @@ import (
 	"runtime"
 	"sync"
 
+	"go.uber.org/zap/buffer"
 	"go.uber.org/zap/internal/bufferpool"
 )
 
-var (
-	_stacktracePool = sync.Pool{
-		New: func() interface{} {
-			return newProgramCounters(64)
-		},
-	}
+var _stacktracePool = sync.Pool{
+	New: func() interface{} {
+		return &stacktrace{
+			storage: make([]uintptr, 64),
+		}
+	},
+}
+
+type stacktrace struct {
+	pcs    []uintptr // program counters; always a subslice of storage
+	frames *runtime.Frames
+
+	// The size of pcs varies depending on requirements:
+	// it will be one if only the first frame was requested,
+	// and otherwise it will reflect the depth of the call stack.
+	//
+	// storage decouples the slice we need (pcs) from the slice we pool.
+	// We will always allocate a reasonably large storage, but we'll use
+	// only as much of it as we need.
+	storage []uintptr
+}
+
+// stacktraceDepth specifies how deep a stack trace should be captured.
+type stacktraceDepth int
+
+const (
+	// stacktraceFirst captures only the first frame.
+	stacktraceFirst stacktraceDepth = iota
+
+	// stacktraceFull captures the entire call stack, allocating more
+	// storage for it if needed.
+	stacktraceFull
 )
 
-func takeStacktrace(skip int) string {
-	buffer := bufferpool.Get()
-	defer buffer.Free()
-	programCounters := _stacktracePool.Get().(*programCounters)
-	defer _stacktracePool.Put(programCounters)
-
-	var numFrames int
-	for {
-		// Skip the call to runtime.Callers and takeStacktrace so that the
-		// program counters start at the caller of takeStacktrace.
-		numFrames = runtime.Callers(skip+2, programCounters.pcs)
-		if numFrames < len(programCounters.pcs) {
-			break
-		}
-		// Don't put the too-short counter slice back into the pool; this lets
-		// the pool adjust if we consistently take deep stacktraces.
-		programCounters = newProgramCounters(len(programCounters.pcs) * 2)
+// captureStacktrace captures a stack trace of the specified depth, skipping
+// the provided number of frames. skip=0 identifies the caller of
+// captureStacktrace.
+//
+// The caller must call Free on the returned stacktrace after using it.
+func captureStacktrace(skip int, depth stacktraceDepth) *stacktrace {
+	stack := _stacktracePool.Get().(*stacktrace)
+
+	switch depth {
+	case stacktraceFirst:
+		stack.pcs = stack.storage[:1]
+	case stacktraceFull:
+		stack.pcs = stack.storage
 	}
-	i := 0
-	frames := runtime.CallersFrames(programCounters.pcs[:numFrames])
+	// Unlike other "skip"-based APIs, skip=0 identifies runtime.Callers
+	// itself. +2 to skip captureStacktrace and runtime.Callers.
+	numFrames := runtime.Callers(
+		skip+2,
+		stack.pcs,
+	)
 
-	// Note: On the last iteration, frames.Next() returns false, with a valid
-	// frame, but we ignore this frame. The last frame is a a runtime frame which
-	// adds noise, since it's only either runtime.main or runtime.goexit.
-	for frame, more := frames.Next(); more; frame, more = frames.Next() {
-		if i != 0 {
-			buffer.AppendByte('\n')
+	// runtime.Callers truncates the recorded stacktrace if there is no
+	// room in the provided slice. For the full stack trace, keep expanding
+	// storage until there are fewer frames than there is room.
+	if depth == stacktraceFull {
+		pcs := stack.pcs
+		for numFrames == len(pcs) {
+			pcs = make([]uintptr, len(pcs)*2)
+			numFrames = runtime.Callers(skip+2, pcs)
 		}
-		i++
-		buffer.AppendString(frame.Function)
-		buffer.AppendByte('\n')
-		buffer.AppendByte('\t')
-		buffer.AppendString(frame.File)
-		buffer.AppendByte(':')
-		buffer.AppendInt(int64(frame.Line))
+
+		// Discard old storage instead of returning it to the pool.
+		// This will adjust the pool size over time if stack traces are
+		// consistently very deep.
+		stack.storage = pcs
+		stack.pcs = pcs[:numFrames]
+	} else {
+		stack.pcs = stack.pcs[:numFrames]
 	}
 
+	stack.frames = runtime.CallersFrames(stack.pcs)
+	return stack
+}
+
+// Free releases resources associated with this stacktrace
+// and returns it to the pool.
+func (st *stacktrace) Free() {
+	st.frames = nil
+	st.pcs = nil
+	_stacktracePool.Put(st)
+}
+
+// Count reports the total number of frames in this stacktrace.
+// Count DOES NOT change as Next is called.
+func (st *stacktrace) Count() int {
+	return len(st.pcs)
+}
+
+// Next returns the next frame in the stack trace,
+// and a boolean indicating whether there are more after it.
+func (st *stacktrace) Next() (_ runtime.Frame, more bool) {
+	return st.frames.Next()
+}
+
+func takeStacktrace(skip int) string {
+	stack := captureStacktrace(skip+1, stacktraceFull)
+	defer stack.Free()
+
+	buffer := bufferpool.Get()
+	defer buffer.Free()
+
+	stackfmt := newStackFormatter(buffer)
+	stackfmt.FormatStack(stack)
 	return buffer.String()
 }
 
-type programCounters struct {
-	pcs []uintptr
+// stackFormatter formats a stack trace into a readable string representation.
+type stackFormatter struct {
+	b        *buffer.Buffer
+	nonEmpty bool // whether we've written at least one frame already
+}
+
+// newStackFormatter builds a new stackFormatter.
+func newStackFormatter(b *buffer.Buffer) stackFormatter {
+	return stackFormatter{b: b}
+}
+
+// FormatStack formats all remaining frames in the provided stacktrace -- minus
+// the final runtime.main/runtime.goexit frame.
+func (sf *stackFormatter) FormatStack(stack *stacktrace) {
+	// Note: On the last iteration, frames.Next() returns false, with a valid
+	// frame, but we ignore this frame. The last frame is a runtime frame which
+	// adds noise, since it's only either runtime.main or runtime.goexit.
+	for frame, more := stack.Next(); more; frame, more = stack.Next() {
+		sf.FormatFrame(frame)
+	}
 }
 
-func newProgramCounters(size int) *programCounters {
-	return &programCounters{make([]uintptr, size)}
+// FormatFrame formats the given frame.
+func (sf *stackFormatter) FormatFrame(frame runtime.Frame) { + if sf.nonEmpty { + sf.b.AppendByte('\n') + } + sf.nonEmpty = true + sf.b.AppendString(frame.Function) + sf.b.AppendByte('\n') + sf.b.AppendByte('\t') + sf.b.AppendString(frame.File) + sf.b.AppendByte(':') + sf.b.AppendInt(int64(frame.Line)) } diff --git a/stacktrace_test.go b/stacktrace_test.go index d473029ee..82b6af38e 100644 --- a/stacktrace_test.go +++ b/stacktrace_test.go @@ -21,6 +21,7 @@ package zap import ( + "bytes" "strings" "testing" @@ -67,8 +68,39 @@ func TestTakeStacktraceWithSkipInnerFunc(t *testing.T) { ) } +func TestTakeStacktraceDeepStack(t *testing.T) { + const ( + N = 500 + withStackDepthName = "go.uber.org/zap.withStackDepth" + ) + withStackDepth(N, func() { + trace := takeStacktrace(0) + for found := 0; found < N; found++ { + i := strings.Index(trace, withStackDepthName) + if i < 0 { + t.Fatalf(`expected %v occurrences of %q, found %d`, + N, withStackDepthName, found) + } + trace = trace[i+len(withStackDepthName):] + } + }) +} + func BenchmarkTakeStacktrace(b *testing.B) { for i := 0; i < b.N; i++ { takeStacktrace(0) } } + +func withStackDepth(depth int, f func()) { + var recurse func(rune) rune + recurse = func(r rune) rune { + if r > 0 { + bytes.Map(recurse, []byte(string([]rune{r - 1}))) + } else { + f() + } + return 0 + } + recurse(rune(depth)) +}