cmd/compile: implement jump tables

Performance is kind of hard to exactly quantify. One big difference between jump tables and the old binary search scheme is that there's only 1 branch statement instead of O(n) of them. That can be both a blessing and a curse, and can make evaluating jump tables very hard to do. The single branch can become a choke point for the hardware branch predictor. A branch table jump must fit all of its state in a single branch predictor entry (technically, a branch target predictor entry). With binary search that predictor state can be spread among lots of entries. In cases where the case selection is repetitive and thus predictable, binary search can perform better. The big win for a jump table is that it doesn't consume so much of the branch predictor's resources. But that benefit is essentially never observed in microbenchmarks, because the branch predictor can easily keep state for all the binary search branches in a microbenchmark. So that benefit is really hard to measure. So predictable switch microbenchmarks are ~useless - they will almost always favor the binary search scheme. Fully unpredictable switch microbenchmarks are better, as they aren't lying to us quite so much. In a perfectly unpredictable situation, a jump table will expect to incur 1-1/N branch mispredicts, where a binary search would incur lg(N)/2 of them. That makes the crossover point at about N=4. But of course switches in real programs are seldom fully unpredictable, so we'll use a higher crossover point. Beyond the branch predictor, jump tables tend to execute more instructions per switch but have no additional instructions per case, which also argues for a larger crossover. As far as code size goes, with this CL cmd/go has a slightly smaller code segment and a slightly larger overall size (from the jump tables themselves which live in the data segment). This is a case where some FDO (feedback-directed optimization) would be really nice to have. 
#28262

Some large-program benchmarks might help make the case for this CL. Especially if we can turn on branch mispredict counters so we can see how much using jump tables can free up branch prediction resources that can be gainfully used elsewhere in the program.

name                   old time/op  new time/op  delta
Switch8Predictable     1.89ns ± 2%  1.27ns ± 3%  -32.58%  (p=0.000 n=9+10)
Switch8Unpredictable   9.33ns ± 1%  7.50ns ± 1%  -19.60%  (p=0.000 n=10+9)
Switch32Predictable    2.20ns ± 2%  1.64ns ± 1%  -25.39%  (p=0.000 n=10+9)
Switch32Unpredictable  10.0ns ± 2%   7.6ns ± 2%  -24.04%  (p=0.000 n=10+10)

Fixes #5496
Update #34381

Change-Id: I3ff56011d02be53f605ca5fd3fb96b905517c34f
Reviewed-on: https://go-review.googlesource.com/c/go/+/357330
Run-TryBot: Keith Randall <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
cloudflare · Apr 14, 2022 · 1ba96d8 · 1ba96d8
1 parent dd97871
commit 1ba96d8
Show file tree

Hide file tree

Showing 23 changed files with 428 additions and 40 deletions.
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
@@ -1400,6 +1400,16 @@ func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
  }
  }
 
+ case ssa.BlockAMD64JUMPTABLE:
+ // JMP *(TABLE)(INDEX*8)
+ p := s.Prog(obj.AJMP)
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = b.Controls[1].Reg()
+ p.To.Index = b.Controls[0].Reg()
+ p.To.Scale = 8
+ // Save jump tables for later resolution of the target blocks.
+ s.JumpTables = append(s.JumpTables, b)
+
  default:
  b.Fatalf("branch not implemented: %s", b.LongString())
  }

diff --git a/src/cmd/compile/internal/gc/obj.go b/src/cmd/compile/internal/gc/obj.go
@@ -271,6 +271,9 @@ func addGCLocals() {
  objw.Global(x, int32(len(x.P)), obj.RODATA|obj.DUPOK)
  x.Set(obj.AttrStatic, true)
  }
+ for _, jt := range fn.JumpTables {
+ objw.Global(jt.Sym, int32(len(jt.Targets)*base.Ctxt.Arch.PtrSize), obj.RODATA)
+ }
  }
 }
 

diff --git a/src/cmd/compile/internal/ir/node.go b/src/cmd/compile/internal/ir/node.go
@@ -310,6 +310,7 @@ const (
  ORESULT // result of a function call; Xoffset is stack offset
  OINLMARK // start of an inlined body, with file/line of caller. Xoffset is an index into the inline tree.
  OLINKSYMOFFSET // offset within a name
+ OJUMPTABLE // A jump table structure for implementing dense expression switches
 
  // opcodes for generics
  ODYNAMICDOTTYPE // x = i.(T) where T is a type parameter (or derived from a type parameter)

diff --git a/src/cmd/compile/internal/ir/node_gen.go b/src/cmd/compile/internal/ir/node_gen.go
diff --git a/src/cmd/compile/internal/ir/op_string.go b/src/cmd/compile/internal/ir/op_string.go
diff --git a/src/cmd/compile/internal/ir/stmt.go b/src/cmd/compile/internal/ir/stmt.go
@@ -8,6 +8,7 @@ import (
  "cmd/compile/internal/base"
  "cmd/compile/internal/types"
  "cmd/internal/src"
+ "go/constant"
 )
 
 // A Decl is a declaration of a const, type, or var. (A declared func is a Func.)
@@ -262,6 +263,37 @@ func NewIfStmt(pos src.XPos, cond Node, body, els []Node) *IfStmt {
  return n
 }
 
+// A JumpTableStmt is used to implement switches. Its semantics are:
+// tmp := jt.Idx
+// if tmp == Cases[0] goto Targets[0]
+// if tmp == Cases[1] goto Targets[1]
+// ...
+// if tmp == Cases[n] goto Targets[n]
+// Note that a JumpTableStmt is more like a multiway-goto than
+// a multiway-if. In particular, the case bodies are just
+// labels to jump to, not full Nodes lists.
+type JumpTableStmt struct {
+ miniStmt
+
+ // Value used to index the jump table.
+ // We support only integer types that
+ // are at most the size of a uintptr.
+ Idx Node
+
+ // If Idx is equal to Cases[i], jump to Targets[i].
+ // Cases entries must be distinct and in increasing order.
+ // The length of Cases and Targets must be equal.
+ Cases []constant.Value
+ Targets []*types.Sym
+}
+
+func NewJumpTableStmt(pos src.XPos, idx Node) *JumpTableStmt {
+ n := &JumpTableStmt{Idx: idx}
+ n.pos = pos
+ n.op = OJUMPTABLE
+ return n
+}
+
 // An InlineMarkStmt is a marker placed just before an inlined body.
 type InlineMarkStmt struct {
  miniStmt

diff --git a/src/cmd/compile/internal/ssa/check.go b/src/cmd/compile/internal/ssa/check.go
@@ -100,6 +100,10 @@ func checkFunc(f *Func) {
  if b.NumControls() != 0 {
  f.Fatalf("plain/dead block %s has a control value", b)
  }
+ case BlockJumpTable:
+ if b.NumControls() != 1 {
+ f.Fatalf("jumpTable block %s has no control value", b)
+ }
  }
  if len(b.Succs) != 2 && b.Likely != BranchUnknown {
  f.Fatalf("likeliness prediction %d for block %s with %d successors", b.Likely, b, len(b.Succs))

diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go
@@ -168,6 +168,9 @@ type Frontend interface {
 
  // MyImportPath provides the import name (roughly, the package) for the function being compiled.
  MyImportPath() string
+
+ // LSym returns the linker symbol of the function being compiled.
+ LSym() string
 }
 
 // NewConfig returns a new configuration object for the given architecture.

diff --git a/src/cmd/compile/internal/ssa/export_test.go b/src/cmd/compile/internal/ssa/export_test.go
@@ -102,6 +102,9 @@ func (d TestFrontend) Debug_checknil() bool { retu
 func (d TestFrontend) MyImportPath() string {
  return "my/import/path"
 }
+func (d TestFrontend) LSym() string {
+ return "my/import/path.function"
+}
 
 var testTypes Types
 

diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -517,6 +517,8 @@
 
 (If cond yes no) => (NE (TESTB cond cond) yes no)
 
+(JumpTable idx) => (JUMPTABLE {makeJumpTableSym(b)} idx (LEAQ <typ.Uintptr> {makeJumpTableSym(b)} (SB)))
+
 // Atomic loads. Other than preserving their ordering with respect to other loads, nothing special here.
 (AtomicLoad8 ptr mem) => (MOVBatomicload ptr mem)
 (AtomicLoad32 ptr mem) => (MOVLatomicload ptr mem)

diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -1001,6 +1001,12 @@ func init() {
  {name: "NEF", controls: 1},
  {name: "ORD", controls: 1}, // FP, ordered comparison (parity zero)
  {name: "NAN", controls: 1}, // FP, unordered comparison (parity one)
+
+ // JUMPTABLE implements jump tables.
+ // Aux is the symbol (an *obj.LSym) for the jump table.
+ // control[0] is the index into the jump table.
+ // control[1] is the address of the jump table (the address of the symbol stored in Aux).
+ {name: "JUMPTABLE", controls: 2, aux: "Sym"},
  }
 
  archs = append(archs, arch{

diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -639,12 +639,13 @@ var genericOps = []opData{
 // First [] [always, never]
 
 var genericBlocks = []blockData{
- {name: "Plain"}, // a single successor
- {name: "If", controls: 1}, // if Controls[0] goto Succs[0] else goto Succs[1]
- {name: "Defer", controls: 1}, // Succs[0]=defer queued, Succs[1]=defer recovered. Controls[0] is call op (of memory type)
- {name: "Ret", controls: 1}, // no successors, Controls[0] value is memory result
- {name: "RetJmp", controls: 1}, // no successors, Controls[0] value is a tail call
- {name: "Exit", controls: 1}, // no successors, Controls[0] value generates a panic
+ {name: "Plain"}, // a single successor
+ {name: "If", controls: 1}, // if Controls[0] goto Succs[0] else goto Succs[1]
+ {name: "Defer", controls: 1}, // Succs[0]=defer queued, Succs[1]=defer recovered. Controls[0] is call op (of memory type)
+ {name: "Ret", controls: 1}, // no successors, Controls[0] value is memory result
+ {name: "RetJmp", controls: 1}, // no successors, Controls[0] value is a tail call
+ {name: "Exit", controls: 1}, // no successors, Controls[0] value generates a panic
+ {name: "JumpTable", controls: 1}, // multiple successors, the integer Controls[0] selects which one
 
  // transient block state used for dead code removal
  {name: "First"}, // 2 successors, always takes the first one (second is dead)

diff --git a/src/cmd/compile/internal/ssa/gen/rulegen.go b/src/cmd/compile/internal/ssa/gen/rulegen.go
@@ -1838,6 +1838,8 @@ func (op opData) auxIntType() string {
 // auxType returns the Go type that this block should store in its aux field.
 func (b blockData) auxType() string {
  switch b.aux {
+ case "Sym":
+ return "Sym"
  case "S390XCCMask", "S390XCCMaskInt8", "S390XCCMaskUint8":
  return "s390x.CCMask"
  case "S390XRotateParams":

diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
@@ -5,6 +5,7 @@
 package ssa
 
 import (
+ "cmd/compile/internal/base"
  "cmd/compile/internal/logopt"
  "cmd/compile/internal/types"
  "cmd/internal/obj"
@@ -1954,3 +1955,9 @@ func logicFlags32(x int32) flagConstant {
  fcb.N = x < 0
  return fcb.encode()
 }
+
+func makeJumpTableSym(b *Block) *obj.LSym {
+ s := base.Ctxt.Lookup(fmt.Sprintf("%s.jump%d", b.Func.fe.LSym(), b.ID))
+ s.Set(obj.AttrDuplicateOK, true)
+ return s
+}
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go