Skip to content

Commit

Permalink
cmd/compile: lower Add64/Sub64 into ssa on PPC64
Browse files Browse the repository at this point in the history
math/bits.Add64 and math/bits.Sub64 now lower and optimize
directly in SSA form.

The optimization of carry chains focuses around eliding
XER<->GPR transfers of the CA bit when used exclusively as an
input to a single carry operations, or when the CA value is
known.

This also adds support for handling XER spills in the assembler
which could happen if carry chains contain inter-dependencies
on each other (which seems very unlikely with practical usage),
or a clobber happens (SRAW/SRAD/SUBFC operations clobber CA).

With PPC64 Add64/Sub64 lowering into SSA and this patch, the net
performance difference in crypto/elliptic benchmarks on P9/ppc64le
are:

name                                old time/op    new time/op    delta
ScalarBaseMult/P256                   46.3µs ± 0%    46.9µs ± 0%   +1.34%
ScalarBaseMult/P224                    356µs ± 0%     209µs ± 0%  -41.14%
ScalarBaseMult/P384                   1.20ms ± 0%    0.57ms ± 0%  -52.14%
ScalarBaseMult/P521                   3.38ms ± 0%    1.44ms ± 0%  -57.27%
ScalarMult/P256                        199µs ± 0%     199µs ± 0%   -0.17%
ScalarMult/P224                        357µs ± 0%     212µs ± 0%  -40.56%
ScalarMult/P384                       1.20ms ± 0%    0.58ms ± 0%  -51.86%
ScalarMult/P521                       3.37ms ± 0%    1.44ms ± 0%  -57.32%
MarshalUnmarshal/P256/Uncompressed    2.59µs ± 0%    2.52µs ± 0%   -2.63%
MarshalUnmarshal/P256/Compressed      2.58µs ± 0%    2.52µs ± 0%   -2.06%
MarshalUnmarshal/P224/Uncompressed    1.54µs ± 0%    1.40µs ± 0%   -9.42%
MarshalUnmarshal/P224/Compressed      1.54µs ± 0%    1.39µs ± 0%   -9.87%
MarshalUnmarshal/P384/Uncompressed    2.40µs ± 0%    1.80µs ± 0%  -24.93%
MarshalUnmarshal/P384/Compressed      2.35µs ± 0%    1.81µs ± 0%  -23.03%
MarshalUnmarshal/P521/Uncompressed    3.79µs ± 0%    2.58µs ± 0%  -31.81%
MarshalUnmarshal/P521/Compressed      3.80µs ± 0%    2.60µs ± 0%  -31.67%

Note, P256 uses an asm implementation, thus, little variation is expected.

Change-Id: I88a24f6bf0f4f285c649e40243b1ab69cc452b71
Reviewed-on: https://go-review.googlesource.com/c/go/+/346870
Reviewed-by: Lynn Boger <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
Run-TryBot: Paul Murphy <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
  • Loading branch information
pmur committed May 10, 2022
1 parent 526de61 commit 0c43878
Show file tree
Hide file tree
Showing 9 changed files with 302 additions and 56 deletions.
5 changes: 5 additions & 0 deletions src/cmd/asm/internal/asm/testdata/ppc64.s
Original file line number Diff line number Diff line change
Expand Up @@ -814,4 +814,9 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
BCL $16,CR0LT,0(PC) // 42000001
BC $18,CR0LT,0(PC) // 42400000

MOVD SPR(3), 4(R1) // 7fe302a6fbe10004
MOVD XER, 4(R1) // 7fe102a6fbe10004
MOVD 4(R1), SPR(3) // ebe100047fe303a6
MOVD 4(R1), XER // ebe100047fe103a6

RET
25 changes: 0 additions & 25 deletions src/cmd/compile/internal/ppc64/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,31 +143,6 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p1.To.Type = obj.TYPE_REG
p1.To.Reg = v.Reg1()

case ssa.OpPPC64LoweredAdd64Carry:
// ADDC Rarg2, -1, Rtmp
// ADDE Rarg1, Rarg0, Reg0
// ADDZE Rzero, Reg1
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
r2 := v.Args[2].Reg()
p := s.Prog(ppc64.AADDC)
p.From.Type = obj.TYPE_CONST
p.From.Offset = -1
p.Reg = r2
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
p1 := s.Prog(ppc64.AADDE)
p1.From.Type = obj.TYPE_REG
p1.From.Reg = r1
p1.Reg = r0
p1.To.Type = obj.TYPE_REG
p1.To.Reg = v.Reg0()
p2 := s.Prog(ppc64.AADDZE)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = ppc64.REGZERO
p2.To.Type = obj.TYPE_REG
p2.To.Reg = v.Reg1()

case ssa.OpPPC64LoweredAtomicAnd8,
ssa.OpPPC64LoweredAtomicAnd32,
ssa.OpPPC64LoweredAtomicOr8,
Expand Down
17 changes: 16 additions & 1 deletion src/cmd/compile/internal/ssa/gen/PPC64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
// (x + y) / 2 with x>=y => (x - y) / 2 + y
(Avg64u <t> x y) => (ADD (SRDconst <t> (SUB <t> x y) [1]) y)

(Add64carry ...) => (LoweredAdd64Carry ...)
(Mul64 ...) => (MULLD ...)
(Mul(32|16|8) ...) => (MULLW ...)
(Mul64uhilo ...) => (LoweredMuluhilo ...)
Expand Down Expand Up @@ -103,6 +102,22 @@
(ConstNil) => (MOVDconst [0])
(ConstBool [t]) => (MOVDconst [b2i(t)])

// Carrying addition.
(Select0 (Add64carry x y c)) => (Select0 <typ.UInt64> (ADDE x y (Select1 <typ.UInt64> (ADDCconst c [-1]))))
(Select1 (Add64carry x y c)) => (ADDZEzero (Select1 <typ.UInt64> (ADDE x y (Select1 <typ.UInt64> (ADDCconst c [-1])))))
// Fold initial carry bit if 0.
(ADDE x y (Select1 <typ.UInt64> (ADDCconst (MOVDconst [0]) [-1]))) => (ADDC x y)
// Fold transfer of CA -> GPR -> CA. Note 2 uses when feeding into a chained Add64carry.
(Select1 (ADDCconst n:(ADDZEzero x) [-1])) && n.Uses <= 2 => x

// Borrowing subtraction.
(Select0 (Sub64borrow x y c)) => (Select0 <typ.UInt64> (SUBE x y (Select1 <typ.UInt64> (SUBCconst c [0]))))
(Select1 (Sub64borrow x y c)) => (NEG (SUBZEzero (Select1 <typ.UInt64> (SUBE x y (Select1 <typ.UInt64> (SUBCconst c [0]))))))
// Fold initial borrow bit if 0.
(SUBE x y (Select1 <typ.UInt64> (SUBCconst (MOVDconst [0]) [0]))) => (SUBC x y)
// Fold transfer of CA -> GPR -> CA. Note 2 uses when feeding into a chained Sub64borrow.
(Select1 (SUBCconst n:(NEG (SUBZEzero x)) [0])) && n.Uses <= 2 => x

// Constant folding
(FABS (FMOVDconst [x])) => (FMOVDconst [math.Abs(x)])
(FSQRT (FMOVDconst [x])) && x >= 0 => (FMOVDconst [math.Sqrt(x)])
Expand Down
3 changes: 0 additions & 3 deletions src/cmd/compile/internal/ssa/gen/PPC64Ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ func init() {
gp2xer1xer = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, xer}, outputs: []regMask{gp, xer}, clobbers: xer}
gp31 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
gp22 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}}
gp32 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}}
gp1cr = regInfo{inputs: []regMask{gp | sp | sb}}
gp2cr = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
crgp = regInfo{inputs: nil, outputs: []regMask{gp}}
Expand Down Expand Up @@ -225,8 +224,6 @@ func init() {
{name: "CLRLSLWI", argLength: 1, reg: gp11, asm: "CLRLSLWI", aux: "Int32"}, //
{name: "CLRLSLDI", argLength: 1, reg: gp11, asm: "CLRLSLDI", aux: "Int32"}, //

{name: "LoweredAdd64Carry", argLength: 3, reg: gp32, resultNotInArgs: true}, // arg0 + arg1 + carry, returns (sum, carry)

// Operations which consume or generate the CA (xer)
{name: "ADDC", argLength: 2, reg: gp21xer, asm: "ADDC", commutative: true, typ: "(UInt64, UInt64)"}, // arg0 + arg1 -> out, CA
{name: "SUBC", argLength: 2, reg: gp21xer, asm: "SUBC", typ: "(UInt64, UInt64)"}, // arg0 - arg1 -> out, CA
Expand Down
17 changes: 0 additions & 17 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

207 changes: 204 additions & 3 deletions src/cmd/compile/internal/ssa/rewritePPC64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/cmd/compile/internal/ssagen/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -4687,7 +4687,7 @@ func InitTables() {
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
},
sys.AMD64, sys.ARM64, sys.S390X)
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X)
alias("math/bits", "Sub", "math/bits", "Sub64", sys.ArchAMD64, sys.ArchARM64, sys.ArchS390X)
addF("math/bits", "Div64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
Expand Down
Loading

0 comments on commit 0c43878

Please sign in to comment.