Skip to content

Commit

Permalink
cmd/compile: intrinsify RotateLeft{32,64} on loong64
Browse files (browse the repository at this point in the history)
Benchmark on crypto/sha256 (provided by Xiaodong Liu):
name               old time/op    new time/op    delta
Hash8Bytes/New       1.19µs ± 0%    0.97µs ± 0%  -18.75%  (p=0.000 n=9+9)
Hash8Bytes/Sum224    1.21µs ± 0%    0.97µs ± 0%  -20.04%  (p=0.000 n=9+10)
Hash8Bytes/Sum256    1.21µs ± 0%    0.98µs ± 0%  -19.16%  (p=0.000 n=10+7)
Hash1K/New           15.9µs ± 0%    12.4µs ± 0%  -22.10%  (p=0.000 n=10+10)
Hash1K/Sum224        15.9µs ± 0%    12.4µs ± 0%  -22.18%  (p=0.000 n=8+10)
Hash1K/Sum256        15.9µs ± 0%    12.4µs ± 0%  -22.15%  (p=0.000 n=10+9)
Hash8K/New            119µs ± 0%      92µs ± 0%  -22.40%  (p=0.000 n=10+9)
Hash8K/Sum224         119µs ± 0%      92µs ± 0%  -22.41%  (p=0.000 n=9+10)
Hash8K/Sum256         119µs ± 0%      92µs ± 0%  -22.40%  (p=0.000 n=9+9)

name               old speed      new speed      delta
Hash8Bytes/New     6.70MB/s ± 0%  8.25MB/s ± 0%  +23.13%  (p=0.000 n=10+10)
Hash8Bytes/Sum224  6.60MB/s ± 0%  8.26MB/s ± 0%  +25.06%  (p=0.000 n=10+10)
Hash8Bytes/Sum256  6.59MB/s ± 0%  8.15MB/s ± 0%  +23.67%  (p=0.000 n=10+7)
Hash1K/New         64.3MB/s ± 0%  82.5MB/s ± 0%  +28.36%  (p=0.000 n=10+10)
Hash1K/Sum224      64.3MB/s ± 0%  82.6MB/s ± 0%  +28.51%  (p=0.000 n=10+10)
Hash1K/Sum256      64.3MB/s ± 0%  82.6MB/s ± 0%  +28.46%  (p=0.000 n=9+9)
Hash8K/New         69.0MB/s ± 0%  89.0MB/s ± 0%  +28.87%  (p=0.000 n=10+8)
Hash8K/Sum224      69.0MB/s ± 0%  89.0MB/s ± 0%  +28.88%  (p=0.000 n=9+10)
Hash8K/Sum256      69.0MB/s ± 0%  88.9MB/s ± 0%  +28.87%  (p=0.000 n=8+9)

Benchmark on crypto/sha512 (provided by Xiaodong Liu):
name               old time/op    new time/op     delta
Hash8Bytes/New       1.55µs ± 0%     1.31µs ± 0%  -15.67%  (p=0.000 n=10+10)
Hash8Bytes/Sum384    1.59µs ± 0%     1.35µs ± 0%  -14.97%  (p=0.000 n=10+10)
Hash8Bytes/Sum512    1.62µs ± 0%     1.39µs ± 0%  -14.02%  (p=0.000 n=10+10)
Hash1K/New           10.7µs ± 0%      8.6µs ± 0%  -19.60%  (p=0.000 n=8+8)
Hash1K/Sum384        10.8µs ± 0%      8.7µs ± 0%  -19.40%  (p=0.000 n=9+9)
Hash1K/Sum512        10.8µs ± 0%      8.7µs ± 0%  -19.35%  (p=0.000 n=9+10)
Hash8K/New           74.6µs ± 0%     59.6µs ± 0%  -20.08%  (p=0.000 n=10+9)
Hash8K/Sum384        74.7µs ± 0%     59.7µs ± 0%  -20.04%  (p=0.000 n=9+8)
Hash8K/Sum512        74.7µs ± 0%     59.7µs ± 0%  -20.01%  (p=0.000 n=10+10)

name               old speed      new speed       delta
Hash8Bytes/New     5.16MB/s ± 0%   6.12MB/s ± 0%  +18.60%  (p=0.000 n=10+8)
Hash8Bytes/Sum384  5.02MB/s ± 0%   5.90MB/s ± 0%  +17.56%  (p=0.000 n=10+10)
Hash8Bytes/Sum512  4.94MB/s ± 0%   5.74MB/s ± 0%  +16.29%  (p=0.000 n=10+9)
Hash1K/New         95.4MB/s ± 0%  118.6MB/s ± 0%  +24.38%  (p=0.000 n=10+10)
Hash1K/Sum384      95.0MB/s ± 0%  117.9MB/s ± 0%  +24.06%  (p=0.000 n=8+9)
Hash1K/Sum512      94.8MB/s ± 0%  117.5MB/s ± 0%  +23.99%  (p=0.000 n=8+9)
Hash8K/New          110MB/s ± 0%    137MB/s ± 0%  +25.11%  (p=0.000 n=9+6)
Hash8K/Sum384       110MB/s ± 0%    137MB/s ± 0%  +25.07%  (p=0.000 n=9+8)
Hash8K/Sum512       110MB/s ± 0%    137MB/s ± 0%  +25.01%  (p=0.000 n=10+10)

Change-Id: I28ccfce634659305a336c8e0a3f8589f7361d661
Reviewed-on: https://go-review.googlesource.com/c/go/+/422317
Reviewed-by: Keith Randall <[email protected]>
Run-TryBot: Wayne Zuo <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
Reviewed-by: David Chase <[email protected]>
  • Loading branch information
wdvxdr1123 authored and dr2chase committed Aug 30, 2022
1 parent 7d57446 commit e8f0340
Show file tree
Hide file tree
Showing 8 changed files with 147 additions and 49 deletions.
4 changes: 4 additions & 0 deletions src/cmd/compile/internal/loong64/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssa.OpLOONG64SLLV,
ssa.OpLOONG64SRLV,
ssa.OpLOONG64SRAV,
ssa.OpLOONG64ROTR,
ssa.OpLOONG64ROTRV,
ssa.OpLOONG64ADDF,
ssa.OpLOONG64ADDD,
ssa.OpLOONG64SUBF,
Expand Down Expand Up @@ -165,6 +167,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssa.OpLOONG64SLLVconst,
ssa.OpLOONG64SRLVconst,
ssa.OpLOONG64SRAVconst,
ssa.OpLOONG64ROTRconst,
ssa.OpLOONG64ROTRVconst,
ssa.OpLOONG64SGTconst,
ssa.OpLOONG64SGTUconst:
p := s.Prog(v.Op.Asm())
Expand Down
6 changes: 4 additions & 2 deletions src/cmd/compile/internal/ssa/gen/LOONG64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@
// rotates
(RotateLeft8 <t> x (MOVVconst [c])) => (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
(RotateLeft16 <t> x (MOVVconst [c])) => (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
(RotateLeft32 <t> x (MOVVconst [c])) => (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31])))
(RotateLeft64 <t> x (MOVVconst [c])) => (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63])))
(RotateLeft32 x y) => (ROTR x (NEGV <y.Type> y))
(RotateLeft64 x y) => (ROTRV x (NEGV <y.Type> y))

// unary ops
(Neg(64|32|16|8) ...) => (NEGV ...)
Expand Down Expand Up @@ -572,6 +572,8 @@
(SLLV x (MOVVconst [c])) => (SLLVconst x [c])
(SRLV x (MOVVconst [c])) => (SRLVconst x [c])
(SRAV x (MOVVconst [c])) => (SRAVconst x [c])
(ROTR x (MOVVconst [c])) => (ROTRconst x [c&31])
(ROTRV x (MOVVconst [c])) => (ROTRVconst x [c&63])

(SGT (MOVVconst [c]) x) && is32Bit(c) => (SGTconst [c] x)
(SGTU (MOVVconst [c]) x) && is32Bit(c) => (SGTUconst [c] x)
Expand Down
16 changes: 10 additions & 6 deletions src/cmd/compile/internal/ssa/gen/LOONG64Ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,16 @@ func init() {
{name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32

// shifts
{name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64
{name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt
{name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64
{name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned
{name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64
{name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed
{name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64
{name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt
{name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64
{name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned
{name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64
{name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed
{name: "ROTR", argLength: 2, reg: gp21, asm: "ROTR"}, // arg0 right rotate by (arg1 mod 32) bits
{name: "ROTRV", argLength: 2, reg: gp21, asm: "ROTRV"}, // arg0 right rotate by (arg1 mod 64) bits
{name: "ROTRconst", argLength: 1, reg: gp11, asm: "ROTR", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits, auxInt should be in the range 0 to 31.
{name: "ROTRVconst", argLength: 1, reg: gp11, asm: "ROTRV", aux: "Int64"}, // arg0 right rotate by auxInt bits, auxInt should be in the range 0 to 63.

// comparisons
{name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise
Expand Down
60 changes: 60 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/cmd/compile/internal/ssa/rewrite.go
Original file line number Diff line number Diff line change
Expand Up @@ -1988,7 +1988,7 @@ func canRotate(c *Config, bits int64) bool {
switch c.arch {
case "386", "amd64":
return true
case "arm", "arm64", "s390x", "ppc64", "ppc64le", "wasm":
case "arm", "arm64", "s390x", "ppc64", "ppc64le", "wasm", "loong64":
return bits >= 32
default:
return false
Expand Down
92 changes: 54 additions & 38 deletions src/cmd/compile/internal/ssa/rewriteLOONG64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions src/cmd/compile/internal/ssagen/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -4643,12 +4643,12 @@ func InitTables() {
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
sys.AMD64, sys.ARM, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm, sys.Loong64)
addF("math/bits", "RotateLeft64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
},
sys.AMD64, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
sys.AMD64, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm, sys.Loong64)
alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)

makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
Expand Down
12 changes: 12 additions & 0 deletions test/codegen/rotate.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,23 @@ func rot64(x uint64) uint64 {
// amd64:"ROLQ\t[$]7"
// ppc64:"ROTL\t[$]7"
// ppc64le:"ROTL\t[$]7"
// loong64: "ROTRV\t[$]57"
a += x<<7 | x>>57

// amd64:"ROLQ\t[$]8"
// arm64:"ROR\t[$]56"
// s390x:"RISBGZ\t[$]0, [$]63, [$]8, "
// ppc64:"ROTL\t[$]8"
// ppc64le:"ROTL\t[$]8"
// loong64: "ROTRV\t[$]56"
a += x<<8 + x>>56

// amd64:"ROLQ\t[$]9"
// arm64:"ROR\t[$]55"
// s390x:"RISBGZ\t[$]0, [$]63, [$]9, "
// ppc64:"ROTL\t[$]9"
// ppc64le:"ROTL\t[$]9"
// loong64: "ROTRV\t[$]55"
a += x<<9 ^ x>>55

// amd64:"ROLQ\t[$]10"
Expand All @@ -41,6 +44,7 @@ func rot64(x uint64) uint64 {
// ppc64le:"ROTL\t[$]10"
// arm64:"ROR\t[$]54"
// s390x:"RISBGZ\t[$]0, [$]63, [$]10, "
// loong64: "ROTRV\t[$]54"
a += bits.RotateLeft64(x, 10)

return a
Expand All @@ -53,6 +57,7 @@ func rot32(x uint32) uint32 {
// arm:"MOVW\tR\\d+@>25"
// ppc64:"ROTLW\t[$]7"
// ppc64le:"ROTLW\t[$]7"
// loong64: "ROTR\t[$]25"
a += x<<7 | x>>25

// amd64:`ROLL\t[$]8`
Expand All @@ -61,6 +66,7 @@ func rot32(x uint32) uint32 {
// s390x:"RLL\t[$]8"
// ppc64:"ROTLW\t[$]8"
// ppc64le:"ROTLW\t[$]8"
// loong64: "ROTR\t[$]24"
a += x<<8 + x>>24

// amd64:"ROLL\t[$]9"
Expand All @@ -69,6 +75,7 @@ func rot32(x uint32) uint32 {
// s390x:"RLL\t[$]9"
// ppc64:"ROTLW\t[$]9"
// ppc64le:"ROTLW\t[$]9"
// loong64: "ROTR\t[$]23"
a += x<<9 ^ x>>23

// amd64:"ROLL\t[$]10"
Expand All @@ -79,6 +86,7 @@ func rot32(x uint32) uint32 {
// ppc64le:"ROTLW\t[$]10"
// arm64:"RORW\t[$]22"
// s390x:"RLL\t[$]10"
// loong64: "ROTR\t[$]22"
a += bits.RotateLeft32(x, 10)

return a
Expand Down Expand Up @@ -127,12 +135,14 @@ func rot64nc(x uint64, z uint) uint64 {
// arm64:"ROR","NEG",-"AND"
// ppc64:"ROTL",-"NEG",-"AND"
// ppc64le:"ROTL",-"NEG",-"AND"
// loong64: "ROTRV", -"AND"
a += x<<z | x>>(64-z)

// amd64:"RORQ",-"AND"
// arm64:"ROR",-"NEG",-"AND"
// ppc64:"ROTL","NEG",-"AND"
// ppc64le:"ROTL","NEG",-"AND"
// loong64: "ROTRV", -"AND"
a += x>>z | x<<(64-z)

return a
Expand All @@ -147,12 +157,14 @@ func rot32nc(x uint32, z uint) uint32 {
// arm64:"ROR","NEG",-"AND"
// ppc64:"ROTLW",-"NEG",-"AND"
// ppc64le:"ROTLW",-"NEG",-"AND"
// loong64: "ROTR", -"AND"
a += x<<z | x>>(32-z)

// amd64:"RORL",-"AND"
// arm64:"ROR",-"NEG",-"AND"
// ppc64:"ROTLW","NEG",-"AND"
// ppc64le:"ROTLW","NEG",-"AND"
// loong64: "ROTR", -"AND"
a += x>>z | x<<(32-z)

return a
Expand Down

0 comments on commit e8f0340

Please sign in to comment.