Skip to content

Commit

Permalink
math/big: add PCALIGN to addMulVVW asm on ppc64x
Browse files Browse the repository at this point in the history
Adding PCALIGN to addMulVVW assembler implementation
provides the following improvement on power10:

    AddMulVVW/1         3.36ns ± 0%    3.37ns ± 0%   +0.20%
    AddMulVVW/2         4.45ns ± 0%    4.44ns ± 0%   -0.25%
    AddMulVVW/3         5.44ns ± 0%    5.49ns ± 0%   +0.84%
    AddMulVVW/4         6.43ns ± 0%    6.34ns ± 0%   -1.33%
    AddMulVVW/5         7.87ns ± 0%    7.73ns ± 0%   -1.70%
    AddMulVVW/10        13.4ns ± 3%    12.4ns ± 7%   -7.07%
    AddMulVVW/100        112ns ± 0%     102ns ± 0%   -9.34%
    AddMulVVW/1000      1.09µs ± 0%    0.95µs ± 0%  -13.15%
    AddMulVVW/10000     10.9µs ± 0%     9.6µs ± 0%  -12.46%
    AddMulVVW/100000     109µs ± 0%      95µs ± 0%  -12.58%

Change-Id: Ic33d4f125c84d568f63e17cf99dc4df5ca9328d9
Reviewed-on: https://go-review.googlesource.com/c/go/+/447236
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Bryan Mills <[email protected]>
Reviewed-by: Russ Cox <[email protected]>
Run-TryBot: Lynn Boger <[email protected]>
Reviewed-by: Paul Murphy <[email protected]>
Reviewed-by: Archana Ravindar <[email protected]>
  • Loading branch information
laboger committed Nov 3, 2022
1 parent ebb71ad commit 932330f
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions src/math/big/arith_ppc64x.s
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ TEXT ·addVV(SB), NOSPLIT, $0
// for small values of z_len (0.90x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).

PCALIGN $32
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
Expand Down Expand Up @@ -131,6 +133,8 @@ TEXT ·subVV(SB), NOSPLIT, $0
// for small values of z_len (0.92x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).

PCALIGN $32
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
Expand Down Expand Up @@ -212,6 +216,7 @@ TEXT ·addVW(SB), NOSPLIT, $0
CMP R0, R9
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
PCALIGN $32

loop:
MOVD 8(R8), R20 // R20 = x[i]
Expand Down Expand Up @@ -288,6 +293,8 @@ TEXT ·subVW(SB), NOSPLIT, $0
// The loop here is almost the same as the one used in s390x, but
// we don't need to capture CA every iteration because we've already
// done that above.

PCALIGN $32
loop:
MOVD 8(R8), R20
MOVD 16(R8), R21
Expand Down Expand Up @@ -358,6 +365,7 @@ TEXT ·shlVU(SB), NOSPLIT, $0
CMP R5, R0 // iterate from i=len(z)-1 to 0
BEQ loopexit // Already at end?
MOVD 0(R15),R10 // x[i]
PCALIGN $32
shloop:
SLD R9, R10, R10 // x[i]<<s
MOVDU -8(R15), R14
Expand Down Expand Up @@ -520,6 +528,7 @@ TEXT ·mulAddVWW(SB), NOSPLIT, $0
CMP R0, R14
MOVD R14, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
PCALIGN $32

loop:
MOVD 8(R8), R20 // R20 = x[i]
Expand Down Expand Up @@ -602,6 +611,7 @@ TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD R0, R4 // R4 = c = 0
MOVD R22, CTR // Initialize loop counter
BEQ done
PCALIGN $32

loop:
MOVD (R8)(R3), R20 // Load x[i]
Expand Down

0 comments on commit 932330f

Please sign in to comment.