diff options
| author | Lynn Boger <laboger@linux.vnet.ibm.com> | 2024-01-23 12:46:05 -0600 |
|---|---|---|
| committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2024-01-25 19:32:43 +0000 |
| commit | 4fde3ef2acef11738c83f6ad9147eba03b5da6d1 (patch) | |
| tree | 98d5f89431a7e350455e49461361ca989111a637 /src/math | |
| parent | 67d30fc315f90207a7fd8cde465ab14687835f1c (diff) | |
| download | go-4fde3ef2acef11738c83f6ad9147eba03b5da6d1.tar.xz | |
math/big,crypto/internal/bigmod: unroll loop in addMulVVW for ppc64x
This updates the assembly implementation of addMulVVW to
unroll the main loop to process four words per iteration
(32 bytes each of z and x, 64 bytes in total).
The code for addMulVVWx is based on the same code and has
also been updated to improve performance.
goos: linux
goarch: ppc64le
pkg: crypto/internal/bigmod
cpu: POWER10
│ bg.orig.out │ bg.out │
│ sec/op │ sec/op vs base │
ModAdd 116.3n ± 0% 116.9n ± 0% +0.52% (p=0.002 n=6)
ModSub 111.5n ± 0% 111.5n ± 0% 0.00% (p=0.273 n=6)
MontgomeryRepr 2.195µ ± 0% 1.944µ ± 0% -11.44% (p=0.002 n=6)
MontgomeryMul 2.195µ ± 0% 1.943µ ± 0% -11.48% (p=0.002 n=6)
ModMul 4.418µ ± 0% 3.900µ ± 0% -11.72% (p=0.002 n=6)
ExpBig 5.736m ± 0% 5.117m ± 0% -10.78% (p=0.002 n=6)
Exp 5.891m ± 0% 5.237m ± 0% -11.11% (p=0.002 n=6)
geomean 9.901µ 9.094µ -8.15%
goos: linux
goarch: ppc64le
pkg: math/big
cpu: POWER10
│ am.orig.out │ am.out │
│ sec/op │ sec/op vs base │
AddMulVVW/1 4.456n ± 1% 3.565n ± 0% -20.00% (p=0.002 n=6)
AddMulVVW/2 4.875n ± 1% 5.938n ± 1% +21.79% (p=0.002 n=6)
AddMulVVW/3 5.484n ± 0% 5.693n ± 0% +3.80% (p=0.002 n=6)
AddMulVVW/4 6.370n ± 0% 6.065n ± 0% -4.79% (p=0.002 n=6)
AddMulVVW/5 7.321n ± 0% 7.188n ± 0% -1.82% (p=0.002 n=6)
AddMulVVW/10 12.26n ± 8% 11.41n ± 0% -6.97% (p=0.002 n=6)
AddMulVVW/100 100.70n ± 0% 93.58n ± 0% -7.08% (p=0.002 n=6)
AddMulVVW/1000 938.6n ± 0% 845.5n ± 0% -9.92% (p=0.002 n=6)
AddMulVVW/10000 9.459µ ± 0% 8.415µ ± 0% -11.04% (p=0.002 n=6)
AddMulVVW/100000 94.57µ ± 0% 84.01µ ± 0% -11.16% (p=0.002 n=6)
geomean 75.17n 71.21n -5.27%
Change-Id: Idd79f5f02387564f4c2cc28d50b1c12bcd9a400f
Reviewed-on: https://go-review.googlesource.com/c/go/+/557915
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Paul Murphy <murp@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Diffstat (limited to 'src/math')
| -rw-r--r-- | src/math/big/arith_ppc64x.s | 93 |
1 files changed, 70 insertions, 23 deletions
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index 9512a12270..c483e252ab 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -599,33 +599,80 @@ done:
 // func addMulVVW(z, x []Word, y Word) (c Word)
 TEXT ·addMulVVW(SB), NOSPLIT, $0
-	MOVD z+0(FP), R10 // R10 = z[]
-	MOVD x+24(FP), R8 // R8 = x[]
-	MOVD y+48(FP), R9 // R9 = y
-	MOVD z_len+8(FP), R22 // R22 = z_len
+	MOVD z+0(FP), R3 // R3 = z[]
+	MOVD x+24(FP), R4 // R4 = x[]
+	MOVD y+48(FP), R5 // R5 = y
+	MOVD z_len+8(FP), R6 // R6 = z_len
-	MOVD R0, R3 // R3 will be the index register
-	CMP R0, R22
-	MOVD R0, R4 // R4 = c = 0
-	MOVD R22, CTR // Initialize loop counter
-	BEQ done
-	PCALIGN $16
+	CMP R6, $4
+	MOVD R0, R9 // R9 = c = 0
+	BLT tail
+	SRD $2, R6, R7
+	MOVD R7, CTR // Initialize loop counter
+	PCALIGN $16
 loop:
-	MOVD (R8)(R3), R20 // Load x[i]
-	MOVD (R10)(R3), R21 // Load z[i]
-	MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
-	MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
-	ADDC R21, R6 // R6 = z0
-	ADDZE R7 // R7 = z1
-	ADDC R4, R6 // R6 = z0 + c + 0
-	ADDZE R7, R4 // c += z1
-	MOVD R6, (R10)(R3) // Store z[i]
-	ADD $8, R3
-	BC 16, 0, loop // bdnz
+	MOVD 0(R4), R14 // x[i]
+	MOVD 8(R4), R16 // x[i+1]
+	MOVD 16(R4), R18 // x[i+2]
+	MOVD 24(R4), R20 // x[i+3]
+	MOVD 0(R3), R15 // z[i]
+	MOVD 8(R3), R17 // z[i+1]
+	MOVD 16(R3), R19 // z[i+2]
+	MOVD 24(R3), R21 // z[i+3]
+	MULLD R5, R14, R10 // low x[i]*y
+	MULHDU R5, R14, R11 // high x[i]*y
+	ADDC R15, R10
+	ADDZE R11
+	ADDC R9, R10
+	ADDZE R11, R9
+	MULLD R5, R16, R14 // low x[i+1]*y
+	MULHDU R5, R16, R15 // high x[i+1]*y
+	ADDC R17, R14
+	ADDZE R15
+	ADDC R9, R14
+	ADDZE R15, R9
+	MULLD R5, R18, R16 // low x[i+2]*y
+	MULHDU R5, R18, R17 // high x[i+2]*y
+	ADDC R19, R16
+	ADDZE R17
+	ADDC R9, R16
+	ADDZE R17, R9
+	MULLD R5, R20, R18 // low x[i+3]*y
+	MULHDU R5, R20, R19 // high x[i+3]*y
+	ADDC R21, R18
+	ADDZE R19
+	ADDC R9, R18
+	ADDZE R19, R9
+	MOVD R10, 0(R3) // z[i]
+	MOVD R14, 8(R3) // z[i+1]
+	MOVD R16, 16(R3) // z[i+2]
+	MOVD R18, 24(R3) // z[i+3]
+	ADD $32, R3
+	ADD $32, R4
+	BDNZ loop
+
+	ANDCC $3, R6
+tail:
+	CMP R0, R6
+	BEQ done
+	MOVD R6, CTR
+	PCALIGN $16
+tailloop:
+	MOVD 0(R4), R14
+	MOVD 0(R3), R15
+	MULLD R5, R14, R10
+	MULHDU R5, R14, R11
+	ADDC R15, R10
+	ADDZE R11
+	ADDC R9, R10
+	ADDZE R11, R9
+	MOVD R10, 0(R3)
+	ADD $8, R3
+	ADD $8, R4
+	BDNZ tailloop
 done:
-	MOVD R4, c+56(FP)
+	MOVD R9, c+56(FP)
 	RET
-
