aboutsummaryrefslogtreecommitdiff
path: root/src/math
diff options
context:
space:
mode:
authorLynn Boger <laboger@linux.vnet.ibm.com>2024-01-23 12:46:05 -0600
committerLynn Boger <laboger@linux.vnet.ibm.com>2024-01-25 19:32:43 +0000
commit4fde3ef2acef11738c83f6ad9147eba03b5da6d1 (patch)
tree98d5f89431a7e350455e49461361ca989111a637 /src/math
parent67d30fc315f90207a7fd8cde465ab14687835f1c (diff)
downloadgo-4fde3ef2acef11738c83f6ad9147eba03b5da6d1.tar.xz
math/big,crypto/internal/bigmod: unroll loop in addMulVVW for ppc64x
This updates the assembly implementation of AddMulVVW to unroll the main loop to do 64 bytes at a time. The code for addMulVVWx is based on the same code and has also been updated to improve performance. goos: linux goarch: ppc64le pkg: crypto/internal/bigmod cpu: POWER10 │ bg.orig.out │ bg.out │ │ sec/op │ sec/op vs base │ ModAdd 116.3n ± 0% 116.9n ± 0% +0.52% (p=0.002 n=6) ModSub 111.5n ± 0% 111.5n ± 0% 0.00% (p=0.273 n=6) MontgomeryRepr 2.195µ ± 0% 1.944µ ± 0% -11.44% (p=0.002 n=6) MontgomeryMul 2.195µ ± 0% 1.943µ ± 0% -11.48% (p=0.002 n=6) ModMul 4.418µ ± 0% 3.900µ ± 0% -11.72% (p=0.002 n=6) ExpBig 5.736m ± 0% 5.117m ± 0% -10.78% (p=0.002 n=6) Exp 5.891m ± 0% 5.237m ± 0% -11.11% (p=0.002 n=6) geomean 9.901µ 9.094µ -8.15% goos: linux goarch: ppc64le pkg: math/big cpu: POWER10 │ am.orig.out │ am.out │ │ sec/op │ sec/op vs base │ AddMulVVW/1 4.456n ± 1% 3.565n ± 0% -20.00% (p=0.002 n=6) AddMulVVW/2 4.875n ± 1% 5.938n ± 1% +21.79% (p=0.002 n=6) AddMulVVW/3 5.484n ± 0% 5.693n ± 0% +3.80% (p=0.002 n=6) AddMulVVW/4 6.370n ± 0% 6.065n ± 0% -4.79% (p=0.002 n=6) AddMulVVW/5 7.321n ± 0% 7.188n ± 0% -1.82% (p=0.002 n=6) AddMulVVW/10 12.26n ± 8% 11.41n ± 0% -6.97% (p=0.002 n=6) AddMulVVW/100 100.70n ± 0% 93.58n ± 0% -7.08% (p=0.002 n=6) AddMulVVW/1000 938.6n ± 0% 845.5n ± 0% -9.92% (p=0.002 n=6) AddMulVVW/10000 9.459µ ± 0% 8.415µ ± 0% -11.04% (p=0.002 n=6) AddMulVVW/100000 94.57µ ± 0% 84.01µ ± 0% -11.16% (p=0.002 n=6) geomean 75.17n 71.21n -5.27% Change-Id: Idd79f5f02387564f4c2cc28d50b1c12bcd9a400f Reviewed-on: https://go-review.googlesource.com/c/go/+/557915 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Paul Murphy <murp@ibm.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Filippo Valsorda <filippo@golang.org>
Diffstat (limited to 'src/math')
-rw-r--r--src/math/big/arith_ppc64x.s93
1 files changed, 70 insertions, 23 deletions
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index 9512a12270..c483e252ab 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -599,33 +599,80 @@ done:
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB), NOSPLIT, $0
- MOVD z+0(FP), R10 // R10 = z[]
- MOVD x+24(FP), R8 // R8 = x[]
- MOVD y+48(FP), R9 // R9 = y
- MOVD z_len+8(FP), R22 // R22 = z_len
+ MOVD z+0(FP), R3 // R3 = z[]
+ MOVD x+24(FP), R4 // R4 = x[]
+ MOVD y+48(FP), R5 // R5 = y
+ MOVD z_len+8(FP), R6 // R6 = z_len
- MOVD R0, R3 // R3 will be the index register
- CMP R0, R22
- MOVD R0, R4 // R4 = c = 0
- MOVD R22, CTR // Initialize loop counter
- BEQ done
- PCALIGN $16
+ CMP R6, $4
+ MOVD R0, R9 // R9 = c = 0
+ BLT tail
+ SRD $2, R6, R7
+ MOVD R7, CTR // Initialize loop counter
+ PCALIGN $16
loop:
- MOVD (R8)(R3), R20 // Load x[i]
- MOVD (R10)(R3), R21 // Load z[i]
- MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
- MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
- ADDC R21, R6 // R6 = z0
- ADDZE R7 // R7 = z1
- ADDC R4, R6 // R6 = z0 + c + 0
- ADDZE R7, R4 // c += z1
- MOVD R6, (R10)(R3) // Store z[i]
- ADD $8, R3
- BC 16, 0, loop // bdnz
+ MOVD 0(R4), R14 // x[i]
+ MOVD 8(R4), R16 // x[i+1]
+ MOVD 16(R4), R18 // x[i+2]
+ MOVD 24(R4), R20 // x[i+3]
+ MOVD 0(R3), R15 // z[i]
+ MOVD 8(R3), R17 // z[i+1]
+ MOVD 16(R3), R19 // z[i+2]
+ MOVD 24(R3), R21 // z[i+3]
+ MULLD R5, R14, R10 // low x[i]*y
+ MULHDU R5, R14, R11 // high x[i]*y
+ ADDC R15, R10
+ ADDZE R11
+ ADDC R9, R10
+ ADDZE R11, R9
+ MULLD R5, R16, R14 // low x[i+1]*y
+ MULHDU R5, R16, R15 // high x[i+1]*y
+ ADDC R17, R14
+ ADDZE R15
+ ADDC R9, R14
+ ADDZE R15, R9
+ MULLD R5, R18, R16 // low x[i+2]*y
+ MULHDU R5, R18, R17 // high x[i+2]*y
+ ADDC R19, R16
+ ADDZE R17
+ ADDC R9, R16
+ ADDZE R17, R9
+ MULLD R5, R20, R18 // low x[i+3]*y
+ MULHDU R5, R20, R19 // high x[i+3]*y
+ ADDC R21, R18
+ ADDZE R19
+ ADDC R9, R18
+ ADDZE R19, R9
+ MOVD R10, 0(R3) // z[i]
+ MOVD R14, 8(R3) // z[i+1]
+ MOVD R16, 16(R3) // z[i+2]
+ MOVD R18, 24(R3) // z[i+3]
+ ADD $32, R3
+ ADD $32, R4
+ BDNZ loop
+
+ ANDCC $3, R6
+tail:
+ CMP R0, R6
+ BEQ done
+ MOVD R6, CTR
+ PCALIGN $16
+tailloop:
+ MOVD 0(R4), R14
+ MOVD 0(R3), R15
+ MULLD R5, R14, R10
+ MULHDU R5, R14, R11
+ ADDC R15, R10
+ ADDZE R11
+ ADDC R9, R10
+ ADDZE R11, R9
+ MOVD R10, 0(R3)
+ ADD $8, R3
+ ADD $8, R4
+ BDNZ tailloop
done:
- MOVD R4, c+56(FP)
+ MOVD R9, c+56(FP)
RET
-