aboutsummaryrefslogtreecommitdiff
path: root/src/math/big
diff options
context:
space:
mode:
authorVlad Krasnov <vlad@cloudflare.com>2017-11-06 13:59:51 -0800
committerCherry Zhang <cherryyz@google.com>2018-03-07 23:04:38 +0000
commitfd3d27938afa1a5df94f1e056deeb5842855fbdc (patch)
treef8958fb4a6feb9866a0c2fddf8cba4179c2302b5 /src/math/big
parentb1335037fad4c81ce296f652372aa631a29dcb48 (diff)
downloadgo-fd3d27938afa1a5df94f1e056deeb5842855fbdc.tar.xz
math/big: implement addMulVVW on arm64
The lack of proper addMulVVW implementation for arm64 hurts RSA performance. This assembly implementation is optimized for arm64 based servers. name old time/op new time/op delta pkg:math/big goos:linux goarch:arm64 AddMulVVW/1 55.2ns ± 0% 11.9ns ± 1% -78.37% (p=0.000 n=8+10) AddMulVVW/2 67.0ns ± 0% 11.2ns ± 0% -83.28% (p=0.000 n=7+10) AddMulVVW/3 93.2ns ± 0% 13.2ns ± 0% -85.84% (p=0.000 n=10+10) AddMulVVW/4 126ns ± 0% 13ns ± 1% -89.82% (p=0.000 n=10+10) AddMulVVW/5 151ns ± 0% 17ns ± 0% -88.87% (p=0.000 n=10+9) AddMulVVW/10 323ns ± 0% 25ns ± 0% -92.20% (p=0.000 n=10+10) AddMulVVW/100 3.28µs ± 0% 0.14µs ± 0% -95.82% (p=0.000 n=10+10) AddMulVVW/1000 31.7µs ± 0% 1.3µs ± 0% -96.00% (p=0.000 n=10+8) AddMulVVW/10000 313µs ± 0% 13µs ± 0% -95.98% (p=0.000 n=10+10) AddMulVVW/100000 3.24ms ± 0% 0.13ms ± 1% -96.13% (p=0.000 n=9+9) pkg:crypto/rsa goos:linux goarch:arm64 RSA2048Decrypt 44.7ms ± 0% 4.0ms ± 6% -91.08% (p=0.000 n=8+10) RSA2048Sign 46.3ms ± 0% 5.0ms ± 0% -89.29% (p=0.000 n=9+10) 3PrimeRSA2048Decrypt 22.3ms ± 0% 2.4ms ± 0% -89.26% (p=0.000 n=10+10) Change-Id: I295f0bd5c51a4442d02c44ece1f6026d30dff0bc Reviewed-on: https://go-review.googlesource.com/76270 Reviewed-by: Vlad Krasnov <vlad@cloudflare.com> Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Vlad Krasnov <vlad@cloudflare.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/math/big')
-rw-r--r--src/math/big/arith_arm64.s83
1 files changed, 82 insertions, 1 deletions
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s
index 8b4b7136fa..0974c97c57 100644
--- a/src/math/big/arith_arm64.s
+++ b/src/math/big/arith_arm64.s
@@ -199,8 +199,89 @@ done:
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0
- B ·addMulVVW_g(SB)
+ MOVD z+0(FP), R1
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R2
+ MOVD y+48(FP), R3
+ MOVD $0, R4
+
+ TBZ $0, R0, two
+
+ MOVD.P 8(R2), R5
+ MOVD (R1), R6
+
+ MUL R5, R3, R7
+ UMULH R5, R3, R8
+
+ ADDS R7, R6
+ ADC $0, R8, R4
+
+ MOVD.P R6, 8(R1)
+ SUB $1, R0
+
+two:
+ TBZ $1, R0, loop
+
+ LDP.P 16(R2), (R5, R10)
+ LDP (R1), (R6, R11)
+
+ MUL R10, R3, R13
+ UMULH R10, R3, R12
+
+ MUL R5, R3, R7
+ UMULH R5, R3, R8
+ ADDS R4, R6
+ ADCS R13, R11
+ ADC $0, R12
+
+ ADDS R7, R6
+ ADCS R8, R11
+ ADC $0, R12, R4
+
+ STP.P (R6, R11), 16(R1)
+ SUB $2, R0
+
+// The main loop of this code operates on a block of 4 words every iteration
+// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
+// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
+// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
+loop:
+ CBZ R0, done
+
+ LDP.P 16(R2), (R5, R6)
+ LDP.P 16(R2), (R7, R8)
+
+ LDP (R1), (R9, R10)
+ ADDS R4, R9
+ MUL R6, R3, R14
+ ADCS R14, R10
+ MUL R7, R3, R15
+ LDP 16(R1), (R11, R12)
+ ADCS R15, R11
+ MUL R8, R3, R16
+ ADCS R16, R12
+ UMULH R8, R3, R20
+ ADC $0, R20
+
+ MUL R5, R3, R13
+ ADDS R13, R9
+ UMULH R5, R3, R17
+ ADCS R17, R10
+ UMULH R6, R3, R21
+ STP.P (R9, R10), 16(R1)
+ ADCS R21, R11
+ UMULH R7, R3, R19
+ ADCS R19, R12
+ STP.P (R11, R12), 16(R1)
+ ADC $0, R20, R4
+
+ SUB $4, R0
+ B loop
+
+done:
+ MOVD R4, c+56(FP)
+ RET
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),NOSPLIT,$0