aboutsummaryrefslogtreecommitdiff
path: root/src/math
diff options
context:
space:
mode:
authorJoel Sing <joel@sing.id.au>2024-06-27 20:03:28 +1000
committerJoel Sing <joel@sing.id.au>2024-08-23 10:41:28 +0000
commit2cee5d810974040b10ff8d19119758ac6e7270e1 (patch)
tree74ac78bb889b41ebb2abe466c972b71571b5844a /src/math
parent86cd5c4034647572f731ab95a36c35888b0e16c0 (diff)
downloadgo-2cee5d810974040b10ff8d19119758ac6e7270e1.tar.xz
math/big: implement addMulVVW in riscv64 assembly
This provides an assembly implementation of addMulVVW for riscv64, processing up to four words per loop, resulting in a significant performance gain. On a StarFive VisionFive 2: │ addmulvvw.1 │ addmulvvw.2 │ │ sec/op │ sec/op vs base │ AddMulVVW/1-4 65.49n ± 0% 50.79n ± 0% -22.44% (p=0.000 n=10) AddMulVVW/2-4 82.81n ± 0% 66.83n ± 0% -19.29% (p=0.000 n=10) AddMulVVW/3-4 100.20n ± 0% 82.87n ± 0% -17.30% (p=0.000 n=10) AddMulVVW/4-4 117.50n ± 0% 84.20n ± 0% -28.34% (p=0.000 n=10) AddMulVVW/5-4 134.9n ± 0% 100.3n ± 0% -25.69% (p=0.000 n=10) AddMulVVW/10-4 221.7n ± 0% 164.4n ± 0% -25.85% (p=0.000 n=10) AddMulVVW/100-4 1.794µ ± 0% 1.250µ ± 0% -30.32% (p=0.000 n=10) AddMulVVW/1000-4 17.42µ ± 0% 12.08µ ± 0% -30.68% (p=0.000 n=10) AddMulVVW/10000-4 254.9µ ± 0% 214.8µ ± 0% -15.75% (p=0.000 n=10) AddMulVVW/100000-4 2.569m ± 0% 2.178m ± 0% -15.20% (p=0.000 n=10) geomean 1.443µ 1.107µ -23.29% │ addmulvvw.1 │ addmulvvw.2 │ │ B/s │ B/s vs base │ AddMulVVW/1-4 932.0Mi ± 0% 1201.6Mi ± 0% +28.93% (p=0.000 n=10) AddMulVVW/2-4 1.440Gi ± 0% 1.784Gi ± 0% +23.90% (p=0.000 n=10) AddMulVVW/3-4 1.785Gi ± 0% 2.158Gi ± 0% +20.87% (p=0.000 n=10) AddMulVVW/4-4 2.029Gi ± 0% 2.832Gi ± 0% +39.59% (p=0.000 n=10) AddMulVVW/5-4 2.209Gi ± 0% 2.973Gi ± 0% +34.55% (p=0.000 n=10) AddMulVVW/10-4 2.689Gi ± 0% 3.626Gi ± 0% +34.86% (p=0.000 n=10) AddMulVVW/100-4 3.323Gi ± 0% 4.770Gi ± 0% +43.54% (p=0.000 n=10) AddMulVVW/1000-4 3.421Gi ± 0% 4.936Gi ± 0% +44.27% (p=0.000 n=10) AddMulVVW/10000-4 2.338Gi ± 0% 2.776Gi ± 0% +18.69% (p=0.000 n=10) AddMulVVW/100000-4 2.320Gi ± 0% 2.736Gi ± 0% +17.93% (p=0.000 n=10) geomean 2.109Gi 2.749Gi +30.36% Change-Id: I6c7ee48233c53ff9b6a5a9002675886cd9bff5af Reviewed-on: https://go-review.googlesource.com/c/go/+/595400 Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Mark Ryan <markdryan@rivosinc.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/math')
-rw-r--r--src/math/big/arith_riscv64.s93
1 files changed, 92 insertions, 1 deletions
diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s
index 44580338b3..069a4080f4 100644
--- a/src/math/big/arith_riscv64.s
+++ b/src/math/big/arith_riscv64.s
@@ -375,5 +375,96 @@ done:
RET
TEXT ·addMulVVW(SB),NOSPLIT,$0
- JMP ·addMulVVW_g(SB)
+ MOV x+24(FP), X5
+ MOV y+48(FP), X6
+ MOV z+0(FP), X7
+ MOV z_len+8(FP), X30
+
+ MOV $4, X28
+ MOV $0, X29 // c = 0
+
+ BEQZ X30, done
+ BLTU X30, X28, loop1
+
+loop4:
+ MOV 0(X5), X8 // x[0]
+ MOV 0(X7), X10 // z[0]
+ MOV 8(X5), X11 // x[1]
+ MOV 8(X7), X13 // z[1]
+ MOV 16(X5), X14 // x[2]
+ MOV 16(X7), X16 // z[2]
+ MOV 24(X5), X17 // x[3]
+ MOV 24(X7), X19 // z[3]
+
+ MULHU X8, X6, X9 // z_hi[0] = x[0] * y
+ MUL X8, X6, X8 // z_lo[0] = x[0] * y
+ ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0]
+ SLTU X8, X21, X22
+ ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0]
+ ADD X21, X29, X10 // z[0] = x[0] * y + z[0] + c
+ SLTU X21, X10, X22
+ ADD X9, X22, X29 // next c
+
+ MULHU X11, X6, X12 // z_hi[1] = x[1] * y
+ MUL X11, X6, X11 // z_lo[1] = x[1] * y
+ ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1]
+ SLTU X11, X21, X22
+ ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1]
+ ADD X21, X29, X13 // z[1] = x[1] * y + z[1] + c
+ SLTU X21, X13, X22
+ ADD X12, X22, X29 // next c
+
+ MULHU X14, X6, X15 // z_hi[2] = x[2] * y
+ MUL X14, X6, X14 // z_lo[2] = x[2] * y
+ ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2]
+ SLTU X14, X21, X22
+ ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2]
+ ADD X21, X29, X16 // z[2] = x[2] * y + z[2] + c
+ SLTU X21, X16, X22
+ ADD X15, X22, X29 // next c
+
+ MULHU X17, X6, X18 // z_hi[3] = x[3] * y
+ MUL X17, X6, X17 // z_lo[3] = x[3] * y
+ ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3]
+ SLTU X17, X21, X22
+ ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3]
+ ADD X21, X29, X19 // z[3] = x[3] * y + z[3] + c
+ SLTU X21, X19, X22
+ ADD X18, X22, X29 // next c
+ MOV X10, 0(X7) // z[0]
+ MOV X13, 8(X7) // z[1]
+ MOV X16, 16(X7) // z[2]
+ MOV X19, 24(X7) // z[3]
+
+ ADD $32, X5
+ ADD $32, X7
+ SUB $4, X30
+
+ BGEU X30, X28, loop4
+ BEQZ X30, done
+
+loop1:
+ MOV 0(X5), X10 // x
+ MOV 0(X7), X11 // z
+
+ MULHU X10, X6, X12 // z_hi = x * y
+ MUL X10, X6, X10 // z_lo = x * y
+ ADD X10, X11, X13 // z_lo = x * y + z
+ SLTU X10, X13, X15
+ ADD X12, X15, X12 // z_hi = x * y + z
+ ADD X13, X29, X10 // z = x * y + z + c
+ SLTU X13, X10, X15
+ ADD X12, X15, X29 // next c
+
+ MOV X10, 0(X7) // z
+
+ ADD $8, X5
+ ADD $8, X7
+ SUB $1, X30
+
+ BNEZ X30, loop1
+
+done:
+ MOV X29, c+56(FP) // return c
+ RET