diff options
| author | kmvijay <kiran.m.vijay@ibm.com> | 2025-10-30 14:50:14 +0000 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2025-11-25 10:46:14 -0800 |
| commit | 6e4a0d8e44c845251c01fee3923113e6ba8d1e06 (patch) | |
| tree | 16e5808b58b757afc54388d152bba79c47f65b9e /src/crypto | |
| parent | 657b331ff5da9b02bd98b489ff144a03c6651bae (diff) | |
| download | go-6e4a0d8e44c845251c01fee3923113e6ba8d1e06.tar.xz | |
crypto/internal/fips140/bigmod: vector implementation of addMulVVWx on s390x
The addMulVVWx assembly routine multiplies a bignum multiplicand by a 64-bit multiplier.
The new implementation for the s390x architecture uses an algorithm based on vector instructions
and delivers a significant performance improvement.
Note: z13, the minimum architecture level that Go supports on s390x, already has VX support.
The performance improvement is shown below:
goos: linux
goarch: s390x
pkg: crypto/internal/fips140/bigmod
Orig.txt Vector_Patch.txt
sec/op sec/op vs base
ModAdd 164.1n ± 0% 159.7n ± 0% -2.7% (p=0.000 n=10)
ModSub 152.3n ± 1% 147.3n ± 0% -3.25% (p=0.000 n=10)
MontgomeryRepr 4.806µ ± 3% 1.829µ ± 0% -61.94% (p=0.000 n=10)
MontgomeryMul 4.812µ ± 5% 1.834µ ± 0% -61.90% (p=0.000 n=10)
ModMul 9.646µ ± 3% 3.698µ ± 0% -61.67% (p=0.000 n=10)
ExpBig 11.28m ± 0% 11.28m ± 0% +0.04% (p=0.035 n=10)
Exp 12.284m ± 5% 5.004m ± 1% -59.26% (p=0.000 n=10)
geomean 18.61µ 10.74µ -42.2%
Change-Id: I679944c9dac9f43f1626b018f72efa6da0d2442d
Cq-Include-Trybots: luci.golang.try:gotip-linux-s390x
Reviewed-on: https://go-review.googlesource.com/c/go/+/716480
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Vishwanatha HD <vishwanatha.hd@ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Srinivas Pokala <Pokala.Srinivas@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/crypto')
| -rw-r--r-- | src/crypto/internal/fips140/bigmod/nat_s390x.s | 185 |
1 files changed, 130 insertions, 55 deletions
diff --git a/src/crypto/internal/fips140/bigmod/nat_s390x.s b/src/crypto/internal/fips140/bigmod/nat_s390x.s index 0c07a0c8a6..9adeb9981d 100644 --- a/src/crypto/internal/fips140/bigmod/nat_s390x.s +++ b/src/crypto/internal/fips140/bigmod/nat_s390x.s @@ -4,82 +4,157 @@ //go:build !purego +// Register usage (z13 convention): +// R2 = rp (result pointer) +// R3 = ap (source pointer) +// R4 = an / idx (loop counter) +// R5 = b0 (multiplier limb) +// R6 = cy (carry) + #include "textflag.h" // func addMulVVW1024(z, x *uint, y uint) (c uint) TEXT ·addMulVVW1024(SB), $0-32 - MOVD $16, R5 - JMP addMulVVWx(SB) + MOVD $16, R4 + JMP addMulVVWx(SB) // func addMulVVW1536(z, x *uint, y uint) (c uint) TEXT ·addMulVVW1536(SB), $0-32 - MOVD $24, R5 - JMP addMulVVWx(SB) + MOVD $24, R4 + JMP addMulVVWx(SB) // func addMulVVW2048(z, x *uint, y uint) (c uint) TEXT ·addMulVVW2048(SB), $0-32 - MOVD $32, R5 - JMP addMulVVWx(SB) + MOVD $32, R4 + JMP addMulVVWx(SB) TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0 MOVD z+0(FP), R2 - MOVD x+8(FP), R8 - MOVD y+16(FP), R9 + MOVD x+8(FP), R3 + MOVD y+16(FP), R5 + + MOVD $0, R6 + +L_ent: + VZERO V0 + VZERO V2 + SRD $2, R4, R10 + TMLL R4, $1 + BRC $8, L_bx0 - MOVD $0, R1 // i*8 = 0 - MOVD $0, R7 // i = 0 - MOVD $0, R0 // make sure it's zero - MOVD $0, R4 // c = 0 +L_bx1: + VLEG $1, 0(R2), V2 + VZERO V4 + TMLL R4, $2 + BRC $7, L_b11 - MOVD R5, R12 - AND $-2, R12 - CMPBGE R5, $2, A6 - BR E6 +L_b01: + MOVD $-24, R4 + MOVD R6, R0 + MOVD 0(R3), R7 + MLGR R5, R6 + ADDC R0, R7 + MOVD $0, R0 + ADDE R0, R6 + VLVGG $1, R7, V4 + VAQ V2, V4, V2 + VSTEG $1, V2, 0(R2) + VMRHG V2, V2, V2 + CMPBEQ R10, $0, L_1 + BR L_cj0 -A6: - MOVD (R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (R2)(R1*1), R10 - ADDC R10, R11 // add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (R2)(R1*1) +L_b11: + MOVD $-8, R4 + MOVD 0(R3), R9 + MLGR R5, R8 + ADDC R6, R9 + MOVD $0, R6 + ADDE R6, R8 + VLVGG $1, R9, V4 + VAQ V2, V4, V2 + VSTEG $1, V2, 0(R2) + VMRHG V2, V2, 
V2 + BR L_cj1 - MOVD (8)(R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (8)(R2)(R1*1), R10 - ADDC R10, R11 // add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (8)(R2)(R1*1) +L_bx0: + TMLL R4, $2 + BRC $7, L_b10 - ADD $16, R1 // i*8 + 8 - ADD $2, R7 // i++ +L_b00: + MOVD $-32, R4 - CMPBLT R7, R12, A6 - BR E6 +L_cj0: + MOVD 32(R3)(R4), R1 + MOVD 40(R3)(R4), R9 + MLGR R5, R0 + MLGR R5, R8 + VL 32(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VLVGP R0, R1, V6 + VLVGP R9, R6, V7 + BR L_mid -L6: - // TODO: drop unused single-step loop. - MOVD (R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (R2)(R1*1), R10 - ADDC R10, R11 // add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (R2)(R1*1) +L_b10: + MOVD $-16, R4 + MOVD R6, R8 - ADD $8, R1 // i*8 + 8 - ADD $1, R7 // i++ +L_cj1: + MOVD 16(R4)(R3), R1 + MOVD 24(R4)(R3), R7 + MLGR R5, R0 + MLGR R5, R6 + VL 16(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VLVGP R0, R1, V6 + VLVGP R7, R8, V7 + CMPBEQ R10, $0, L_end -E6: - CMPBLT R7, R5, L6 // i < n +L_top: + MOVD 32(R4)(R3), R1 + MOVD 40(R4)(R3), R9 + MLGR R5, R0 + MLGR R5, R8 + VACQ V6, V1, V0, V5 + VACCCQ V6, V1, V0, V0 + VACQ V5, V7, V2, V3 + VACCCQ V5, V7, V2, V2 + VPDI $4, V3, V3, V3 + VL 32(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VST V3, 16(R4)(R2) + VLVGP R0, R1, V6 + VLVGP R9, R6, V7 - MOVD R4, c+24(FP) +L_mid: + MOVD 48(R4)(R3), R1 + MOVD 56(R4)(R3), R7 + MLGR R5, R0 + MLGR R5, R6 + VACQ V6, V1, V0, V5 + VACCCQ V6, V1, V0, V0 + VACQ V5, V7, V2, V3 + VACCCQ V5, V7, V2, V2 + VPDI $4, V3, V3, V3 + VL 48(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VST V3, 32(R4)(R2) + VLVGP R0, R1, V6 + VLVGP R7, R8, V7 + MOVD $32(R4), R4 + BRCTG R10, L_top + +L_end: + VACQ V6, V1, V0, V5 + VACCCQ V6, V1, V0, V0 + VACQ V5, V7, V2, V3 + VACCCQ V5, V7, V2, V2 + VPDI $4, V3, V3, V3 + VST V3, 16(R2)(R4) + VAG V0, V2, V2 + +L_1: + VLGVG $1, V2, R2 + ADDC R6, R2 + MOVD R2, c+24(FP) RET + |
