aboutsummaryrefslogtreecommitdiff
path: root/src/crypto
diff options
context:
space:
mode:
authorXiaolin Zhao <zhaoxiaolin@loongson.cn>2025-05-16 11:05:03 +0800
committerabner chenc <chenguoqi@loongson.cn>2025-05-20 20:28:34 -0700
commit5b17e2f92782bd81589b89d4cd9fbb26cae2bcd5 (patch)
treecd28830728862512e23b883a5960d4eef9999ab9 /src/crypto
parenta2eb643cbf5b68b50dd2dd5b62e605ca90ababe4 (diff)
downloadgo-5b17e2f92782bd81589b89d4cd9fbb26cae2bcd5.tar.xz
crypto/subtle: optimize function xorBytes using SIMD on loong64
On the Loongson-3A6000-HV and Loongson-3A5000, there has been a significant improvement in all performance metrics except for '8Bytes', which has experienced a decline, as follows. goos: linux goarch: loong64 pkg: crypto/subtle cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | XORBytes/8Bytes 7.282n ± 0% 8.805n ± 0% +20.91% (p=0.000 n=10) XORBytes/128Bytes 14.43n ± 0% 10.01n ± 0% -30.63% (p=0.000 n=10) XORBytes/2048Bytes 110.60n ± 0% 46.57n ± 0% -57.89% (p=0.000 n=10) XORBytes/8192Bytes 418.7n ± 0% 161.8n ± 0% -61.36% (p=0.000 n=10) XORBytes/32768Bytes 3.220µ ± 0% 1.673µ ± 0% -48.04% (p=0.000 n=10) XORBytesAlignment/8Bytes0Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/8Bytes1Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/8Bytes2Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/8Bytes3Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/8Bytes4Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/8Bytes5Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/8Bytes6Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/8Bytes7Offset 7.621n ± 0% 9.305n ± 0% +22.10% (p=0.000 n=10) XORBytesAlignment/128Bytes0Offset 14.430n ± 0% 9.973n ± 0% -30.88% (p=0.000 n=10) XORBytesAlignment/128Bytes1Offset 20.83n ± 0% 11.03n ± 0% -47.05% (p=0.000 n=10) XORBytesAlignment/128Bytes2Offset 20.83n ± 0% 11.03n ± 0% -47.07% (p=0.000 n=10) XORBytesAlignment/128Bytes3Offset 20.83n ± 0% 11.03n ± 0% -47.07% (p=0.000 n=10) XORBytesAlignment/128Bytes4Offset 20.83n ± 0% 11.03n ± 0% -47.05% (p=0.000 n=10) XORBytesAlignment/128Bytes5Offset 20.83n ± 0% 11.03n ± 0% -47.05% (p=0.000 n=10) XORBytesAlignment/128Bytes6Offset 20.83n ± 0% 11.03n ± 0% -47.05% (p=0.000 n=10) XORBytesAlignment/128Bytes7Offset 20.83n ± 0% 11.03n ± 0% -47.05% (p=0.000 n=10) XORBytesAlignment/2048Bytes0Offset 110.60n ± 0% 46.82n ± 0% -57.67% (p=0.000 n=10) XORBytesAlignment/2048Bytes1Offset 234.4n ± 0% 109.3n ± 0% -53.37% (p=0.000 n=10) XORBytesAlignment/2048Bytes2Offset 234.4n ± 0% 109.3n ± 0% -53.37% (p=0.000 n=10) XORBytesAlignment/2048Bytes3Offset 234.4n ± 0% 109.3n ± 0% -53.37% (p=0.000 n=10) XORBytesAlignment/2048Bytes4Offset 234.5n ± 0% 109.3n ± 0% -53.39% (p=0.000 n=10) XORBytesAlignment/2048Bytes5Offset 234.4n ± 0% 109.3n ± 0% -53.37% (p=0.000 n=10) XORBytesAlignment/2048Bytes6Offset 234.4n ± 0% 109.3n ± 0% -53.37% (p=0.000 n=10) XORBytesAlignment/2048Bytes7Offset 234.5n ± 0% 109.3n ± 0% -53.39% (p=0.000 n=10) geomean 39.42n 26.00n -34.05% goos: linux goarch: loong64 pkg: crypto/subtle cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | XORBytes/8Bytes 11.21n ± 0% 12.41n ± 1% +10.70% (p=0.000 n=10) XORBytes/128Bytes 18.22n ± 0% 13.61n ± 0% -25.30% (p=0.000 n=10) XORBytes/2048Bytes 162.20n ± 0% 48.46n ± 0% -70.13% (p=0.000 n=10) XORBytes/8192Bytes 629.8n ± 0% 163.8n ± 0% -73.99% (p=0.000 n=10) XORBytes/32768Bytes 4731.0n ± 1% 632.8n ± 0% -86.63% (p=0.000 n=10) XORBytesAlignment/8Bytes0Offset 11.61n ± 1% 12.42n ± 0% +6.98% (p=0.000 n=10) XORBytesAlignment/8Bytes1Offset 11.61n ± 0% 12.41n ± 0% +6.89% (p=0.000 n=10) XORBytesAlignment/8Bytes2Offset 11.61n ± 0% 12.42n ± 0% +6.98% (p=0.000 n=10) XORBytesAlignment/8Bytes3Offset 11.61n ± 0% 12.41n ± 0% +6.89% (p=0.000 n=10) XORBytesAlignment/8Bytes4Offset 11.61n ± 0% 12.42n ± 0% +6.98% (p=0.000 n=10) XORBytesAlignment/8Bytes5Offset 11.61n ± 0% 12.41n ± 0% +6.89% (p=0.000 n=10) XORBytesAlignment/8Bytes6Offset 11.61n ± 0% 12.41n ± 1% +6.89% (p=0.000 n=10) XORBytesAlignment/8Bytes7Offset 11.61n ± 0% 12.42n ± 0% +6.98% (p=0.000 n=10) XORBytesAlignment/128Bytes0Offset 17.82n ± 0% 13.62n ± 0% -23.57% (p=0.000 n=10) XORBytesAlignment/128Bytes1Offset 26.62n ± 0% 18.43n ± 0% -30.78% (p=0.000 n=10) XORBytesAlignment/128Bytes2Offset 26.64n ± 0% 18.43n ± 0% -30.85% (p=0.000 n=10) XORBytesAlignment/128Bytes3Offset 26.65n ± 0% 18.42n ± 0% -30.90% (p=0.000 n=10) XORBytesAlignment/128Bytes4Offset 26.65n ± 0% 18.42n ± 0% -30.88% (p=0.000 n=10) XORBytesAlignment/128Bytes5Offset 26.62n ± 0% 18.42n ± 0% -30.82% (p=0.000 n=10) XORBytesAlignment/128Bytes6Offset 26.63n ± 0% 18.42n ± 0% -30.84% (p=0.000 n=10) XORBytesAlignment/128Bytes7Offset 26.64n ± 0% 18.42n ± 0% -30.86% (p=0.000 n=10) XORBytesAlignment/2048Bytes0Offset 161.80n ± 0% 48.25n ± 0% -70.18% (p=0.000 n=10) XORBytesAlignment/2048Bytes1Offset 354.6n ± 0% 189.2n ± 0% -46.64% (p=0.000 n=10) XORBytesAlignment/2048Bytes2Offset 354.6n ± 0% 189.2n ± 0% -46.64% (p=0.000 n=10) XORBytesAlignment/2048Bytes3Offset 354.7n ± 0% 189.2n ± 0% -46.66% (p=0.000 n=10) XORBytesAlignment/2048Bytes4Offset 354.7n ± 0% 189.2n ± 1% -46.66% (p=0.000 n=10) XORBytesAlignment/2048Bytes5Offset 354.7n ± 0% 189.2n ± 0% -46.66% (p=0.000 n=10) XORBytesAlignment/2048Bytes6Offset 354.7n ± 0% 189.2n ± 0% -46.66% (p=0.000 n=10) XORBytesAlignment/2048Bytes7Offset 354.8n ± 0% 189.2n ± 0% -46.67% (p=0.000 n=10) geomean 56.46n 36.46n -35.42% Change-Id: I66e150b132517e9ff4827abf796812ffe608c052 Reviewed-on: https://go-review.googlesource.com/c/go/+/673355 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src/crypto')
-rw-r--r--src/crypto/internal/fips140/subtle/xor_asm.go2
-rw-r--r--src/crypto/internal/fips140/subtle/xor_loong64.go39
-rw-r--r--src/crypto/internal/fips140/subtle/xor_loong64.s387
-rw-r--r--src/crypto/internal/fips140deps/cpu/cpu.go3
4 files changed, 358 insertions, 73 deletions
diff --git a/src/crypto/internal/fips140/subtle/xor_asm.go b/src/crypto/internal/fips140/subtle/xor_asm.go
index 216ae7ffeb..00f3497a02 100644
--- a/src/crypto/internal/fips140/subtle/xor_asm.go
+++ b/src/crypto/internal/fips140/subtle/xor_asm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (amd64 || arm || arm64 || loong64 || mips || mipsle || mips64 || mips64le || ppc64 || ppc64le || riscv64) && !purego
+//go:build (amd64 || arm || arm64 || mips || mipsle || mips64 || mips64le || ppc64 || ppc64le || riscv64) && !purego
package subtle
diff --git a/src/crypto/internal/fips140/subtle/xor_loong64.go b/src/crypto/internal/fips140/subtle/xor_loong64.go
new file mode 100644
index 0000000000..ad66824d88
--- /dev/null
+++ b/src/crypto/internal/fips140/subtle/xor_loong64.go
@@ -0,0 +1,39 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !purego
+
+package subtle
+
+import (
+ "crypto/internal/fips140deps/cpu"
+ "crypto/internal/impl"
+)
+
+var useLSX = cpu.LOONG64HasLSX
+var useLASX = cpu.LOONG64HasLASX
+
+func init() {
+ impl.Register("subtle", "LSX", &useLSX)
+ impl.Register("subtle", "LASX", &useLASX)
+}
+
+//go:noescape
+func xorBytesBasic(dst, a, b *byte, n int)
+
+//go:noescape
+func xorBytesLSX(dst, a, b *byte, n int)
+
+//go:noescape
+func xorBytesLASX(dst, a, b *byte, n int)
+
+func xorBytes(dst, a, b *byte, n int) {
+ if useLASX {
+ xorBytesLASX(dst, a, b, n)
+ } else if useLSX {
+ xorBytesLSX(dst, a, b, n)
+ } else {
+ xorBytesBasic(dst, a, b, n)
+ }
+}
diff --git a/src/crypto/internal/fips140/subtle/xor_loong64.s b/src/crypto/internal/fips140/subtle/xor_loong64.s
index 09dc80eb93..36c18a6277 100644
--- a/src/crypto/internal/fips140/subtle/xor_loong64.s
+++ b/src/crypto/internal/fips140/subtle/xor_loong64.s
@@ -6,30 +6,76 @@
#include "textflag.h"
-// func xorBytes(dst, a, b *byte, n int)
-TEXT ·xorBytes(SB), NOSPLIT, $0
+#define SMALL_TAIL \
+ SGTU $2, R7, R8; \
+ BNE R8, xor_1; \
+ SGTU $4, R7, R8; \
+ BNE R8, xor_2; \
+ SGTU $8, R7, R8; \
+ BNE R8, xor_4; \
+ SGTU $16, R7, R8; \
+ BNE R8, xor_8; \
+
+#define SMALL \
+xor_8_check:; \
+ SGTU $8, R7, R8; \
+ BNE R8, xor_4_check; \
+xor_8:; \
+ SUBV $8, R7; \
+ MOVV (R5), R10; \
+ MOVV (R6), R11; \
+ XOR R10, R11; \
+ MOVV R11, (R4); \
+ ADDV $8, R5; \
+ ADDV $8, R6; \
+ ADDV $8, R4; \
+ BEQ R7, R0, end; \
+xor_4_check:; \
+ SGTU $4, R7, R8; \
+ BNE R8, xor_2_check; \
+xor_4:; \
+ SUBV $4, R7; \
+ MOVW (R5), R10; \
+ MOVW (R6), R11; \
+ XOR R10, R11; \
+ MOVW R11, (R4); \
+ ADDV $4, R5; \
+ ADDV $4, R6; \
+ ADDV $4, R4; \
+ BEQ R7, R0, end; \
+xor_2_check:; \
+ SGTU $2, R7, R8; \
+ BNE R8, xor_1; \
+xor_2:; \
+ SUBV $2, R7; \
+ MOVH (R5), R10; \
+ MOVH (R6), R11; \
+ XOR R10, R11; \
+ MOVH R11, (R4); \
+ ADDV $2, R5; \
+ ADDV $2, R6; \
+ ADDV $2, R4; \
+ BEQ R7, R0, end; \
+xor_1:; \
+ MOVB (R5), R10; \
+ MOVB (R6), R11; \
+ XOR R10, R11; \
+ MOVB R11, (R4); \
+
+// func xorBytesBasic(dst, a, b *byte, n int)
+TEXT ·xorBytesBasic(SB), NOSPLIT, $0
MOVV dst+0(FP), R4
MOVV a+8(FP), R5
MOVV b+16(FP), R6
MOVV n+24(FP), R7
- MOVV $64, R9
- BGEU R7, R9, loop64 // n >= 64
-tail:
- SRLV $1, R9
- BGEU R7, R9, xor_32 // n >= 32 && n < 64
- SRLV $1, R9
- BGEU R7, R9, xor_16 // n >= 16 && n < 32
- SRLV $1, R9
- BGEU R7, R9, xor_8 // n >= 8 && n < 16
- SRLV $1, R9
- BGEU R7, R9, xor_4 // n >= 4 && n < 8
- SRLV $1, R9
- BGEU R7, R9, xor_2 // n >= 2 && n < 4
- SRLV $1, R9
- BGEU R7, R9, xor_1 // n = 1
+ SMALL_TAIL
-loop64:
+xor_64_check:
+ SGTU $64, R7, R8
+ BNE R8, xor_32_check
+xor_64_loop:
+ SUBV $64, R7
MOVV (R5), R10
MOVV 8(R5), R11
MOVV 16(R5), R12
@@ -62,18 +108,18 @@ loop64:
MOVV R15, 40(R4)
MOVV R16, 48(R4)
MOVV R17, 56(R4)
+ SGTU $64, R7, R8
ADDV $64, R5
ADDV $64, R6
ADDV $64, R4
- SUBV $64, R7
- // 64 in R9
- BGEU R7, R9, loop64
- BEQ R7, R0, end
+ BEQ R8, xor_64_loop
+ BEQ R7, end
xor_32_check:
- SRLV $1, R9
- BLT R7, R9, xor_16_check
+ SGTU $32, R7, R8
+ BNE R8, xor_16_check
xor_32:
+ SUBV $32, R7
MOVV (R5), R10
MOVV 8(R5), R11
MOVV 16(R5), R12
@@ -93,13 +139,13 @@ xor_32:
ADDV $32, R5
ADDV $32, R6
ADDV $32, R4
- SUBV $32, R7
BEQ R7, R0, end
xor_16_check:
- SRLV $1, R9
- BLT R7, R9, xor_8_check
+ SGTU $16, R7, R8
+ BNE R8, xor_8_check
xor_16:
+ SUBV $16, R7
MOVV (R5), R10
MOVV 8(R5), R11
MOVV (R6), R12
@@ -111,56 +157,253 @@ xor_16:
ADDV $16, R5
ADDV $16, R6
ADDV $16, R4
- SUBV $16, R7
BEQ R7, R0, end
-xor_8_check:
- SRLV $1, R9
- BLT R7, R9, xor_4_check
-xor_8:
- MOVV (R5), R10
- MOVV (R6), R11
- XOR R10, R11
- MOVV R11, (R4)
- ADDV $8, R5
- ADDV $8, R6
- ADDV $8, R4
- SUBV $8, R7
- BEQ R7, R0, end
+ SMALL
+end:
+ RET
-xor_4_check:
- SRLV $1, R9
- BLT R7, R9, xor_2_check
-xor_4:
- MOVW (R5), R10
- MOVW (R6), R11
- XOR R10, R11
- MOVW R11, (R4)
- ADDV $4, R5
- ADDV $4, R6
- ADDV $4, R4
- SUBV $4, R7
- BEQ R7, R0, end
+// func xorBytesLSX(dst, a, b *byte, n int)
+TEXT ·xorBytesLSX(SB), NOSPLIT, $0
+ MOVV dst+0(FP), R4
+ MOVV a+8(FP), R5
+ MOVV b+16(FP), R6
+ MOVV n+24(FP), R7
-xor_2_check:
- SRLV $1, R9
- BLT R7, R9, xor_1
-xor_2:
- MOVH (R5), R10
- MOVH (R6), R11
- XOR R10, R11
- MOVH R11, (R4)
- ADDV $2, R5
- ADDV $2, R6
- ADDV $2, R4
- SUBV $2, R7
- BEQ R7, R0, end
+ SMALL_TAIL
+
+xor_128_lsx_check:
+ SGTU $128, R7, R8
+ BNE R8, xor_64_lsx_check
+xor_128_lsx_loop:
+ SUBV $128, R7
+ VMOVQ (R5), V0
+ VMOVQ 16(R5), V1
+ VMOVQ 32(R5), V2
+ VMOVQ 48(R5), V3
+ VMOVQ 64(R5), V4
+ VMOVQ 80(R5), V5
+ VMOVQ 96(R5), V6
+ VMOVQ 112(R5), V7
+ VMOVQ (R6), V8
+ VMOVQ 16(R6), V9
+ VMOVQ 32(R6), V10
+ VMOVQ 48(R6), V11
+ VMOVQ 64(R6), V12
+ VMOVQ 80(R6), V13
+ VMOVQ 96(R6), V14
+ VMOVQ 112(R6), V15
+ VXORV V0, V8, V8
+ VXORV V1, V9, V9
+ VXORV V2, V10, V10
+ VXORV V3, V11, V11
+ VXORV V4, V12, V12
+ VXORV V5, V13, V13
+ VXORV V6, V14, V14
+ VXORV V7, V15, V15
+ VMOVQ V8, (R4)
+ VMOVQ V9, 16(R4)
+ VMOVQ V10, 32(R4)
+ VMOVQ V11, 48(R4)
+ VMOVQ V12, 64(R4)
+ VMOVQ V13, 80(R4)
+ VMOVQ V14, 96(R4)
+ VMOVQ V15, 112(R4)
+ SGTU $128, R7, R8
+ ADDV $128, R5
+ ADDV $128, R6
+ ADDV $128, R4
+ BEQ R8, xor_128_lsx_loop
+ BEQ R7, end
-xor_1:
- MOVB (R5), R10
- MOVB (R6), R11
- XOR R10, R11
- MOVB R11, (R4)
+xor_64_lsx_check:
+ SGTU $64, R7, R8
+ BNE R8, xor_32_lsx_check
+xor_64_lsx:
+ SUBV $64, R7
+ VMOVQ (R5), V0
+ VMOVQ 16(R5), V1
+ VMOVQ 32(R5), V2
+ VMOVQ 48(R5), V3
+ VMOVQ (R6), V4
+ VMOVQ 16(R6), V5
+ VMOVQ 32(R6), V6
+ VMOVQ 48(R6), V7
+ VXORV V0, V4, V4
+ VXORV V1, V5, V5
+ VXORV V2, V6, V6
+ VXORV V3, V7, V7
+ VMOVQ V4, (R4)
+ VMOVQ V5, 16(R4)
+ VMOVQ V6, 32(R4)
+ VMOVQ V7, 48(R4)
+ ADDV $64, R5
+ ADDV $64, R6
+ ADDV $64, R4
+ BEQ R7, end
+xor_32_lsx_check:
+ SGTU $32, R7, R8
+ BNE R8, xor_16_lsx_check
+xor_32_lsx:
+ SUBV $32, R7
+ VMOVQ (R5), V0
+ VMOVQ 16(R5), V1
+ VMOVQ (R6), V2
+ VMOVQ 16(R6), V3
+ VXORV V0, V2, V2
+ VXORV V1, V3, V3
+ VMOVQ V2, (R4)
+ VMOVQ V3, 16(R4)
+ ADDV $32, R5
+ ADDV $32, R6
+ ADDV $32, R4
+ BEQ R7, end
+
+xor_16_lsx_check:
+ SGTU $16, R7, R8
+ BNE R8, xor_8_check
+xor_16_lsx:
+ SUBV $16, R7
+ VMOVQ (R5), V0
+ VMOVQ (R6), V1
+ VXORV V0, V1, V1
+ VMOVQ V1, (R4)
+ ADDV $16, R5
+ ADDV $16, R6
+ ADDV $16, R4
+ BEQ R7, end
+
+ SMALL
+end:
+ RET
+
+// func xorBytesLASX(dst, a, b *byte, n int)
+TEXT ·xorBytesLASX(SB), NOSPLIT, $0
+ MOVV dst+0(FP), R4
+ MOVV a+8(FP), R5
+ MOVV b+16(FP), R6
+ MOVV n+24(FP), R7
+
+ SMALL_TAIL
+
+xor_256_lasx_check:
+ SGTU $256, R7, R8
+ BNE R8, xor_128_lasx_check
+xor_256_lasx_loop:
+ SUBV $256, R7
+ XVMOVQ (R5), X0
+ XVMOVQ 32(R5), X1
+ XVMOVQ 64(R5), X2
+ XVMOVQ 96(R5), X3
+ XVMOVQ 128(R5), X4
+ XVMOVQ 160(R5), X5
+ XVMOVQ 192(R5), X6
+ XVMOVQ 224(R5), X7
+ XVMOVQ (R6), X8
+ XVMOVQ 32(R6), X9
+ XVMOVQ 64(R6), X10
+ XVMOVQ 96(R6), X11
+ XVMOVQ 128(R6), X12
+ XVMOVQ 160(R6), X13
+ XVMOVQ 192(R6), X14
+ XVMOVQ 224(R6), X15
+ XVXORV X0, X8, X8
+ XVXORV X1, X9, X9
+ XVXORV X2, X10, X10
+ XVXORV X3, X11, X11
+ XVXORV X4, X12, X12
+ XVXORV X5, X13, X13
+ XVXORV X6, X14, X14
+ XVXORV X7, X15, X15
+ XVMOVQ X8, (R4)
+ XVMOVQ X9, 32(R4)
+ XVMOVQ X10, 64(R4)
+ XVMOVQ X11, 96(R4)
+ XVMOVQ X12, 128(R4)
+ XVMOVQ X13, 160(R4)
+ XVMOVQ X14, 192(R4)
+ XVMOVQ X15, 224(R4)
+ SGTU $256, R7, R8
+ ADDV $256, R5
+ ADDV $256, R6
+ ADDV $256, R4
+ BEQ R8, xor_256_lasx_loop
+ BEQ R7, end
+
+xor_128_lasx_check:
+ SGTU $128, R7, R8
+ BNE R8, xor_64_lasx_check
+xor_128_lasx:
+ SUBV $128, R7
+ XVMOVQ (R5), X0
+ XVMOVQ 32(R5), X1
+ XVMOVQ 64(R5), X2
+ XVMOVQ 96(R5), X3
+ XVMOVQ (R6), X4
+ XVMOVQ 32(R6), X5
+ XVMOVQ 64(R6), X6
+ XVMOVQ 96(R6), X7
+ XVXORV X0, X4, X4
+ XVXORV X1, X5, X5
+ XVXORV X2, X6, X6
+ XVXORV X3, X7, X7
+ XVMOVQ X4, (R4)
+ XVMOVQ X5, 32(R4)
+ XVMOVQ X6, 64(R4)
+ XVMOVQ X7, 96(R4)
+ ADDV $128, R5
+ ADDV $128, R6
+ ADDV $128, R4
+ BEQ R7, end
+
+xor_64_lasx_check:
+ SGTU $64, R7, R8
+ BNE R8, xor_32_lasx_check
+xor_64_lasx:
+ SUBV $64, R7
+ XVMOVQ (R5), X0
+ XVMOVQ 32(R5), X1
+ XVMOVQ (R6), X2
+ XVMOVQ 32(R6), X3
+ XVXORV X0, X2, X2
+ XVXORV X1, X3, X3
+ XVMOVQ X2, (R4)
+ XVMOVQ X3, 32(R4)
+ ADDV $64, R5
+ ADDV $64, R6
+ ADDV $64, R4
+ BEQ R7, end
+
+xor_32_lasx_check:
+ SGTU $32, R7, R8
+ BNE R8, xor_16_lasx_check
+xor_32_lasx:
+ SUBV $32, R7
+ XVMOVQ (R5), X0
+ XVMOVQ (R6), X1
+ XVXORV X0, X1, X1
+ XVMOVQ X1, (R4)
+ ADDV $32, R5
+ ADDV $32, R6
+ ADDV $32, R4
+ BEQ R7, end
+
+xor_16_lasx_check:
+ SGTU $16, R7, R8
+ BNE R8, xor_8_check
+xor_16_lasx:
+ SUBV $16, R7
+ VMOVQ (R5), V0
+ VMOVQ (R6), V1
+ VXORV V0, V1, V1
+ VMOVQ V1, (R4)
+ ADDV $16, R5
+ ADDV $16, R6
+ ADDV $16, R4
+ BEQ R7, end
+
+ SMALL
end:
RET
+
diff --git a/src/crypto/internal/fips140deps/cpu/cpu.go b/src/crypto/internal/fips140deps/cpu/cpu.go
index cc9ac0035a..311e4f541b 100644
--- a/src/crypto/internal/fips140deps/cpu/cpu.go
+++ b/src/crypto/internal/fips140deps/cpu/cpu.go
@@ -23,6 +23,9 @@ var (
ARM64HasSHA2 = cpu.ARM64.HasSHA2
ARM64HasSHA512 = cpu.ARM64.HasSHA512
+ LOONG64HasLSX = cpu.Loong64.HasLSX
+ LOONG64HasLASX = cpu.Loong64.HasLASX
+
S390XHasAES = cpu.S390X.HasAES
S390XHasAESCBC = cpu.S390X.HasAESCBC
S390XHasAESCTR = cpu.S390X.HasAESCTR