aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Xiaolin Zhao <zhaoxiaolin@loongson.cn> 2025-03-14 15:28:49 +0800
committer abner chenc <chenguoqi@loongson.cn> 2025-04-13 20:23:35 -0700
commit 388684e50b26e51e1428afcacb811c0732c45c01 (patch)
tree 9f3b415a3e7fbc0061601b10a8d12dfa478590d7
parent 953e8095893cd9efe44a90fd07ed3cfc87bfc109 (diff)
download go-x-crypto-388684e50b26e51e1428afcacb811c0732c45c01.tar.xz
argon2: add loong64 SIMD implementation
The performance gains on Loongson 3A6000 and 3A5000 are as follows:

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/argon2
cpu: Loongson-3A6000-HV @ 2500.00MHz
                                              |  bench.old   |              bench.new              |
                                              |    sec/op    |    sec/op     vs base               |
Argon2i/_Time:_3_Memory:_32_MB,_Threads:_1      131.23m ± 0%   67.56m ± 1%   -48.52% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_32_MB,_Threads:_1      171.28m ± 2%   90.20m ± 0%   -47.34% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_32_MB,_Threads:_1      213.3m ± 0%    112.6m ± 0%   -47.21% (p=0.000 n=10)
Argon2i/_Time:_3_Memory:_64_MB,_Threads:_4      269.5m ± 0%    147.2m ± 0%   -45.37% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_64_MB,_Threads:_4      357.7m ± 0%    195.4m ± 0%   -45.36% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_64_MB,_Threads:_4      449.8m ± 0%    243.8m ± 0%   -45.79% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_32_MB,_Threads:_1     126.56m ± 0%   67.43m ± 0%   -46.72% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_32_MB,_Threads:_1     168.57m ± 0%   90.04m ± 0%   -46.58% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_32_MB,_Threads:_1     210.5m ± 0%    112.7m ± 0%   -46.45% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_64_MB,_Threads:_4     264.8m ± 0%    145.0m ± 1%   -45.23% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_64_MB,_Threads:_4     353.8m ± 0%    193.7m ± 0%   -45.26% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_64_MB,_Threads:_4     444.4m ± 0%    242.3m ± 0%   -45.49% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_32_MB,_Threads:_1    126.89m ± 0%   66.62m ± 0%   -47.50% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_32_MB,_Threads:_1    169.02m ± 0%   89.07m ± 0%   -47.30% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_32_MB,_Threads:_1    210.7m ± 0%    111.0m ± 0%   -47.34% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_64_MB,_Threads:_4    267.6m ± 1%    145.8m ± 0%   -45.51% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_64_MB,_Threads:_4    355.1m ± 0%    194.1m ± 0%   -45.34% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_64_MB,_Threads:_4    443.6m ± 0%    242.5m ± 0%   -45.33% (p=0.000 n=10)
geomean                                         240.8m         129.3m        -46.32%

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/argon2
cpu: Loongson-3A5000 @ 2500.00MHz
                                              |  bench.old   |              bench.new              |
                                              |    sec/op    |    sec/op     vs base               |
Argon2i/_Time:_3_Memory:_32_MB,_Threads:_1      209.9m ± 1%    109.7m ± 2%   -47.75% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_32_MB,_Threads:_1      278.1m ± 0%    143.7m ± 0%   -48.34% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_32_MB,_Threads:_1      346.7m ± 0%    178.1m ± 0%   -48.63% (p=0.000 n=10)
Argon2i/_Time:_3_Memory:_64_MB,_Threads:_4      455.3m ± 0%    240.8m ± 0%   -47.12% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_64_MB,_Threads:_4      604.6m ± 0%    317.7m ± 0%   -47.45% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_64_MB,_Threads:_4      754.8m ± 0%    395.4m ± 0%   -47.61% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_32_MB,_Threads:_1     206.9m ± 1%    107.6m ± 0%   -48.00% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_32_MB,_Threads:_1     274.3m ± 0%    141.8m ± 1%   -48.32% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_32_MB,_Threads:_1     342.4m ± 0%    175.6m ± 0%   -48.71% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_64_MB,_Threads:_4     450.2m ± 0%    237.9m ± 0%   -47.15% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_64_MB,_Threads:_4     597.7m ± 0%    314.0m ± 0%   -47.46% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_64_MB,_Threads:_4     745.8m ± 0%    390.7m ± 1%   -47.61% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_32_MB,_Threads:_1    207.6m ± 0%    107.9m ± 0%   -48.05% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_32_MB,_Threads:_1    275.0m ± 0%    142.0m ± 0%   -48.34% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_32_MB,_Threads:_1    342.9m ± 1%    176.0m ± 0%   -48.66% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_64_MB,_Threads:_4    450.6m ± 1%    238.5m ± 0%   -47.07% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_64_MB,_Threads:_4    598.5m ± 1%    314.6m ± 0%   -47.44% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_64_MB,_Threads:_4    746.4m ± 0%    391.0m ± 0%   -47.61% (p=0.000 n=10)
geomean                                         398.6m         207.9m        -47.86%

Change-Id: Iaa9d134d68dd2f0972fc5768d7e66f7b1ff0ebd3
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/657795
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
-rw-r--r-- argon2/blamka_loong64.go 59
-rw-r--r-- argon2/blamka_loong64.s 258
-rw-r--r-- argon2/blamka_ref.go 2
3 files changed, 318 insertions, 1 deletions
diff --git a/argon2/blamka_loong64.go b/argon2/blamka_loong64.go
new file mode 100644
index 0000000..1b43a2e
--- /dev/null
+++ b/argon2/blamka_loong64.go
@@ -0,0 +1,59 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build loong64 && gc && !purego
+
+package argon2
+
+import "golang.org/x/sys/cpu"
+
+// mixBlocks1VX stores in1 XOR in2 into its first argument. It is implemented
+// in assembly (blamka_loong64.s) and processes the block 16 bytes at a time
+// using vector (V) registers.
+//
+//go:noescape
+func mixBlocks1VX(out, in1, in2 *block)
+
+// mixBlocks2VX overwrites out with in1 XOR in2 XOR t. It is implemented in
+// assembly (blamka_loong64.s) using vector (V) registers.
+//
+//go:noescape
+func mixBlocks2VX(out, in1, in2, t *block)
+
+// xorBlocksVX XORs in1 XOR in2 XOR t into the existing contents of out. It is
+// implemented in assembly (blamka_loong64.s) using vector (V) registers.
+//
+//go:noescape
+func xorBlocksVX(out, in1, in2, t *block)
+
+// blamkaVX permutes b in place: it applies the BLAMKA_ROUND assembly macro to
+// each run of 128 consecutive bytes and then to eight sets of 16-byte chunks
+// taken at a 128-byte stride. It is implemented in assembly
+// (blamka_loong64.s) using vector (V) registers.
+//
+//go:noescape
+func blamkaVX(b *block)
+
+// processBlockVX mixes in1 and in2 into out.
+//
+// It computes t = in1 XOR in2, permutes t in place with the BlaMka round
+// (first over every run of 16 consecutive words, then over the interleaved
+// word pairs picked by the second pass), and finally combines in1 XOR in2
+// XOR t with out. When xor is false the result overwrites out; when xor is
+// true it is XORed into the existing contents of out.
+//
+// mixBlocks1VX, blamkaVX, mixBlocks2VX and xorBlocksVX are all written with
+// vector-register instructions, so none of them may be executed on a CPU
+// that lacks LSX. The whole assembly path is therefore guarded by
+// cpu.Loong64.HasLSX, and the fallback is done entirely in Go.
+func processBlockVX(out, in1, in2 *block, xor bool) {
+	var t block
+
+	if cpu.Loong64.HasLSX {
+		mixBlocks1VX(&t, in1, in2)
+		blamkaVX(&t)
+		if xor {
+			xorBlocksVX(out, in1, in2, &t)
+		} else {
+			mixBlocks2VX(out, in1, in2, &t)
+		}
+		return
+	}
+
+	// Portable path: no vector instructions are executed below.
+	for i := range t {
+		t[i] = in1[i] ^ in2[i]
+	}
+
+	// First pass: one round per run of 16 consecutive words.
+	for i := 0; i < blockLength; i += 16 {
+		blamkaGeneric(
+			&t[i+0], &t[i+1], &t[i+2], &t[i+3],
+			&t[i+4], &t[i+5], &t[i+6], &t[i+7],
+			&t[i+8], &t[i+9], &t[i+10], &t[i+11],
+			&t[i+12], &t[i+13], &t[i+14], &t[i+15],
+		)
+	}
+	// Second pass: one round per pair of adjacent words taken at a
+	// 16-word stride.
+	for i := 0; i < blockLength/8; i += 2 {
+		blamkaGeneric(
+			&t[i], &t[i+1], &t[16+i], &t[16+i+1],
+			&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
+			&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
+			&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
+		)
+	}
+
+	if xor {
+		for i := range t {
+			out[i] ^= in1[i] ^ in2[i] ^ t[i]
+		}
+		return
+	}
+	for i := range t {
+		out[i] = in1[i] ^ in2[i] ^ t[i]
+	}
+}
+
+func processBlock(out, in1, in2 *block) {
+ processBlockVX(out, in1, in2, false)
+}
+
+func processBlockXOR(out, in1, in2 *block) {
+ processBlockVX(out, in1, in2, true)
+}
diff --git a/argon2/blamka_loong64.s b/argon2/blamka_loong64.s
new file mode 100644
index 0000000..c380d77
--- /dev/null
+++ b/argon2/blamka_loong64.s
@@ -0,0 +1,258 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build loong64 && gc && !purego
+
+#include "textflag.h"
+
+// BLAMKA_ROUND permutes the eight vector registers V0-V7 in place, using
+// V8-V11 as scratch.
+//
+// The macro consists of four mixing sections separated by two register
+// shuffles:
+//
+//   1. mix (V0, V2, V4, V6), then mix (V1, V3, V5, V7);
+//   2. swap V4 and V5 and recombine V2/V3 and V6/V7 with VSHUF4IV
+//      (immediates 9 and 3);
+//   3. mix (V0, V2, V4, V6), then mix (V1, V3, V5, V7) again;
+//   4. undo the shuffle of step 2 (same moves, immediates 3 and 9).
+//
+// Each mixing section repeats the same step four times with rotate amounts
+// 32, 24, 16 and 63: p = VMULWEVVWU(x, y) (widening multiply of the even
+// 32-bit words, i.e. the low half of each 64-bit lane); x = x + y + p + p;
+// z = (z XOR x) rotated right by the step's amount.
+//
+// In the shuffle steps V8 holds zero, so "VADDV Vn, V8, Vm" is simply a
+// register copy Vm = Vn. V8 must stay zero until the final shuffle, which
+// is why the second pair of mixing sections uses V9 as its temporary.
+#define BLAMKA_ROUND \
+ VMULWEVVWU V0, V2, V8; \
+ VADDV V2, V0, V0; \
+ VADDV V0, V8, V0; \
+ VADDV V0, V8, V0; \
+ VXORV V6, V0, V6; \
+ VROTRV $32, V6, V6; \
+ VMULWEVVWU V4, V6, V8; \
+ VADDV V4, V6, V4; \
+ VADDV V4, V8, V4; \
+ VADDV V4, V8, V4; \
+ VXORV V2, V4, V2; \
+ VROTRV $24, V2, V2; \
+ VMULWEVVWU V0, V2, V8; \
+ VADDV V0, V2, V0; \
+ VADDV V0, V8, V0; \
+ VADDV V0, V8, V0; \
+ VXORV V6, V0, V6; \
+ VROTRV $16, V6, V6; \
+ VMULWEVVWU V4, V6, V8; \
+ VADDV V4, V6, V4; \
+ VADDV V4, V8, V4; \
+ VADDV V4, V8, V4; \
+ VXORV V2, V4, V2; \
+ VROTRV $63, V2, V2; \
+;\
+ VMULWEVVWU V1, V3, V8; \
+ VADDV V1, V3, V1; \
+ VADDV V1, V8, V1; \
+ VADDV V1, V8, V1; \
+ VXORV V7, V1, V7; \
+ VROTRV $32, V7, V7; \
+ VMULWEVVWU V5, V7, V8; \
+ VADDV V5, V7, V5; \
+ VADDV V5, V8, V5; \
+ VADDV V5, V8, V5; \
+ VXORV V3, V5, V3; \
+ VROTRV $24, V3, V3; \
+ VMULWEVVWU V1, V3, V8; \
+ VADDV V1, V3, V1; \
+ VADDV V1, V8, V1; \
+ VADDV V1, V8, V1; \
+ VXORV V7, V1, V7; \
+ VROTRV $16, V7, V7; \
+ VMULWEVVWU V5, V7, V8; \
+ VADDV V5, V7, V5; \
+ VADDV V5, V8, V5; \
+ VADDV V5, V8, V5; \
+ VXORV V3, V5, V3; \
+ VROTRV $63, V3, V3; \
+;\
+ VXORV V0, V0, V8; \ // V8 = 0
+ VADDV V2, V8, V9; \ // V9 = V2
+ VADDV V5, V8, V10; \ // V10 = V5
+ VADDV V6, V8, V11; \ // V11 = V6
+ VADDV V4, V8, V5; \ // V5 = V4
+ VADDV V10, V8, V4; \ // V4 = V5
+ VSHUF4IV $9, V3, V2; \
+ VSHUF4IV $9, V9, V3; \
+ VSHUF4IV $3, V7, V6; \
+ VSHUF4IV $3, V11, V7; \
+;\
+ VMULWEVVWU V0, V2, V9; \
+ VADDV V0, V2, V0; \
+ VADDV V0, V9, V0; \
+ VADDV V0, V9, V0; \
+ VXORV V6, V0, V6; \
+ VROTRV $32, V6, V6; \
+ VMULWEVVWU V4, V6, V9; \
+ VADDV V4, V6, V4; \
+ VADDV V4, V9, V4; \
+ VADDV V4, V9, V4; \
+ VXORV V2, V4, V2; \
+ VROTRV $24, V2, V2; \
+ VMULWEVVWU V0, V2, V9; \
+ VADDV V0, V2, V0; \
+ VADDV V0, V9, V0; \
+ VADDV V0, V9, V0; \
+ VXORV V6, V0, V6; \
+ VROTRV $16, V6, V6; \
+ VMULWEVVWU V4, V6, V9; \
+ VADDV V4, V6, V4; \
+ VADDV V4, V9, V4; \
+ VADDV V4, V9, V4; \
+ VXORV V2, V4, V2; \
+ VROTRV $63, V2, V2; \
+;\
+ VMULWEVVWU V1, V3, V9; \
+ VADDV V1, V3, V1; \
+ VADDV V1, V9, V1; \
+ VADDV V1, V9, V1; \
+ VXORV V7, V1, V7; \
+ VROTRV $32, V7, V7; \
+ VMULWEVVWU V5, V7, V9; \
+ VADDV V5, V7, V5; \
+ VADDV V5, V9, V5; \
+ VADDV V5, V9, V5; \
+ VXORV V3, V5, V3; \
+ VROTRV $24, V3, V3; \
+ VMULWEVVWU V1, V3, V9; \
+ VADDV V1, V3, V1; \
+ VADDV V1, V9, V1; \
+ VADDV V1, V9, V1; \
+ VXORV V7, V1, V7; \
+ VROTRV $16, V7, V7; \
+ VMULWEVVWU V5, V7, V9; \
+ VADDV V5, V7, V5; \
+ VADDV V5, V9, V5; \
+ VADDV V5, V9, V5; \
+ VXORV V3, V5, V3; \
+ VROTRV $63, V3, V3; \
+;\
+ VADDV V2, V8, V9; \ // V9 = V2
+ VADDV V5, V8, V10; \ // V10 = V5
+ VADDV V6, V8, V11; \ // V11 = V6
+ VADDV V4, V8, V5; \ // V5 = V4
+ VADDV V10, V8, V4; \ // V4 = V5
+ VSHUF4IV $3, V3, V2; \
+ VSHUF4IV $3, V9, V3; \
+ VSHUF4IV $9, V7, V6; \
+ VSHUF4IV $9, V11, V7; \
+
+// BLAMKA_ROUND1(index) runs BLAMKA_ROUND over 128 contiguous bytes of the
+// block addressed by R4: it loads the eight consecutive 16-byte chunks
+// starting at byte offset index into V0-V7, permutes them, and stores them
+// back to the same locations.
+#define BLAMKA_ROUND1(index) \
+ VMOVQ (index+0)(R4), V0; \
+ VMOVQ (index+16)(R4), V1; \
+ VMOVQ (index+32)(R4), V2; \
+ VMOVQ (index+48)(R4), V3; \
+ VMOVQ (index+64)(R4), V4; \
+ VMOVQ (index+80)(R4), V5; \
+ VMOVQ (index+96)(R4), V6; \
+ VMOVQ (index+112)(R4), V7; \
+ BLAMKA_ROUND; \
+ VMOVQ V0, (index+0)(R4); \
+ VMOVQ V1, (index+16)(R4); \
+ VMOVQ V2, (index+32)(R4); \
+ VMOVQ V3, (index+48)(R4); \
+ VMOVQ V4, (index+64)(R4); \
+ VMOVQ V5, (index+80)(R4); \
+ VMOVQ V6, (index+96)(R4); \
+ VMOVQ V7, (index+112)(R4); \
+
+// BLAMKA_ROUND2(index) runs BLAMKA_ROUND over eight 16-byte chunks of the
+// block addressed by R4 that are spaced 128 bytes apart: it loads the chunks
+// at byte offsets index, index+128, ..., index+896 into V0-V7, permutes
+// them, and stores them back to the same locations.
+#define BLAMKA_ROUND2(index) \
+ VMOVQ (index+0)(R4), V0; \
+ VMOVQ (index+128)(R4), V1; \
+ VMOVQ (index+256)(R4), V2; \
+ VMOVQ (index+384)(R4), V3; \
+ VMOVQ (index+512)(R4), V4; \
+ VMOVQ (index+640)(R4), V5; \
+ VMOVQ (index+768)(R4), V6; \
+ VMOVQ (index+896)(R4), V7; \
+ BLAMKA_ROUND; \
+ VMOVQ V0, (index+0)(R4); \
+ VMOVQ V1, (index+128)(R4); \
+ VMOVQ V2, (index+256)(R4); \
+ VMOVQ V3, (index+384)(R4); \
+ VMOVQ V4, (index+512)(R4); \
+ VMOVQ V5, (index+640)(R4); \
+ VMOVQ V6, (index+768)(R4); \
+ VMOVQ V7, (index+896)(R4); \
+
+// func blamkaVX(b *block)
+//
+// Permutes the block b in place. R4 holds b for the whole function; the
+// BLAMKA_ROUND1/BLAMKA_ROUND2 macros address memory relative to it.
+TEXT ·blamkaVX(SB), NOSPLIT, $0-8
+ MOVV b+0(FP), R4
+
+ // First pass: one round per run of 128 contiguous bytes
+ // (byte offsets 0, 128, ..., 896).
+ BLAMKA_ROUND1(0)
+ BLAMKA_ROUND1(128)
+ BLAMKA_ROUND1(256)
+ BLAMKA_ROUND1(384)
+ BLAMKA_ROUND1(512)
+ BLAMKA_ROUND1(640)
+ BLAMKA_ROUND1(768)
+ BLAMKA_ROUND1(896)
+
+ // Second pass: one round per set of eight 16-byte chunks taken at a
+ // 128-byte stride (starting byte offsets 0, 16, ..., 112).
+ BLAMKA_ROUND2(0)
+ BLAMKA_ROUND2(16)
+ BLAMKA_ROUND2(32)
+ BLAMKA_ROUND2(48)
+ BLAMKA_ROUND2(64)
+ BLAMKA_ROUND2(80)
+ BLAMKA_ROUND2(96)
+ BLAMKA_ROUND2(112)
+
+ RET
+
+// func mixBlocks1VX(out *block, in1 *block, in2 *block)
+//
+// Stores in1 XOR in2 into out, one 16-byte vector per iteration.
+//
+// The first argument is referenced as out+0(FP) so that the name matches the
+// Go declaration in blamka_loong64.go; go vet's asmdecl check rejects a
+// frame reference whose name differs from the declared parameter.
+TEXT ·mixBlocks1VX(SB), NOSPLIT, $0-24
+	MOVV	out+0(FP), R4
+	MOVV	in1+8(FP), R5
+	MOVV	in2+16(FP), R6
+	MOVV	$128, R8	// 64-bit words left to process
+
+loop:
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VXORV	V0, V1, V2	// V2 = in1 ^ in2
+	VMOVQ	V2, (R4)
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R4
+	SUBV	$2, R8	// each 16-byte vector covers two words
+	BLT	R0, R8, loop	// continue while 0 < R8
+	RET
+
+// func mixBlocks2VX(out *block, in1 *block, in2 *block, t *block)
+//
+// Overwrites out with in1 XOR in2 XOR t, one 16-byte vector per iteration.
+TEXT ·mixBlocks2VX(SB), NOSPLIT, $0-32
+	MOVV	out+0(FP), R4
+	MOVV	in1+8(FP), R5
+	MOVV	in2+16(FP), R6
+	MOVV	t+24(FP), R7
+	MOVV	$128, R8	// 64-bit words left to process
+
+next:
+	// All loads happen before the store, as in a straight three-way XOR.
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VMOVQ	(R7), V2
+	VXORV	V0, V1, V0	// V0 = in1 ^ in2
+	VXORV	V0, V2, V0	// V0 = in1 ^ in2 ^ t
+	VMOVQ	V0, (R4)
+	ADDV	$16, R4
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R7
+	SUBV	$2, R8	// each 16-byte vector covers two words
+	BLT	R0, R8, next	// continue while 0 < R8
+	RET
+
+// func xorBlocksVX(out *block, in1 *block, in2 *block, t *block)
+//
+// XORs in1 XOR in2 XOR t into the existing contents of out, one 16-byte
+// vector per iteration.
+TEXT ·xorBlocksVX(SB), NOSPLIT, $0-32
+	MOVV	out+0(FP), R4
+	MOVV	in1+8(FP), R5
+	MOVV	in2+16(FP), R6
+	MOVV	t+24(FP), R7
+	MOVV	$128, R8	// 64-bit words left to process
+
+next:
+	// All four loads, including the current value of out, precede the store.
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VMOVQ	(R7), V2
+	VMOVQ	(R4), V3
+	VXORV	V0, V1, V0	// V0 = in1 ^ in2
+	VXORV	V0, V2, V0	// V0 = in1 ^ in2 ^ t
+	VXORV	V0, V3, V0	// V0 = in1 ^ in2 ^ t ^ out
+	VMOVQ	V0, (R4)
+	ADDV	$16, R4
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R7
+	SUBV	$2, R8	// each 16-byte vector covers two words
+	BLT	R0, R8, next	// continue while 0 < R8
+	RET
diff --git a/argon2/blamka_ref.go b/argon2/blamka_ref.go
index 16d58c6..cf3e141 100644
--- a/argon2/blamka_ref.go
+++ b/argon2/blamka_ref.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build !amd64 || purego || !gc
+//go:build (!amd64 && !loong64) || purego || !gc
package argon2