diff options
| author | Xiaolin Zhao <zhaoxiaolin@loongson.cn> | 2025-03-14 15:28:49 +0800 |
|---|---|---|
| committer | abner chenc <chenguoqi@loongson.cn> | 2025-04-13 20:23:35 -0700 |
| commit | 388684e50b26e51e1428afcacb811c0732c45c01 (patch) | |
| tree | 9f3b415a3e7fbc0061601b10a8d12dfa478590d7 | |
| parent | 953e8095893cd9efe44a90fd07ed3cfc87bfc109 (diff) | |
| download | go-x-crypto-388684e50b26e51e1428afcacb811c0732c45c01.tar.xz | |
argon2: add loong64 SIMD implementation
The performance gains on Loongson 3A6000 and 3A5000 are as follows:
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/argon2
cpu: Loongson-3A6000-HV @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
Argon2i/_Time:_3_Memory:_32_MB,_Threads:_1 131.23m ± 0% 67.56m ± 1% -48.52% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_32_MB,_Threads:_1 171.28m ± 2% 90.20m ± 0% -47.34% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_32_MB,_Threads:_1 213.3m ± 0% 112.6m ± 0% -47.21% (p=0.000 n=10)
Argon2i/_Time:_3_Memory:_64_MB,_Threads:_4 269.5m ± 0% 147.2m ± 0% -45.37% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_64_MB,_Threads:_4 357.7m ± 0% 195.4m ± 0% -45.36% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_64_MB,_Threads:_4 449.8m ± 0% 243.8m ± 0% -45.79% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_32_MB,_Threads:_1 126.56m ± 0% 67.43m ± 0% -46.72% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_32_MB,_Threads:_1 168.57m ± 0% 90.04m ± 0% -46.58% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_32_MB,_Threads:_1 210.5m ± 0% 112.7m ± 0% -46.45% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_64_MB,_Threads:_4 264.8m ± 0% 145.0m ± 1% -45.23% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_64_MB,_Threads:_4 353.8m ± 0% 193.7m ± 0% -45.26% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_64_MB,_Threads:_4 444.4m ± 0% 242.3m ± 0% -45.49% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_32_MB,_Threads:_1 126.89m ± 0% 66.62m ± 0% -47.50% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_32_MB,_Threads:_1 169.02m ± 0% 89.07m ± 0% -47.30% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_32_MB,_Threads:_1 210.7m ± 0% 111.0m ± 0% -47.34% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_64_MB,_Threads:_4 267.6m ± 1% 145.8m ± 0% -45.51% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_64_MB,_Threads:_4 355.1m ± 0% 194.1m ± 0% -45.34% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_64_MB,_Threads:_4 443.6m ± 0% 242.5m ± 0% -45.33% (p=0.000 n=10)
geomean 240.8m 129.3m -46.32%
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/argon2
cpu: Loongson-3A5000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
Argon2i/_Time:_3_Memory:_32_MB,_Threads:_1 209.9m ± 1% 109.7m ± 2% -47.75% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_32_MB,_Threads:_1 278.1m ± 0% 143.7m ± 0% -48.34% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_32_MB,_Threads:_1 346.7m ± 0% 178.1m ± 0% -48.63% (p=0.000 n=10)
Argon2i/_Time:_3_Memory:_64_MB,_Threads:_4 455.3m ± 0% 240.8m ± 0% -47.12% (p=0.000 n=10)
Argon2i/_Time:_4_Memory:_64_MB,_Threads:_4 604.6m ± 0% 317.7m ± 0% -47.45% (p=0.000 n=10)
Argon2i/_Time:_5_Memory:_64_MB,_Threads:_4 754.8m ± 0% 395.4m ± 0% -47.61% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_32_MB,_Threads:_1 206.9m ± 1% 107.6m ± 0% -48.00% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_32_MB,_Threads:_1 274.3m ± 0% 141.8m ± 1% -48.32% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_32_MB,_Threads:_1 342.4m ± 0% 175.6m ± 0% -48.71% (p=0.000 n=10)
Argon2d/_Time:_3,_Memory:_64_MB,_Threads:_4 450.2m ± 0% 237.9m ± 0% -47.15% (p=0.000 n=10)
Argon2d/_Time:_4,_Memory:_64_MB,_Threads:_4 597.7m ± 0% 314.0m ± 0% -47.46% (p=0.000 n=10)
Argon2d/_Time:_5,_Memory:_64_MB,_Threads:_4 745.8m ± 0% 390.7m ± 1% -47.61% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_32_MB,_Threads:_1 207.6m ± 0% 107.9m ± 0% -48.05% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_32_MB,_Threads:_1 275.0m ± 0% 142.0m ± 0% -48.34% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_32_MB,_Threads:_1 342.9m ± 1% 176.0m ± 0% -48.66% (p=0.000 n=10)
Argon2id/_Time:_3,_Memory:_64_MB,_Threads:_4 450.6m ± 1% 238.5m ± 0% -47.07% (p=0.000 n=10)
Argon2id/_Time:_4,_Memory:_64_MB,_Threads:_4 598.5m ± 1% 314.6m ± 0% -47.44% (p=0.000 n=10)
Argon2id/_Time:_5,_Memory:_64_MB,_Threads:_4 746.4m ± 0% 391.0m ± 0% -47.61% (p=0.000 n=10)
geomean 398.6m 207.9m -47.86%
Change-Id: Iaa9d134d68dd2f0972fc5768d7e66f7b1ff0ebd3
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/657795
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
| -rw-r--r-- | argon2/blamka_loong64.go | 59 | ||||
| -rw-r--r-- | argon2/blamka_loong64.s | 258 | ||||
| -rw-r--r-- | argon2/blamka_ref.go | 2 |
3 files changed, 318 insertions, 1 deletions
diff --git a/argon2/blamka_loong64.go b/argon2/blamka_loong64.go
new file mode 100644
index 0000000..1b43a2e
--- /dev/null
+++ b/argon2/blamka_loong64.go
@@ -0,0 +1,105 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build loong64 && gc && !purego
+
+package argon2
+
+import "golang.org/x/sys/cpu"
+
+// mixBlocks1VX stores in1 XOR in2 into t. It is implemented in
+// blamka_loong64.s.
+//
+//go:noescape
+func mixBlocks1VX(t, in1, in2 *block)
+
+// mixBlocks2VX stores in1 XOR in2 XOR t into out. It is implemented in
+// blamka_loong64.s.
+//
+//go:noescape
+func mixBlocks2VX(out, in1, in2, t *block)
+
+// xorBlocksVX XORs in1 XOR in2 XOR t into the existing contents of out. It is
+// implemented in blamka_loong64.s.
+//
+//go:noescape
+func xorBlocksVX(out, in1, in2, t *block)
+
+// blamkaVX applies the BlaMka rounds to b in place. It is implemented in
+// blamka_loong64.s.
+//
+//go:noescape
+func blamkaVX(b *block)
+
+// processBlockVX mixes in1 and in2 into a scratch block, runs the BlaMka
+// rounds over it, and combines the result with in1 and in2 into out. When
+// xor is true the result is XORed into the existing contents of out;
+// otherwise out is overwritten.
+//
+// All of the *VX routines use LSX vector instructions, so none of them may
+// run unless the CPU reports LSX support. Without LSX the whole block is
+// processed in portable Go instead.
+func processBlockVX(out, in1, in2 *block, xor bool) {
+	if !cpu.Loong64.HasLSX {
+		processBlockNoLSX(out, in1, in2, xor)
+		return
+	}
+
+	var t block
+	mixBlocks1VX(&t, in1, in2)
+	blamkaVX(&t)
+	if xor {
+		xorBlocksVX(out, in1, in2, &t)
+	} else {
+		mixBlocks2VX(out, in1, in2, &t)
+	}
+}
+
+// processBlockNoLSX mirrors the LSX path of processBlockVX in pure Go, for
+// CPUs that do not provide LSX.
+func processBlockNoLSX(out, in1, in2 *block, xor bool) {
+	var t block
+	for i := range t {
+		t[i] = in1[i] ^ in2[i]
+	}
+
+	// First pass: one round over each run of 16 consecutive words.
+	for i := 0; i < blockLength; i += 16 {
+		blamkaGeneric(
+			&t[i+0], &t[i+1], &t[i+2], &t[i+3],
+			&t[i+4], &t[i+5], &t[i+6], &t[i+7],
+			&t[i+8], &t[i+9], &t[i+10], &t[i+11],
+			&t[i+12], &t[i+13], &t[i+14], &t[i+15],
+		)
+	}
+	// Second pass: one round over word pairs spaced 16 words apart.
+	for i := 0; i < blockLength/8; i += 2 {
+		blamkaGeneric(
+			&t[i], &t[i+1], &t[16+i], &t[16+i+1],
+			&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
+			&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
+			&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
+		)
+	}
+
+	if xor {
+		for i := range t {
+			out[i] ^= in1[i] ^ in2[i] ^ t[i]
+		}
+	} else {
+		for i := range t {
+			out[i] = in1[i] ^ in2[i] ^ t[i]
+		}
+	}
+}
+
+// processBlock overwrites out with the result of processing in1 and in2.
+func processBlock(out, in1, in2 *block) {
+	processBlockVX(out, in1, in2, false)
+}
+
+// processBlockXOR XORs the result of processing in1 and in2 into out.
+func processBlockXOR(out, in1, in2 *block) {
+	processBlockVX(out, in1, in2, true)
+}
diff --git a/argon2/blamka_loong64.s b/argon2/blamka_loong64.s
new file mode 100644
index 0000000..c380d77
--- /dev/null
+++ b/argon2/blamka_loong64.s
@@ -0,0 +1,258 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build loong64 && gc && !purego
+
+#include "textflag.h"
+
+#define BLAMKA_ROUND \
+	VMULWEVVWU	V0, V2, V8; \ // first pass, V0/V2/V4/V6 group: V8 receives the product and is added twice below
+	VADDV	V2, V0, V0; \
+	VADDV	V0, V8, V0; \
+	VADDV	V0, V8, V0; \
+	VXORV	V6, V0, V6; \
+	VROTRV	$32, V6, V6; \
+	VMULWEVVWU	V4, V6, V8; \
+	VADDV	V4, V6, V4; \
+	VADDV	V4, V8, V4; \
+	VADDV	V4, V8, V4; \
+	VXORV	V2, V4, V2; \
+	VROTRV	$24, V2, V2; \
+	VMULWEVVWU	V0, V2, V8; \
+	VADDV	V0, V2, V0; \
+	VADDV	V0, V8, V0; \
+	VADDV	V0, V8, V0; \
+	VXORV	V6, V0, V6; \
+	VROTRV	$16, V6, V6; \
+	VMULWEVVWU	V4, V6, V8; \
+	VADDV	V4, V6, V4; \
+	VADDV	V4, V8, V4; \
+	VADDV	V4, V8, V4; \
+	VXORV	V2, V4, V2; \
+	VROTRV	$63, V2, V2; \
+;\
+	VMULWEVVWU	V1, V3, V8; \ // first pass, same sequence for the V1/V3/V5/V7 group
+	VADDV	V1, V3, V1; \
+	VADDV	V1, V8, V1; \
+	VADDV	V1, V8, V1; \
+	VXORV	V7, V1, V7; \
+	VROTRV	$32, V7, V7; \
+	VMULWEVVWU	V5, V7, V8; \
+	VADDV	V5, V7, V5; \
+	VADDV	V5, V8, V5; \
+	VADDV	V5, V8, V5; \
+	VXORV	V3, V5, V3; \
+	VROTRV	$24, V3, V3; \
+	VMULWEVVWU	V1, V3, V8; \
+	VADDV	V1, V3, V1; \
+	VADDV	V1, V8, V1; \
+	VADDV	V1, V8, V1; \
+	VXORV	V7, V1, V7; \
+	VROTRV	$16, V7, V7; \
+	VMULWEVVWU	V5, V7, V8; \
+	VADDV	V5, V7, V5; \
+	VADDV	V5, V8, V5; \
+	VADDV	V5, V8, V5; \
+	VXORV	V3, V5, V3; \
+	VROTRV	$63, V3, V3; \
+;\
+	VXORV	V0, V0, V8; \ // V8 = 0; it stays zero for the rest of the macro and is used to copy registers via VADDV
+	VADDV	V2, V8, V9; \ // V9 = V2
+	VADDV	V5, V8, V10; \ // V10 = V5
+	VADDV	V6, V8, V11; \ // V11 = V6
+	VADDV	V4, V8, V5; \ // V5 = V4
+	VADDV	V10, V8, V4; \ // V4 = V10, i.e. V5 as it was before the line above (V4 and V5 are swapped)
+	VSHUF4IV	$9, V3, V2; \ // rearrange V2/V3 and V6/V7 before the second pass (NOTE(review): confirm VSHUF4IV selector semantics)
+	VSHUF4IV	$9, V9, V3; \
+	VSHUF4IV	$3, V7, V6; \
+	VSHUF4IV	$3, V11, V7; \
+;\
+	VMULWEVVWU	V0, V2, V9; \ // second pass, V0/V2/V4/V6 group: V9 is the scratch register here so V8 stays zero
+	VADDV	V0, V2, V0; \
+	VADDV	V0, V9, V0; \
+	VADDV	V0, V9, V0; \
+	VXORV	V6, V0, V6; \
+	VROTRV	$32, V6, V6; \
+	VMULWEVVWU	V4, V6, V9; \
+	VADDV	V4, V6, V4; \
+	VADDV	V4, V9, V4; \
+	VADDV	V4, V9, V4; \
+	VXORV	V2, V4, V2; \
+	VROTRV	$24, V2, V2; \
+	VMULWEVVWU	V0, V2, V9; \
+	VADDV	V0, V2, V0; \
+	VADDV	V0, V9, V0; \
+	VADDV	V0, V9, V0; \
+	VXORV	V6, V0, V6; \
+	VROTRV	$16, V6, V6; \
+	VMULWEVVWU	V4, V6, V9; \
+	VADDV	V4, V6, V4; \
+	VADDV	V4, V9, V4; \
+	VADDV	V4, V9, V4; \
+	VXORV	V2, V4, V2; \
+	VROTRV	$63, V2, V2; \
+;\
+	VMULWEVVWU	V1, V3, V9; \ // second pass, same sequence for the V1/V3/V5/V7 group
+	VADDV	V1, V3, V1; \
+	VADDV	V1, V9, V1; \
+	VADDV	V1, V9, V1; \
+	VXORV	V7, V1, V7; \
+	VROTRV	$32, V7, V7; \
+	VMULWEVVWU	V5, V7, V9; \
+	VADDV	V5, V7, V5; \
+	VADDV	V5, V9, V5; \
+	VADDV	V5, V9, V5; \
+	VXORV	V3, V5, V3; \
+	VROTRV	$24, V3, V3; \
+	VMULWEVVWU	V1, V3, V9; \
+	VADDV	V1, V3, V1; \
+	VADDV	V1, V9, V1; \
+	VADDV	V1, V9, V1; \
+	VXORV	V7, V1, V7; \
+	VROTRV	$16, V7, V7; \
+	VMULWEVVWU	V5, V7, V9; \
+	VADDV	V5, V7, V5; \
+	VADDV	V5, V9, V5; \
+	VADDV	V5, V9, V5; \
+	VXORV	V3, V5, V3; \
+	VROTRV	$63, V3, V3; \
+;\
+	VADDV	V2, V8, V9; \ // V9 = V2 (V8 is still zero)
+	VADDV	V5, V8, V10; \ // V10 = V5
+	VADDV	V6, V8, V11; \ // V11 = V6
+	VADDV	V4, V8, V5; \ // V5 = V4
+	VADDV	V10, V8, V4; \ // V4 = V10, i.e. V5 as it was before the line above (V4 and V5 are swapped back)
+	VSHUF4IV	$3, V3, V2; \ // selectors $3/$9 are exchanged relative to the first shuffle; presumably this undoes it
+	VSHUF4IV	$3, V9, V3; \
+	VSHUF4IV	$9, V7, V6; \
+	VSHUF4IV	$9, V11, V7; \
+
+#define BLAMKA_ROUND1(index) \
+	VMOVQ	(index+0)(R4), V0; \ // load eight consecutive 16-byte vectors starting at byte offset index
+	VMOVQ	(index+16)(R4), V1; \
+	VMOVQ	(index+32)(R4), V2; \
+	VMOVQ	(index+48)(R4), V3; \
+	VMOVQ	(index+64)(R4), V4; \
+	VMOVQ	(index+80)(R4), V5; \
+	VMOVQ	(index+96)(R4), V6; \
+	VMOVQ	(index+112)(R4), V7; \
+	BLAMKA_ROUND; \
+	VMOVQ	V0, (index+0)(R4); \ // store the eight vectors back to the addresses they were loaded from
+	VMOVQ	V1, (index+16)(R4); \
+	VMOVQ	V2, (index+32)(R4); \
+	VMOVQ	V3, (index+48)(R4); \
+	VMOVQ	V4, (index+64)(R4); \
+	VMOVQ	V5, (index+80)(R4); \
+	VMOVQ	V6, (index+96)(R4); \
+	VMOVQ	V7, (index+112)(R4); \
+
+#define BLAMKA_ROUND2(index) \
+	VMOVQ	(index+0)(R4), V0; \ // load eight 16-byte vectors spaced 128 bytes apart, starting at byte offset index
+	VMOVQ	(index+128)(R4), V1; \
+	VMOVQ	(index+256)(R4), V2; \
+	VMOVQ	(index+384)(R4), V3; \
+	VMOVQ	(index+512)(R4), V4; \
+	VMOVQ	(index+640)(R4), V5; \
+	VMOVQ	(index+768)(R4), V6; \
+	VMOVQ	(index+896)(R4), V7; \
+	BLAMKA_ROUND; \
+	VMOVQ	V0, (index+0)(R4); \ // store the eight vectors back to the addresses they were loaded from
+	VMOVQ	V1, (index+128)(R4); \
+	VMOVQ	V2, (index+256)(R4); \
+	VMOVQ	V3, (index+384)(R4); \
+	VMOVQ	V4, (index+512)(R4); \
+	VMOVQ	V5, (index+640)(R4); \
+	VMOVQ	V6, (index+768)(R4); \
+	VMOVQ	V7, (index+896)(R4); \
+
+// func blamkaVX(b *block)
+TEXT ·blamkaVX(SB), NOSPLIT, $0-8
+	MOVV	b+0(FP), R4	// R4 = b; the ROUND1 calls cover byte offsets 0..1023 in 128-byte chunks, the ROUND2 calls revisit them at a 128-byte stride
+
+	BLAMKA_ROUND1(0)
+	BLAMKA_ROUND1(128)
+	BLAMKA_ROUND1(256)
+	BLAMKA_ROUND1(384)
+	BLAMKA_ROUND1(512)
+	BLAMKA_ROUND1(640)
+	BLAMKA_ROUND1(768)
+	BLAMKA_ROUND1(896)
+
+	BLAMKA_ROUND2(0)
+	BLAMKA_ROUND2(16)
+	BLAMKA_ROUND2(32)
+	BLAMKA_ROUND2(48)
+	BLAMKA_ROUND2(64)
+	BLAMKA_ROUND2(80)
+	BLAMKA_ROUND2(96)
+	BLAMKA_ROUND2(112)
+
+	RET
+
+// func mixBlocks1VX(t *block, in1 *block, in2 *block)
+TEXT ·mixBlocks1VX(SB), NOSPLIT, $0-24
+	MOVV	t+0(FP), R4
+	MOVV	in1+8(FP), R5
+	MOVV	in2+16(FP), R6
+	MOVV	$128, R8	// R8 counts the 64-bit words still to process
+
+loop:
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VXORV	V0, V1, V2	// V2 = in1 ^ in2 for this 16-byte chunk
+	VMOVQ	V2, (R4)
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R4
+	SUBV	$2, R8	// each iteration handles 16 bytes, i.e. two 64-bit words
+	BLT	R0, R8, loop	// continue while R8 > 0
+	RET
+
+// func mixBlocks2VX(out *block, in1 *block, in2 *block, t *block)
+TEXT ·mixBlocks2VX(SB), NOSPLIT, $0-32
+	MOVV	out+0(FP), R4
+	MOVV	in1+8(FP), R5
+	MOVV	in2+16(FP), R6
+	MOVV	t+24(FP), R7
+	MOVV	$128, R8	// R8 counts the 64-bit words still to process
+
+loop:
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VMOVQ	(R7), V2
+	VXORV	V0, V1, V3
+	VXORV	V3, V2, V4	// V4 = in1 ^ in2 ^ t for this 16-byte chunk
+	VMOVQ	V4, (R4)	// out is overwritten
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R7
+	ADDV	$16, R4
+	SUBV	$2, R8	// each iteration handles 16 bytes, i.e. two 64-bit words
+	BLT	R0, R8, loop	// continue while R8 > 0
+	RET
+
+// func xorBlocksVX(out *block, in1 *block, in2 *block, t *block)
+TEXT ·xorBlocksVX(SB), NOSPLIT, $0-32
+	MOVV	out+0(FP), R4
+	MOVV	in1+8(FP), R5
+	MOVV	in2+16(FP), R6
+	MOVV	t+24(FP), R7
+	MOVV	$128, R8	// R8 counts the 64-bit words still to process
+
+loop:
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VMOVQ	(R7), V2
+	VMOVQ	(R4), V3	// current contents of out
+	VXORV	V0, V1, V4
+	VXORV	V4, V2, V5
+	VXORV	V5, V3, V6	// V6 = out ^ in1 ^ in2 ^ t for this 16-byte chunk
+	VMOVQ	V6, (R4)
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R7
+	ADDV	$16, R4
+	SUBV	$2, R8	// each iteration handles 16 bytes, i.e. two 64-bit words
+	BLT	R0, R8, loop	// continue while R8 > 0
+	RET
diff --git a/argon2/blamka_ref.go b/argon2/blamka_ref.go
index 16d58c6..cf3e141 100644
--- a/argon2/blamka_ref.go
+++ b/argon2/blamka_ref.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 || purego || !gc
+//go:build (!amd64 && !loong64) || purego || !gc
 
 package argon2
 
|
