aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorXiaolin Zhao <zhaoxiaolin@loongson.cn>2025-04-07 15:25:31 +0800
committerGopher Robot <gobot@golang.org>2025-04-11 13:55:20 -0700
commit18f770732fa01d5d5e1a529a5518d7b70f93d3c6 (patch)
tree411e6e1da95c71a94ef5cbe75517792ef48b2ff1
parent2ebaafcdf5677d2f056d0e8b9a8695e58d4feea7 (diff)
downloadgo-x-crypto-18f770732fa01d5d5e1a529a5518d7b70f93d3c6.tar.xz
salsa20: add loong64 SIMD implementation
The performance gains on Loongson 3A6000 and 3A5000 are as follows: goos: linux goarch: loong64 pkg: golang.org/x/crypto/salsa20 cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | XOR1K 3175.0n ± 0% 435.4n ± 0% -86.29% (p=0.000 n=20) | bench.old | bench.new | | B/s | B/s vs base | XOR1K 307.6Mi ± 0% 2242.7Mi ± 0% +629.13% (p=0.000 n=20) goos: linux goarch: loong64 pkg: golang.org/x/crypto/salsa20 cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | XOR1K 4125.0n ± 0% 864.0n ± 0% -79.05% (p=0.000 n=20) | bench.old | bench.new | | B/s | B/s vs base | XOR1K 236.7Mi ± 0% 1130.3Mi ± 0% +377.41% (p=0.000 n=20) Change-Id: Ib37f603e6654f1e3837985fad4b6dee10b5af993 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/663375 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Carlos Amedee <carlos@golang.org>
-rw-r--r--salsa20/salsa/salsa20_loong64.go29
-rw-r--r--salsa20/salsa/salsa20_loong64.s482
-rw-r--r--salsa20/salsa/salsa20_noasm.go2
-rw-r--r--salsa20/salsa/salsa20_test.go (renamed from salsa20/salsa/salsa20_amd64_test.go)2
4 files changed, 513 insertions, 2 deletions
diff --git a/salsa20/salsa/salsa20_loong64.go b/salsa20/salsa/salsa20_loong64.go
new file mode 100644
index 0000000..8c7d867
--- /dev/null
+++ b/salsa20/salsa/salsa20_loong64.go
@@ -0,0 +1,29 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build loong64 && !purego && gc
+
+package salsa
+
+import "golang.org/x/sys/cpu"
+
+// XORKeyStreamVX is implemented in salsa20_loong64.s.
+//
+// It XORs n bytes read from in with the Salsa20 key stream and stores the
+// result to out. counter points at the 16 raw counter bytes (nonce followed
+// by the block counter) and key at the 32 key bytes. The counter bytes in
+// memory are only read, never updated.
+//
+// The routine uses LSX vector instructions, so callers must check
+// cpu.Loong64.HasLSX before calling it. The parameter names match the
+// name+offset(FP) references used by the assembly.
+//
+//go:noescape
+func XORKeyStreamVX(out, in *byte, n uint64, counter, key *byte)
+
+// XORKeyStream XORs the bytes of in with the Salsa20 key stream selected by
+// key and counter and writes the result to out. In and out must overlap
+// entirely or not at all. Counter contains the raw salsa20 counter bytes
+// (both nonce and block counter).
+//
+// The LSX assembly routine is used when the CPU supports it; otherwise the
+// portable implementation runs.
+func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
+	n := len(in)
+	if n == 0 {
+		return
+	}
+	// Panics up front if out is shorter than in, before any byte is written.
+	_ = out[n-1]
+	if !cpu.Loong64.HasLSX {
+		genericXORKeyStream(out, in, counter, key)
+		return
+	}
+	XORKeyStreamVX(&out[0], &in[0], uint64(n), &counter[0], &key[0])
+}
diff --git a/salsa20/salsa/salsa20_loong64.s b/salsa20/salsa/salsa20_loong64.s
new file mode 100644
index 0000000..dc4b501
--- /dev/null
+++ b/salsa20/salsa/salsa20_loong64.s
@@ -0,0 +1,482 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !purego && gc
+
+#include "textflag.h"
+
+// Salsa20 constant words, loaded into state words 0, 5, 10 and 15 by
+// XORKeyStreamVX. Read as little-endian bytes the four words spell the
+// ASCII string "expand 32-byte k".
+DATA ·constants+0x00(SB)/4, $0x61707865
+DATA ·constants+0x04(SB)/4, $0x3320646e
+DATA ·constants+0x08(SB)/4, $0x79622d32
+DATA ·constants+0x0c(SB)/4, $0x6b206574
+// NOTE(review): the symbol is declared as 32 bytes, but only the first 16
+// are initialised here and only words 0-3 are loaded by the code below.
+GLOBL ·constants(SB), NOPTR|RODATA, $32
+
+// Number of passes through the salsa20_256 loop. Each pass runs two groups
+// of four quarter-rounds, so 10 passes give the 20 rounds of Salsa20.
+#define NUM_ROUNDS 10
+
+// func XORKeyStreamVX(out, in *byte, n uint64, counter, key *byte)
+//
+// XORs n bytes from in with the Salsa20 key stream and stores them to out.
+// Every pass through loop256 computes four consecutive 64-byte key stream
+// blocks at once: vector register Vi holds state word i, with one block per
+// 32-bit lane. The counter bytes in memory are only read; the running block
+// counter lives in R9.
+//
+// Register usage:
+//	R4       out pointer (advanced as bytes are written)
+//	R5       in pointer (advanced as bytes are read)
+//	R6       number of bytes still to process
+//	R7       counter pointer
+//	R8       key pointer
+//	R9       64-bit block counter, loaded from counter[8:16]
+//	R10      address of ·constants
+//	R11-R14  scratch
+//	R15      remaining passes of the round loop
+//	V0-V15   working state, later the key stream
+//	V16-V31  scratch / initial state / input data
+TEXT ·XORKeyStreamVX(SB), $0-40
+	MOVV	out+0(FP), R4
+	MOVV	in+8(FP), R5
+	MOVV	n+16(FP), R6
+	MOVV	counter+24(FP), R7
+	MOVV	key+32(FP), R8
+	MOVV	$·constants(SB), R10
+
+	// Nothing to do when n <= 0 (signed comparison).
+	BGE	R0, R6, ret
+
+	MOVV	8(R7), R9	// counter[8:16]
+
+loop256:
+	MOVV	$NUM_ROUNDS, R15
+	VXORV	V30, V30, V30	// V30 = 0
+
+	// load constants (state words 0, 5, 10, 15), replicated to all lanes
+	// VLDREPL.W $0, R10, V0
+	WORD	$0x30200140
+	// VLDREPL.W $1, R10, V5
+	WORD	$0x30200545
+	// VLDREPL.W $2, R10, V10
+	WORD	$0x3020094a
+	// VLDREPL.W $3, R10, V15
+	WORD	$0x30200d4f
+
+	// load keys (state words 1-4 and 11-14), replicated to all lanes
+	// VLDREPL.W $0, R8, V1
+	WORD	$0x30200101
+	// VLDREPL.W $1, R8, V2
+	WORD	$0x30200502
+	// VLDREPL.W $2, R8, V3
+	WORD	$0x30200903
+	// VLDREPL.W $3, R8, V4
+	WORD	$0x30200d04
+	// VLDREPL.W $4, R8, V11
+	WORD	$0x3020110b
+	// VLDREPL.W $5, R8, V12
+	WORD	$0x3020150c
+	// VLDREPL.W $6, R8, V13
+	WORD	$0x3020190d
+	// VLDREPL.W $7, R8, V14
+	WORD	$0x30201d0e
+
+	// load and update counter
+	// counter[0:8] goes to state words 6 and 7, identical in every lane.
+	// VLDREPL.W $0, R7, V6
+	WORD	$0x302000e6
+	// VLDREPL.W $1, R7, V7
+	WORD	$0x302004e7
+
+	// Lane i gets block counter R9+i: the low 32 bits go to state word 8
+	// (V8) and the high 32 bits to state word 9 (V9).
+	ADDV	$1, R9, R11
+	ADDV	$2, R9, R12
+	ADDV	$3, R9, R13
+	VMOVQ	R9, V8.W[0]
+	VMOVQ	R11, V8.W[1]
+	VMOVQ	R12, V8.W[2]
+	VMOVQ	R13, V8.W[3]
+	SRLV	$32, R9, R14
+	SRLV	$32, R11, R11
+	SRLV	$32, R12, R12
+	SRLV	$32, R13, R13
+	VMOVQ	R14, V9.W[0]
+	VMOVQ	R11, V9.W[1]
+	VMOVQ	R12, V9.W[2]
+	VMOVQ	R13, V9.W[3]
+
+	// backup V8 and V9 (adding the zero vector V30 is a register copy);
+	// the per-lane counters cannot be reloaded from memory after the rounds
+	VADDV	V8, V30, V24	// V24 = V8
+	VADDV	V9, V30, V25	// V25 = V9
+
+salsa20_256:
+	// First half: quarter-rounds on state words (0,4,8,12), (5,9,13,1),
+	// (10,14,2,6) and (15,3,7,11). Rotating right by 25/23/19/14 equals
+	// rotating left by 7/9/13/18.
+	VADDW	V0, V12, V26
+	VADDW	V5, V1, V27
+	VADDW	V10, V6, V28
+	VADDW	V15, V11, V29
+	VROTRW	$25, V26, V26
+	VROTRW	$25, V27, V27
+	VROTRW	$25, V28, V28
+	VROTRW	$25, V29, V29
+	VXORV	V4, V26, V4
+	VXORV	V9, V27, V9
+	VXORV	V14, V28, V14
+	VXORV	V3, V29, V3
+	VADDW	V4, V0, V26
+	VADDW	V9, V5, V27
+	VADDW	V14, V10, V28
+	VADDW	V3, V15, V29
+	VROTRW	$23, V26, V26
+	VROTRW	$23, V27, V27
+	VROTRW	$23, V28, V28
+	VROTRW	$23, V29, V29
+	VXORV	V8, V26, V8
+	VXORV	V13, V27, V13
+	VXORV	V2, V28, V2
+	VXORV	V7, V29, V7
+	VADDW	V8, V4, V26
+	VADDW	V13, V9, V27
+	VADDW	V2, V14, V28
+	VADDW	V7, V3, V29
+	VROTRW	$19, V26, V26
+	VROTRW	$19, V27, V27
+	VROTRW	$19, V28, V28
+	VROTRW	$19, V29, V29
+	VXORV	V12, V26, V12
+	VXORV	V1, V27, V1
+	VXORV	V6, V28, V6
+	VXORV	V11, V29, V11
+	VADDW	V12, V8, V26
+	VADDW	V1, V13, V27
+	VADDW	V6, V2, V28
+	VADDW	V11, V7, V29
+	VROTRW	$14, V26, V26
+	VROTRW	$14, V27, V27
+	VROTRW	$14, V28, V28
+	VROTRW	$14, V29, V29
+	VXORV	V0, V26, V0
+	VXORV	V5, V27, V5
+	VXORV	V10, V28, V10
+	VXORV	V15, V29, V15
+
+	// Second half: quarter-rounds on state words (0,1,2,3), (5,6,7,4),
+	// (10,11,8,9) and (15,12,13,14).
+	VADDW	V0, V3, V26
+	VADDW	V5, V4, V27
+	VADDW	V10, V9, V28
+	VADDW	V15, V14, V29
+	VROTRW	$25, V26, V26
+	VROTRW	$25, V27, V27
+	VROTRW	$25, V28, V28
+	VROTRW	$25, V29, V29
+	VXORV	V1, V26, V1
+	VXORV	V6, V27, V6
+	VXORV	V11, V28, V11
+	VXORV	V12, V29, V12
+	VADDW	V1, V0, V26
+	VADDW	V6, V5, V27
+	VADDW	V11, V10, V28
+	VADDW	V12, V15, V29
+	VROTRW	$23, V26, V26
+	VROTRW	$23, V27, V27
+	VROTRW	$23, V28, V28
+	VROTRW	$23, V29, V29
+	VXORV	V2, V26, V2
+	VXORV	V7, V27, V7
+	VXORV	V8, V28, V8
+	VXORV	V13, V29, V13
+	VADDW	V2, V1, V26
+	VADDW	V7, V6, V27
+	VADDW	V8, V11, V28
+	VADDW	V13, V12, V29
+	VROTRW	$19, V26, V26
+	VROTRW	$19, V27, V27
+	VROTRW	$19, V28, V28
+	VROTRW	$19, V29, V29
+	VXORV	V3, V26, V3
+	VXORV	V4, V27, V4
+	VXORV	V9, V28, V9
+	VXORV	V14, V29, V14
+	VADDW	V3, V2, V26
+	VADDW	V4, V7, V27
+	VADDW	V9, V8, V28
+	VADDW	V14, V13, V29
+	VROTRW	$14, V26, V26
+	VROTRW	$14, V27, V27
+	VROTRW	$14, V28, V28
+	VROTRW	$14, V29, V29
+	VXORV	V0, V26, V0
+	VXORV	V5, V27, V5
+	VXORV	V10, V28, V10
+	VXORV	V15, V29, V15
+
+	SUBV	$1, R15
+	BNE	R15, R0, salsa20_256
+
+	// Reload the initial state into V16-V31 (state word i -> V(16+i)).
+	// Words 8 and 9 are already in V24 and V25 from the backup above.
+
+	// load original constants
+	// VLDREPL.W $0, R10, V16
+	WORD	$0x30200150
+	// VLDREPL.W $1, R10, V21
+	WORD	$0x30200555
+	// VLDREPL.W $2, R10, V26
+	WORD	$0x3020095a
+	// VLDREPL.W $3, R10, V31
+	WORD	$0x30200d5f
+
+	// load original keys
+	// VLDREPL.W $0, R8, V17
+	WORD	$0x30200111
+	// VLDREPL.W $1, R8, V18
+	WORD	$0x30200512
+	// VLDREPL.W $2, R8, V19
+	WORD	$0x30200913
+	// VLDREPL.W $3, R8, V20
+	WORD	$0x30200d14
+	// VLDREPL.W $4, R8, V27
+	WORD	$0x3020111b
+	// VLDREPL.W $5, R8, V28
+	WORD	$0x3020151c
+	// VLDREPL.W $6, R8, V29
+	WORD	$0x3020191d
+	// VLDREPL.W $7, R8, V30
+	WORD	$0x30201d1e
+
+	// load original counter
+	// VLDREPL.W $0, R7, V22
+	WORD	$0x302000f6
+	// VLDREPL.W $1, R7, V23
+	WORD	$0x302004f7
+
+	// add back the initial state to generate the key stream
+	VADDW	V0, V16, V0
+	VADDW	V1, V17, V1
+	VADDW	V2, V18, V2
+	VADDW	V3, V19, V3
+	VADDW	V4, V20, V4
+	VADDW	V5, V21, V5
+	VADDW	V6, V22, V6
+	VADDW	V7, V23, V7
+	VADDW	V8, V24, V8
+	VADDW	V9, V25, V9
+	VADDW	V10, V26, V10
+	VADDW	V11, V27, V11
+	VADDW	V12, V28, V12
+	VADDW	V13, V29, V13
+	VADDW	V14, V30, V14
+	VADDW	V15, V31, V15
+
+	// shuffle
+	// Transpose from "one state word per register, one block per lane"
+	// to the layout consumed below, where V0..V15 are XORed in order
+	// with consecutive 16-byte chunks of the input.
+	VILVLW	V0, V1, V16
+	VILVHW	V0, V1, V17
+	VILVLW	V2, V3, V18
+	VILVHW	V2, V3, V19
+	VILVLW	V4, V5, V20
+	VILVHW	V4, V5, V21
+	VILVLW	V6, V7, V22
+	VILVHW	V6, V7, V23
+	VILVLW	V8, V9, V24
+	VILVHW	V8, V9, V25
+	VILVLW	V10, V11, V26
+	VILVHW	V10, V11, V27
+	VILVLW	V12, V13, V28
+	VILVHW	V12, V13, V29
+	VILVLW	V14, V15, V30
+	VILVHW	V14, V15, V31
+	VILVLV	V16, V18, V0
+	VILVHV	V16, V18, V4
+	VILVLV	V17, V19, V8
+	VILVHV	V17, V19, V12
+	VILVLV	V20, V22, V1
+	VILVHV	V20, V22, V5
+	VILVLV	V21, V23, V9
+	VILVHV	V21, V23, V13
+	VILVLV	V24, V26, V2
+	VILVHV	V24, V26, V6
+	VILVLV	V25, V27, V10
+	VILVHV	V25, V27, V14
+	VILVLV	V28, V30, V3
+	VILVHV	V28, V30, V7
+	VILVLV	V29, V31, V11
+	VILVHV	V29, V31, V15
+
+	// Fewer than 256 bytes left: consume the fresh key stream piecewise.
+	SGTU	$256, R6, R11
+	BNE	R11, R0, less_than_256
+
+	// load src data from R5
+	VMOVQ	0(R5), V16
+	VMOVQ	16(R5), V17
+	VMOVQ	32(R5), V18
+	VMOVQ	48(R5), V19
+	VMOVQ	64(R5), V20
+	VMOVQ	80(R5), V21
+	VMOVQ	96(R5), V22
+	VMOVQ	112(R5), V23
+	VMOVQ	128(R5), V24
+	VMOVQ	144(R5), V25
+	VMOVQ	160(R5), V26
+	VMOVQ	176(R5), V27
+	VMOVQ	192(R5), V28
+	VMOVQ	208(R5), V29
+	VMOVQ	224(R5), V30
+	VMOVQ	240(R5), V31
+
+	VXORV	V0, V16, V16
+	VXORV	V1, V17, V17
+	VXORV	V2, V18, V18
+	VXORV	V3, V19, V19
+	VXORV	V4, V20, V20
+	VXORV	V5, V21, V21
+	VXORV	V6, V22, V22
+	VXORV	V7, V23, V23
+	VXORV	V8, V24, V24
+	VXORV	V9, V25, V25
+	VXORV	V10, V26, V26
+	VXORV	V11, V27, V27
+	VXORV	V12, V28, V28
+	VXORV	V13, V29, V29
+	VXORV	V14, V30, V30
+	VXORV	V15, V31, V31
+
+	VMOVQ	V16, 0(R4)
+	VMOVQ	V17, 16(R4)
+	VMOVQ	V18, 32(R4)
+	VMOVQ	V19, 48(R4)
+	VMOVQ	V20, 64(R4)
+	VMOVQ	V21, 80(R4)
+	VMOVQ	V22, 96(R4)
+	VMOVQ	V23, 112(R4)
+	VMOVQ	V24, 128(R4)
+	VMOVQ	V25, 144(R4)
+	VMOVQ	V26, 160(R4)
+	VMOVQ	V27, 176(R4)
+	VMOVQ	V28, 192(R4)
+	VMOVQ	V29, 208(R4)
+	VMOVQ	V30, 224(R4)
+	VMOVQ	V31, 240(R4)
+
+	ADDV	$4, R9, R9	// update counter
+
+	SUBV	$256, R6, R6
+	ADDV	$256, R4, R4
+	ADDV	$256, R5, R5
+
+	// All 256 key stream bytes in V0-V15 have been used. Any remaining
+	// input, whether a full 256 bytes or a shorter tail, must be XORed
+	// with freshly generated blocks, so always go back to loop256; the
+	// size check after the shuffle routes short remainders to
+	// less_than_256. Falling through here would reuse key stream.
+	BNE	R6, R0, loop256
+	RET
+
+less_than_256:
+	// Tail handling: 0 < R6 < 256. Each step below consumes the front of
+	// the key stream in V0.. and then moves the unused part down so that
+	// the next step again starts at V0.
+	VXORV	V30, V30, V30	// V30 = 0, used for register copies via VADDV
+	SGTU	$128, R6, R11
+	BNE	R11, R0, less_than_128
+	SUBV	$128, R6
+	VMOVQ	(R5), V16
+	VMOVQ	16(R5), V17
+	VMOVQ	32(R5), V18
+	VMOVQ	48(R5), V19
+	VMOVQ	64(R5), V20
+	VMOVQ	80(R5), V21
+	VMOVQ	96(R5), V22
+	VMOVQ	112(R5), V23
+	VXORV	V0, V16, V16
+	VXORV	V1, V17, V17
+	VXORV	V2, V18, V18
+	VXORV	V3, V19, V19
+	VXORV	V4, V20, V20
+	VXORV	V5, V21, V21
+	VXORV	V6, V22, V22
+	VXORV	V7, V23, V23
+	VMOVQ	V16, (R4)
+	VMOVQ	V17, 16(R4)
+	VMOVQ	V18, 32(R4)
+	VMOVQ	V19, 48(R4)
+	VMOVQ	V20, 64(R4)
+	VMOVQ	V21, 80(R4)
+	VMOVQ	V22, 96(R4)
+	VMOVQ	V23, 112(R4)
+	BEQ	R6, R0, ret
+	ADDV	$128, R5, R5
+	ADDV	$128, R4, R4
+	// V0-V7 = V8-V15
+	VADDV	V8, V30, V0
+	VADDV	V9, V30, V1
+	VADDV	V10, V30, V2
+	VADDV	V11, V30, V3
+	VADDV	V12, V30, V4
+	VADDV	V13, V30, V5
+	VADDV	V14, V30, V6
+	VADDV	V15, V30, V7
+
+less_than_128:
+	SGTU	$64, R6, R11
+	BNE	R11, R0, less_than_64
+	SUBV	$64, R6
+	VMOVQ	(R5), V16
+	VMOVQ	16(R5), V17
+	VMOVQ	32(R5), V18
+	VMOVQ	48(R5), V19
+	VXORV	V0, V16, V16
+	VXORV	V1, V17, V17
+	VXORV	V2, V18, V18
+	VXORV	V3, V19, V19
+	VMOVQ	V16, (R4)
+	VMOVQ	V17, 16(R4)
+	VMOVQ	V18, 32(R4)
+	VMOVQ	V19, 48(R4)
+	BEQ	R6, R0, ret
+	ADDV	$64, R5
+	ADDV	$64, R4
+	// V0-V3 = V4-V7
+	VADDV	V4, V30, V0
+	VADDV	V5, V30, V1
+	VADDV	V6, V30, V2
+	VADDV	V7, V30, V3
+
+less_than_64:
+	SGTU	$32, R6, R11
+	BNE	R11, R0, less_than_32
+	SUBV	$32, R6
+	VMOVQ	(R5), V16
+	VMOVQ	16(R5), V17
+	VXORV	V0, V16, V16
+	VXORV	V1, V17, V17
+	VMOVQ	V16, (R4)
+	VMOVQ	V17, 16(R4)
+	BEQ	R6, R0, ret
+	ADDV	$32, R5
+	ADDV	$32, R4
+	// V0-V1 = V2-V3
+	VADDV	V2, V30, V0
+	VADDV	V3, V30, V1
+
+less_than_32:
+	SGTU	$16, R6, R11
+	BNE	R11, R0, less_than_16
+	SUBV	$16, R6
+	VMOVQ	(R5), V16
+	VXORV	V16, V0, V16
+	VMOVQ	V16, (R4)
+	BEQ	R6, R0, ret
+	ADDV	$16, R5
+	ADDV	$16, R4
+	VADDV	V1, V30, V0	// V0 = V1
+
+less_than_16:
+	// From here on the remaining key stream lives in V0 only and is
+	// consumed through general-purpose registers: R11 is the part used
+	// now, R13 the part kept for the next step.
+	SGTU	$8, R6, R11
+	BNE	R11, R0, less_than_8
+	SUBV	$8, R6
+	VMOVQ	V0.V[0], R11
+	VMOVQ	V0.V[1], R13
+	MOVV	(R5), R12
+	XOR	R11, R12, R12
+	MOVV	R12, (R4)
+	BEQ	R6, R0, ret
+	ADDV	$8, R5
+	ADDV	$8, R4
+	VMOVQ	R13, V0.V[0]
+
+less_than_8:
+	SGTU	$4, R6, R11
+	BNE	R11, R0, less_than_4
+	SUBV	$4, R6
+	VMOVQ	V0.W[0], R11
+	VMOVQ	V0.W[1], R13
+	MOVWU	(R5), R12
+	XOR	R11, R12, R12
+	MOVWU	R12, (R4)
+	BEQ	R6, R0, ret
+	ADDV	$4, R5
+	ADDV	$4, R4
+	VMOVQ	R13, V0.W[0]
+
+less_than_4:
+	SGTU	$2, R6, R11
+	BNE	R11, R0, less_than_2
+	SUBV	$2, R6
+	VMOVQ	V0.H[0], R11
+	VMOVQ	V0.H[1], R13
+	MOVHU	(R5), R12
+	XOR	R11, R12, R12
+	MOVHU	R12, (R4)
+	BEQ	R6, R0, ret
+	ADDV	$2, R5
+	ADDV	$2, R4
+	VMOVQ	R13, V0.H[0]
+
+less_than_2:
+	// Exactly one byte is left: every earlier step returns as soon as
+	// R6 reaches zero.
+	VMOVQ	V0.B[0], R11
+	MOVBU	(R5), R12
+	XOR	R11, R12, R12
+	MOVBU	R12, (R4)
+
+ret:
+	RET
diff --git a/salsa20/salsa/salsa20_noasm.go b/salsa20/salsa/salsa20_noasm.go
index 9448760..64e262d 100644
--- a/salsa20/salsa/salsa20_noasm.go
+++ b/salsa20/salsa/salsa20_noasm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build !amd64 || purego || !gc
+//go:build (!amd64 && !loong64) || purego || !gc
package salsa
diff --git a/salsa20/salsa/salsa20_amd64_test.go b/salsa20/salsa/salsa20_test.go
index fe14604..d6b29a3 100644
--- a/salsa20/salsa/salsa20_amd64_test.go
+++ b/salsa20/salsa/salsa20_test.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build amd64 && !purego && gc
+//go:build (amd64 || loong64) && !purego && gc
package salsa