aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Xiaolin Zhao <zhaoxiaolin@loongson.cn> 2024-12-16 11:24:59 +0800
committer abner chenc <chenguoqi@loongson.cn> 2025-04-13 20:23:16 -0700
commit 953e8095893cd9efe44a90fd07ed3cfc87bfc109 (patch)
tree e1a7da81cb274a1d75bee084b26587ecaac3c0b8
parent 18f770732fa01d5d5e1a529a5518d7b70f93d3c6 (diff)
download go-x-crypto-953e8095893cd9efe44a90fd07ed3cfc87bfc109.tar.xz
chacha20: add loong64 SIMD implementation
The performance of chacha20 has been greatly improved on 3A6000 and 3A5000.

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/chacha20
cpu: Loongson-3A6000 @ 2500.00MHz
                 |  bench.old   |              bench.new               |
                 |    sec/op    |    sec/op     vs base                |
ChaCha20/64         171.9n ± 0%    159.3n ± 0%   -7.33% (p=0.000 n=20)
ChaCha20/256        592.2n ± 0%    142.8n ± 0%  -75.89% (p=0.000 n=20)
ChaCha20/10x25      981.5n ± 0%    518.8n ± 0%  -47.14% (p=0.000 n=20)
ChaCha20/4096       8.991µ ± 0%    1.732µ ± 0%  -80.74% (p=0.000 n=20)
ChaCha20/100x40    10.651µ ± 0%    5.135µ ± 0%  -51.79% (p=0.000 n=20)
ChaCha20/65536     143.43µ ± 0%    28.76µ ± 0%  -79.95% (p=0.000 n=20)
ChaCha20/1000x65   146.17µ ± 0%    37.13µ ± 0%  -74.60% (p=0.000 n=20)
geomean             5.721µ         1.962µ       -65.70%

                 |  bench.old   |               bench.new                |
                 |     B/s      |      B/s       vs base                 |
ChaCha20/64        355.1Mi ± 0%    383.1Mi ± 0%    +7.89% (p=0.000 n=20)
ChaCha20/256       412.2Mi ± 0%   1710.2Mi ± 0%  +314.86% (p=0.000 n=20)
ChaCha20/10x25     242.9Mi ± 0%    459.6Mi ± 0%   +89.19% (p=0.000 n=20)
ChaCha20/4096      434.5Mi ± 0%   2255.8Mi ± 0%  +419.22% (p=0.000 n=20)
ChaCha20/100x40    358.1Mi ± 0%    742.9Mi ± 0%  +107.44% (p=0.000 n=20)
ChaCha20/65536     435.8Mi ± 0%   2173.2Mi ± 0%  +398.72% (p=0.000 n=20)
ChaCha20/1000x65   424.1Mi ± 0%   1669.4Mi ± 0%  +293.64% (p=0.000 n=20)
geomean            373.9Mi         1.065Gi       +191.55%

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/chacha20
cpu: Loongson-3A5000 @ 2500.00MHz
                 |  bench.old   |              bench.new               |
                 |    sec/op    |    sec/op     vs base                |
ChaCha20/64         234.5n ± 0%    295.8n ± 0%  +26.14% (p=0.000 n=20)
ChaCha20/256        782.0n ± 0%    274.6n ± 0%  -64.88% (p=0.000 n=20)
ChaCha20/10x25     1340.0n ± 0%    752.7n ± 0%  -43.83% (p=0.000 n=20)
ChaCha20/4096      11.744µ ± 0%    3.455µ ± 0%  -70.58% (p=0.000 n=20)
ChaCha20/100x40    14.151µ ± 0%    7.435µ ± 0%  -47.46% (p=0.000 n=20)
ChaCha20/65536     188.05µ ± 0%    54.33µ ± 0%  -71.11% (p=0.000 n=20)
ChaCha20/1000x65   191.44µ ± 0%    66.29µ ± 0%  -65.37% (p=0.000 n=20)
geomean             7.604µ         3.436µ       -54.81%

                 |  bench.old   |               bench.new                |
                 |     B/s      |      B/s       vs base                 |
ChaCha20/64        260.3Mi ± 0%    206.3Mi ± 0%   -20.73% (p=0.000 n=20)
ChaCha20/256       312.2Mi ± 0%    888.9Mi ± 0%  +184.75% (p=0.000 n=20)
ChaCha20/10x25     177.9Mi ± 0%    316.8Mi ± 0%   +78.08% (p=0.000 n=20)
ChaCha20/4096      332.6Mi ± 0%   1130.8Mi ± 0%  +239.95% (p=0.000 n=20)
ChaCha20/100x40    269.6Mi ± 0%    513.1Mi ± 0%   +90.34% (p=0.000 n=20)
ChaCha20/65536     332.4Mi ± 0%   1150.5Mi ± 0%  +246.16% (p=0.000 n=20)
ChaCha20/1000x65   323.8Mi ± 0%    935.2Mi ± 0%  +188.81% (p=0.000 n=20)
geomean            281.3Mi         622.6Mi       +121.31%

Change-Id: I5386f2029122076c1d22a04610567e3df23877cd
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/636257
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Carlos Amedee <carlos@golang.org>
-rw-r--r-- chacha20/chacha_loong64.go 22
-rw-r--r-- chacha20/chacha_loong64.s 374
-rw-r--r-- chacha20/chacha_noasm.go 2
3 files changed, 397 insertions, 1 deletions
diff --git a/chacha20/chacha_loong64.go b/chacha20/chacha_loong64.go
new file mode 100644
index 0000000..0d9547e
--- /dev/null
+++ b/chacha20/chacha_loong64.go
@@ -0,0 +1,22 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+package chacha20
+
+import "golang.org/x/sys/cpu"
+
+// bufSize is 256, the same number of bytes xorKeyStreamVX handles per loop
+// pass in chacha_loong64.s. Its users are outside this file — presumably the
+// shared buffering code; confirm there.
+const bufSize = 256
+
+// xorKeyStreamVX is implemented in chacha_loong64.s. It XORs src with the
+// key stream into dst, 256 bytes per loop pass, and adds 4 to *counter after
+// each pass. The assembly loops until the remaining length is exactly zero,
+// so len(src) must be a non-zero multiple of 256. len(dst) is not read.
+//
+//go:noescape
+func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
+
+// xorKeyStreamBlocks XORs src with the key stream and writes the result to
+// dst. When the CPU does not report LSX support the work goes to
+// xorKeyStreamBlocksGeneric; otherwise it is handed to the assembly routine
+// xorKeyStreamVX together with the cipher's key, nonce and counter.
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+	if !cpu.Loong64.HasLSX {
+		c.xorKeyStreamBlocksGeneric(dst, src)
+		return
+	}
+	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
+}
diff --git a/chacha20/chacha_loong64.s b/chacha20/chacha_loong64.s
new file mode 100644
index 0000000..831ebf4
--- /dev/null
+++ b/chacha20/chacha_loong64.s
@@ -0,0 +1,374 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// derived from chacha_arm64.s
+
+//go:build gc && !purego
+
+#include "textflag.h"
+
+// ChaCha constants: the ASCII string "expand 32-byte k" stored as four
+// little-endian 32-bit words. The symbol is declared with 32 bytes, of which
+// only the first 16 are given values here.
+DATA ·constants+0x00(SB)/4, $0x61707865
+DATA ·constants+0x04(SB)/4, $0x3320646e
+DATA ·constants+0x08(SB)/4, $0x79622d32
+DATA ·constants+0x0c(SB)/4, $0x6b206574
+GLOBL ·constants(SB), NOPTR|RODATA, $32
+
+// Per-lane counter offsets 0, 1, 2, 3. xorKeyStreamVX adds this vector to the
+// replicated block counter so the four 32-bit lanes use consecutive counters.
+// As with the constants, 32 bytes are declared but only the first 16 are
+// initialized here.
+DATA ·incRotMatrix+0x00(SB)/4, $0x00000000
+DATA ·incRotMatrix+0x04(SB)/4, $0x00000001
+DATA ·incRotMatrix+0x08(SB)/4, $0x00000002
+DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003
+GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32
+
+// NUM_ROUNDS is the trip count of the inner "chacha" loop in xorKeyStreamVX.
+// Each trip performs one column round followed by one diagonal round.
+#define NUM_ROUNDS 10
+
+// xorKeyStreamVX XORs src with the ChaCha20 key stream and stores the result
+// in dst, 256 bytes per pass of the outer loop. During the rounds every
+// vector register V0..V15 carries one state word replicated across its four
+// 32-bit lanes; the lanes differ only in the block counter (counter+0 ..
+// counter+3). After each pass the 32-bit value at *counter is advanced by 4.
+//
+// The outer loop runs its body before testing the remaining length and exits
+// only when that length reaches exactly zero, so len(src) must be a non-zero
+// multiple of 256. len(dst) is never read.
+//
+// Register use:
+//   R4  dst pointer        R5  src pointer      R6  bytes remaining
+//   R7  key pointer        R8  nonce pointer    R9  counter pointer
+//   R10 &constants         R11 &incRotMatrix    R12 scalar counter value
+//   R15 inner-loop trip counter
+//
+// Each WORD directive below is the raw encoding of the VLDREPL.W instruction
+// named in the comment directly above it (load one 32-bit word and replicate
+// it to all lanes; the immediate is a word index from the base register).
+//
+// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
+TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
+ MOVV dst+0(FP), R4
+ MOVV src+24(FP), R5
+ MOVV src_len+32(FP), R6
+ MOVV key+48(FP), R7
+ MOVV nonce+56(FP), R8
+ MOVV counter+64(FP), R9
+
+ MOVV $·constants(SB), R10
+ MOVV $·incRotMatrix(SB), R11
+
+ MOVW (R9), R12 // scalar copy of the counter; bumped and stored back at the end of each pass
+
+loop:
+ MOVV $NUM_ROUNDS, R15
+ // load the four 32-bit lane offsets from incRotMatrix; they are added to
+ // the counter below. V30 is overwritten later in the pass, so it is
+ // reloaded on every iteration.
+ VMOVQ (R11), V30
+
+ // load constants
+ // VLDREPL.W $0, R10, V0
+ WORD $0x30200140
+ // VLDREPL.W $1, R10, V1
+ WORD $0x30200541
+ // VLDREPL.W $2, R10, V2
+ WORD $0x30200942
+ // VLDREPL.W $3, R10, V3
+ WORD $0x30200d43
+
+ // load keys
+ // VLDREPL.W $0, R7, V4
+ WORD $0x302000e4
+ // VLDREPL.W $1, R7, V5
+ WORD $0x302004e5
+ // VLDREPL.W $2, R7, V6
+ WORD $0x302008e6
+ // VLDREPL.W $3, R7, V7
+ WORD $0x30200ce7
+ // VLDREPL.W $4, R7, V8
+ WORD $0x302010e8
+ // VLDREPL.W $5, R7, V9
+ WORD $0x302014e9
+ // VLDREPL.W $6, R7, V10
+ WORD $0x302018ea
+ // VLDREPL.W $7, R7, V11
+ WORD $0x30201ceb
+
+ // load counter + nonce
+ // VLDREPL.W $0, R9, V12
+ WORD $0x3020012c
+
+ // VLDREPL.W $0, R8, V13
+ WORD $0x3020010d
+ // VLDREPL.W $1, R8, V14
+ WORD $0x3020050e
+ // VLDREPL.W $2, R8, V15
+ WORD $0x3020090f
+
+ // give each lane its own counter: V12 = counter + {0, 1, 2, 3}
+ VADDW V30, V12, V12
+
+chacha:
+ // Column round: quarter rounds on (V0,V4,V8,V12), (V1,V5,V9,V13),
+ // (V2,V6,V10,V14), (V3,V7,V11,V15). VROTRW rotates right, so a left
+ // rotation by n is written as a right rotation by 32-n.
+
+ // V0..V3 += V4..V7
+ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
+ VADDW V0, V4, V0
+ VADDW V1, V5, V1
+ VADDW V2, V6, V2
+ VADDW V3, V7, V3
+ VXORV V12, V0, V12
+ VXORV V13, V1, V13
+ VXORV V14, V2, V14
+ VXORV V15, V3, V15
+ VROTRW $16, V12, V12
+ VROTRW $16, V13, V13
+ VROTRW $16, V14, V14
+ VROTRW $16, V15, V15
+
+ // V8..V11 += V12..V15
+ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
+ VADDW V8, V12, V8
+ VADDW V9, V13, V9
+ VADDW V10, V14, V10
+ VADDW V11, V15, V11
+ VXORV V4, V8, V4
+ VXORV V5, V9, V5
+ VXORV V6, V10, V6
+ VXORV V7, V11, V7
+ VROTRW $20, V4, V4
+ VROTRW $20, V5, V5
+ VROTRW $20, V6, V6
+ VROTRW $20, V7, V7
+
+ // V0..V3 += V4..V7
+ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
+ VADDW V0, V4, V0
+ VADDW V1, V5, V1
+ VADDW V2, V6, V2
+ VADDW V3, V7, V3
+ VXORV V12, V0, V12
+ VXORV V13, V1, V13
+ VXORV V14, V2, V14
+ VXORV V15, V3, V15
+ VROTRW $24, V12, V12
+ VROTRW $24, V13, V13
+ VROTRW $24, V14, V14
+ VROTRW $24, V15, V15
+
+ // V8..V11 += V12..V15
+ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
+ VADDW V12, V8, V8
+ VADDW V13, V9, V9
+ VADDW V14, V10, V10
+ VADDW V15, V11, V11
+ VXORV V4, V8, V4
+ VXORV V5, V9, V5
+ VXORV V6, V10, V6
+ VXORV V7, V11, V7
+ VROTRW $25, V4, V4
+ VROTRW $25, V5, V5
+ VROTRW $25, V6, V6
+ VROTRW $25, V7, V7
+
+ // Diagonal round: quarter rounds on (V0,V5,V10,V15), (V1,V6,V11,V12),
+ // (V2,V7,V8,V13), (V3,V4,V9,V14).
+
+ // V0..V3 += V5..V7, V4
+ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
+ VADDW V0, V5, V0
+ VADDW V1, V6, V1
+ VADDW V2, V7, V2
+ VADDW V3, V4, V3
+ VXORV V15, V0, V15
+ VXORV V12, V1, V12
+ VXORV V13, V2, V13
+ VXORV V14, V3, V14
+ VROTRW $16, V15, V15
+ VROTRW $16, V12, V12
+ VROTRW $16, V13, V13
+ VROTRW $16, V14, V14
+
+ // V10,V11,V8,V9 += V15,V12,V13,V14
+ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 12)
+ VADDW V10, V15, V10
+ VADDW V11, V12, V11
+ VADDW V8, V13, V8
+ VADDW V9, V14, V9
+ VXORV V5, V10, V5
+ VXORV V6, V11, V6
+ VXORV V7, V8, V7
+ VXORV V4, V9, V4
+ VROTRW $20, V5, V5
+ VROTRW $20, V6, V6
+ VROTRW $20, V7, V7
+ VROTRW $20, V4, V4
+
+ // V0..V3 += V5..V7, V4
+ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 8)
+ VADDW V5, V0, V0
+ VADDW V6, V1, V1
+ VADDW V7, V2, V2
+ VADDW V4, V3, V3
+ VXORV V15, V0, V15
+ VXORV V12, V1, V12
+ VXORV V13, V2, V13
+ VXORV V14, V3, V14
+ VROTRW $24, V15, V15
+ VROTRW $24, V12, V12
+ VROTRW $24, V13, V13
+ VROTRW $24, V14, V14
+
+ // V10,V11,V8,V9 += V15,V12,V13,V14
+ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 7)
+ VADDW V15, V10, V10
+ VADDW V12, V11, V11
+ VADDW V13, V8, V8
+ VADDW V14, V9, V9
+ VXORV V5, V10, V5
+ VXORV V6, V11, V6
+ VXORV V7, V8, V7
+ VXORV V4, V9, V4
+ VROTRW $25, V5, V5
+ VROTRW $25, V6, V6
+ VROTRW $25, V7, V7
+ VROTRW $25, V4, V4
+
+ // repeat until NUM_ROUNDS trips are done
+ SUBV $1, R15
+ BNE R15, R0, chacha
+
+ // reload the original constants
+ // VLDREPL.W $0, R10, V16
+ WORD $0x30200150
+ // VLDREPL.W $1, R10, V17
+ WORD $0x30200551
+ // VLDREPL.W $2, R10, V18
+ WORD $0x30200952
+ // VLDREPL.W $3, R10, V19
+ WORD $0x30200d53
+
+ // reload the original key words
+ // VLDREPL.W $0, R7, V20
+ WORD $0x302000f4
+ // VLDREPL.W $1, R7, V21
+ WORD $0x302004f5
+ // VLDREPL.W $2, R7, V22
+ WORD $0x302008f6
+ // VLDREPL.W $3, R7, V23
+ WORD $0x30200cf7
+ // VLDREPL.W $4, R7, V24
+ WORD $0x302010f8
+ // VLDREPL.W $5, R7, V25
+ WORD $0x302014f9
+ // VLDREPL.W $6, R7, V26
+ WORD $0x302018fa
+ // VLDREPL.W $7, R7, V27
+ WORD $0x30201cfb
+
+ // add back the initial state to generate the key stream
+ VADDW V30, V12, V12 // add the lane offsets now: V30 is reused for nonce word 1 just below
+ VADDW V16, V0, V0
+ VADDW V17, V1, V1
+ VADDW V18, V2, V2
+ VADDW V19, V3, V3
+
+ // reload the original counter + nonce (the counter in memory is still
+ // this pass's base value; it is only stored back at the end of the pass)
+ // VLDREPL.W $0, R9, V28
+ WORD $0x3020013c
+ // VLDREPL.W $0, R8, V29
+ WORD $0x3020011d
+ // VLDREPL.W $1, R8, V30
+ WORD $0x3020051e
+ // VLDREPL.W $2, R8, V31
+ WORD $0x3020091f
+
+ VADDW V20, V4, V4
+ VADDW V21, V5, V5
+ VADDW V22, V6, V6
+ VADDW V23, V7, V7
+ VADDW V24, V8, V8
+ VADDW V25, V9, V9
+ VADDW V26, V10, V10
+ VADDW V27, V11, V11
+ VADDW V28, V12, V12
+ VADDW V29, V13, V13
+ VADDW V30, V14, V14
+ VADDW V31, V15, V15
+
+ // shuffle: interleave 32-bit words (VILV?W) and then 64-bit halves
+ // (VILV?V) to regroup the key stream. Afterwards V0..V3 are XORed with
+ // src bytes 0-63, V4..V7 with 64-127, V8..V11 with 128-191 and
+ // V12..V15 with 192-255. The src loads are interleaved with the
+ // second interleave stage as soon as their destination registers are free.
+ VILVLW V0, V1, V16
+ VILVHW V0, V1, V17
+ VILVLW V2, V3, V18
+ VILVHW V2, V3, V19
+ VILVLW V4, V5 ,V20
+ VILVHW V4, V5, V21
+ VILVLW V6, V7, V22
+ VILVHW V6, V7, V23
+ VILVLW V8, V9, V24
+ VILVHW V8, V9, V25
+ VILVLW V10, V11, V26
+ VILVHW V10, V11, V27
+ VILVLW V12, V13, V28
+ VILVHW V12, V13, V29
+ VILVLW V14, V15, V30
+ VILVHW V14, V15, V31
+ VILVLV V16, V18, V0
+ VILVHV V16, V18, V4
+ VILVLV V17, V19, V8
+ VILVHV V17, V19, V12
+
+ // load src data from R5
+ VMOVQ 0(R5), V16
+ VMOVQ 16(R5), V17
+ VMOVQ 32(R5), V18
+ VMOVQ 48(R5), V19
+
+ VILVLV V20, V22, V1
+ VILVHV V20, V22, V5
+ VILVLV V21, V23, V9
+ VILVHV V21, V23, V13
+
+ VMOVQ 64(R5), V20
+ VMOVQ 80(R5), V21
+ VMOVQ 96(R5), V22
+ VMOVQ 112(R5), V23
+
+ VILVLV V24, V26, V2
+ VILVHV V24, V26, V6
+ VILVLV V25, V27, V10
+ VILVHV V25, V27, V14
+
+ VMOVQ 128(R5), V24
+ VMOVQ 144(R5), V25
+ VMOVQ 160(R5), V26
+ VMOVQ 176(R5), V27
+
+ VILVLV V28, V30, V3
+ VILVHV V28, V30, V7
+ VILVLV V29, V31, V11
+ VILVHV V29, V31, V15
+
+ VMOVQ 192(R5), V28
+ VMOVQ 208(R5), V29
+ VMOVQ 224(R5), V30
+ VMOVQ 240(R5), V31
+
+ // XOR the key stream into the src data and store to dst, 64 bytes at a time
+ VXORV V0, V16, V16
+ VXORV V1, V17, V17
+ VXORV V2, V18, V18
+ VXORV V3, V19, V19
+
+ VMOVQ V16, 0(R4)
+ VMOVQ V17, 16(R4)
+ VMOVQ V18, 32(R4)
+ VMOVQ V19, 48(R4)
+
+ VXORV V4, V20, V20
+ VXORV V5, V21, V21
+ VXORV V6, V22, V22
+ VXORV V7, V23, V23
+
+ VMOVQ V20, 64(R4)
+ VMOVQ V21, 80(R4)
+ VMOVQ V22, 96(R4)
+ VMOVQ V23, 112(R4)
+
+ VXORV V8, V24, V24
+ VXORV V9, V25, V25
+ VXORV V10, V26, V26
+ VXORV V11, V27, V27
+
+ VMOVQ V24, 128(R4)
+ VMOVQ V25, 144(R4)
+ VMOVQ V26, 160(R4)
+ VMOVQ V27, 176(R4)
+
+ VXORV V12, V28, V28
+ VXORV V13, V29, V29
+ VXORV V14, V30, V30
+ VXORV V15, V31, V31
+
+ VMOVQ V28, 192(R4)
+ VMOVQ V29, 208(R4)
+ VMOVQ V30, 224(R4)
+ VMOVQ V31, 240(R4)
+
+ ADD $4, R12, R12 // four counter values were consumed by this pass
+ MOVW R12, (R9) // store the updated counter back through the counter pointer
+
+ // advance dst/src by 256 bytes; loop until the remaining length is exactly zero
+ ADDV $256, R4, R4
+ ADDV $256, R5, R5
+ SUBV $256, R6, R6
+ BNE R6, R0, loop
+
+ RET
diff --git a/chacha20/chacha_noasm.go b/chacha20/chacha_noasm.go
index c709b72..3853cc0 100644
--- a/chacha20/chacha_noasm.go
+++ b/chacha20/chacha_noasm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
+//go:build (!arm64 && !loong64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
package chacha20