diff options
| author | Xiaolin Zhao <zhaoxiaolin@loongson.cn> | 2024-12-16 11:24:59 +0800 |
|---|---|---|
| committer | abner chenc <chenguoqi@loongson.cn> | 2025-04-13 20:23:16 -0700 |
| commit | 953e8095893cd9efe44a90fd07ed3cfc87bfc109 (patch) | |
| tree | e1a7da81cb274a1d75bee084b26587ecaac3c0b8 | |
| parent | 18f770732fa01d5d5e1a529a5518d7b70f93d3c6 (diff) | |
| download | go-x-crypto-953e8095893cd9efe44a90fd07ed3cfc87bfc109.tar.xz | |
chacha20: add loong64 SIMD implementation
The performance of chacha20 has been greatly improved on 3A6000 and 3A5000.
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/chacha20
cpu: Loongson-3A6000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
ChaCha20/64 171.9n ± 0% 159.3n ± 0% -7.33% (p=0.000 n=20)
ChaCha20/256 592.2n ± 0% 142.8n ± 0% -75.89% (p=0.000 n=20)
ChaCha20/10x25 981.5n ± 0% 518.8n ± 0% -47.14% (p=0.000 n=20)
ChaCha20/4096 8.991µ ± 0% 1.732µ ± 0% -80.74% (p=0.000 n=20)
ChaCha20/100x40 10.651µ ± 0% 5.135µ ± 0% -51.79% (p=0.000 n=20)
ChaCha20/65536 143.43µ ± 0% 28.76µ ± 0% -79.95% (p=0.000 n=20)
ChaCha20/1000x65 146.17µ ± 0% 37.13µ ± 0% -74.60% (p=0.000 n=20)
geomean 5.721µ 1.962µ -65.70%
| bench.old | bench.new |
| B/s | B/s vs base |
ChaCha20/64 355.1Mi ± 0% 383.1Mi ± 0% +7.89% (p=0.000 n=20)
ChaCha20/256 412.2Mi ± 0% 1710.2Mi ± 0% +314.86% (p=0.000 n=20)
ChaCha20/10x25 242.9Mi ± 0% 459.6Mi ± 0% +89.19% (p=0.000 n=20)
ChaCha20/4096 434.5Mi ± 0% 2255.8Mi ± 0% +419.22% (p=0.000 n=20)
ChaCha20/100x40 358.1Mi ± 0% 742.9Mi ± 0% +107.44% (p=0.000 n=20)
ChaCha20/65536 435.8Mi ± 0% 2173.2Mi ± 0% +398.72% (p=0.000 n=20)
ChaCha20/1000x65 424.1Mi ± 0% 1669.4Mi ± 0% +293.64% (p=0.000 n=20)
geomean 373.9Mi 1.065Gi +191.55%
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/chacha20
cpu: Loongson-3A5000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
ChaCha20/64 234.5n ± 0% 295.8n ± 0% +26.14% (p=0.000 n=20)
ChaCha20/256 782.0n ± 0% 274.6n ± 0% -64.88% (p=0.000 n=20)
ChaCha20/10x25 1340.0n ± 0% 752.7n ± 0% -43.83% (p=0.000 n=20)
ChaCha20/4096 11.744µ ± 0% 3.455µ ± 0% -70.58% (p=0.000 n=20)
ChaCha20/100x40 14.151µ ± 0% 7.435µ ± 0% -47.46% (p=0.000 n=20)
ChaCha20/65536 188.05µ ± 0% 54.33µ ± 0% -71.11% (p=0.000 n=20)
ChaCha20/1000x65 191.44µ ± 0% 66.29µ ± 0% -65.37% (p=0.000 n=20)
geomean 7.604µ 3.436µ -54.81%
| bench.old | bench.new |
| B/s | B/s vs base |
ChaCha20/64 260.3Mi ± 0% 206.3Mi ± 0% -20.73% (p=0.000 n=20)
ChaCha20/256 312.2Mi ± 0% 888.9Mi ± 0% +184.75% (p=0.000 n=20)
ChaCha20/10x25 177.9Mi ± 0% 316.8Mi ± 0% +78.08% (p=0.000 n=20)
ChaCha20/4096 332.6Mi ± 0% 1130.8Mi ± 0% +239.95% (p=0.000 n=20)
ChaCha20/100x40 269.6Mi ± 0% 513.1Mi ± 0% +90.34% (p=0.000 n=20)
ChaCha20/65536 332.4Mi ± 0% 1150.5Mi ± 0% +246.16% (p=0.000 n=20)
ChaCha20/1000x65 323.8Mi ± 0% 935.2Mi ± 0% +188.81% (p=0.000 n=20)
geomean 281.3Mi 622.6Mi +121.31%
Change-Id: I5386f2029122076c1d22a04610567e3df23877cd
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/636257
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Carlos Amedee <carlos@golang.org>
| -rw-r--r-- | chacha20/chacha_loong64.go | 22 | ||||
| -rw-r--r-- | chacha20/chacha_loong64.s | 374 | ||||
| -rw-r--r-- | chacha20/chacha_noasm.go | 2 |
3 files changed, 397 insertions, 1 deletion
diff --git a/chacha20/chacha_loong64.go b/chacha20/chacha_loong64.go new file mode 100644 index 0000000..0d9547e --- /dev/null +++ b/chacha20/chacha_loong64.go @@ -0,0 +1,22 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego + +package chacha20 + +import "golang.org/x/sys/cpu" + +const bufSize = 256 // NOTE(review): equals the 256 bytes xorKeyStreamVX consumes per loop iteration; presumably the two must stay in sync - confirm + +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) // implemented in chacha_loong64.s + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { // dispatches between the assembly and generic implementations + if cpu.Loong64.HasLSX { // LSX reported by the CPU: use the SIMD assembly routine + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) + } else { // no LSX: fall back to the generic Go implementation + c.xorKeyStreamBlocksGeneric(dst, src) + } +} diff --git a/chacha20/chacha_loong64.s b/chacha20/chacha_loong64.s new file mode 100644 index 0000000..831ebf4 --- /dev/null +++ b/chacha20/chacha_loong64.s @@ -0,0 +1,374 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// derived from chacha_arm64.s + +//go:build gc && !purego + +#include "textflag.h" + +DATA ·constants+0x00(SB)/4, $0x61707865 // "expa" (little-endian bytes) +DATA ·constants+0x04(SB)/4, $0x3320646e // "nd 3" +DATA ·constants+0x08(SB)/4, $0x79622d32 // "2-by" +DATA ·constants+0x0c(SB)/4, $0x6b206574 // "te k" +GLOBL ·constants(SB), NOPTR|RODATA, $32 + +DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 +DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 +DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 +DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 +GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 // per-lane counter offsets 0, 1, 2, 3 + +#define NUM_ROUNDS 10 + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVV dst+0(FP), R4 // R4 = dst base pointer + MOVV src+24(FP), R5 // R5 = src base pointer + MOVV src_len+32(FP), R6 // R6 = len(src), bytes still to process + MOVV key+48(FP), R7 // R7 = key pointer + MOVV nonce+56(FP), R8 // R8 = nonce pointer + MOVV counter+64(FP), R9 // R9 = counter pointer + + MOVV $·constants(SB), R10 + MOVV $·incRotMatrix(SB), R11 + + MOVW (R9), R12 // R12 = current counter value + +loop: + MOVV $NUM_ROUNDS, R15 // R15 = remaining passes through the chacha loop (two rounds per pass) + // load the four 32-bit lane offsets from incRotMatrix; they are added to the counter below + VMOVQ (R11), V30 + + // load constants + // VLDREPL.W $0, R10, V0 + WORD $0x30200140 + // VLDREPL.W $1, R10, V1 + WORD $0x30200541 + // VLDREPL.W $2, R10, V2 + WORD $0x30200942 + // VLDREPL.W $3, R10, V3 + WORD $0x30200d43 + + // load keys + // VLDREPL.W $0, R7, V4 + WORD $0x302000e4 + // VLDREPL.W $1, R7, V5 + WORD $0x302004e5 + // VLDREPL.W $2, R7, V6 + WORD $0x302008e6 + // VLDREPL.W $3, R7, V7 + WORD $0x30200ce7 + // VLDREPL.W $4, R7, V8 + WORD $0x302010e8 + // VLDREPL.W $5, R7, V9 + WORD $0x302014e9 + // VLDREPL.W $6, R7, V10 + WORD $0x302018ea + // VLDREPL.W $7, R7, V11 + WORD $0x30201ceb + + // load counter + nonce + // VLDREPL.W $0, R9, V12 + WORD $0x3020012c + + // VLDREPL.W $0, R8, V13 + WORD $0x3020010d + // VLDREPL.W $1, R8, V14 + WORD $0x3020050e + // VLDREPL.W $2, R8, V15 + WORD $0x3020090f + + // update counter: add the lane offsets so each lane uses a distinct counter value + VADDW V30, V12, V12 + +chacha: + // column round: V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16) + VADDW V0, V4, V0 + VADDW V1, V5, V1 + VADDW V2, V6, V2 + VADDW V3, V7, 
V3 + VXORV V12, V0, V12 + VXORV V13, V1, V13 + VXORV V14, V2, V14 + VXORV V15, V3, V15 + VROTRW $16, V12, V12 + VROTRW $16, V13, V13 + VROTRW $16, V14, V14 + VROTRW $16, V15, V15 + + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12), done as a right rotation by 32-12 = 20 + VADDW V8, V12, V8 + VADDW V9, V13, V9 + VADDW V10, V14, V10 + VADDW V11, V15, V11 + VXORV V4, V8, V4 + VXORV V5, V9, V5 + VXORV V6, V10, V6 + VXORV V7, V11, V7 + VROTRW $20, V4, V4 + VROTRW $20, V5, V5 + VROTRW $20, V6, V6 + VROTRW $20, V7, V7 + + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8), done as a right rotation by 24 + VADDW V0, V4, V0 + VADDW V1, V5, V1 + VADDW V2, V6, V2 + VADDW V3, V7, V3 + VXORV V12, V0, V12 + VXORV V13, V1, V13 + VXORV V14, V2, V14 + VXORV V15, V3, V15 + VROTRW $24, V12, V12 + VROTRW $24, V13, V13 + VROTRW $24, V14, V14 + VROTRW $24, V15, V15 + + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7), done as a right rotation by 25 + VADDW V12, V8, V8 + VADDW V13, V9, V9 + VADDW V14, V10, V10 + VADDW V15, V11, V11 + VXORV V4, V8, V4 + VXORV V5, V9, V5 + VXORV V6, V10, V6 + VXORV V7, V11, V7 + VROTRW $25, V4, V4 + VROTRW $25, V5, V5 + VROTRW $25, V6, V6 + VROTRW $25, V7, V7 + + // diagonal round: V0..V3 += V5..V7, V4 + // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) + VADDW V0, V5, V0 + VADDW V1, V6, V1 + VADDW V2, V7, V2 + VADDW V3, V4, V3 + VXORV V15, V0, V15 + VXORV V12, V1, V12 + VXORV V13, V2, V13 + VXORV V14, V3, V14 + VROTRW $16, V15, V15 + VROTRW $16, V12, V12 + VROTRW $16, V13, V13 + VROTRW $16, V14, V14 + + // V10,V11,V8,V9 += V15,V12,V13,V14 + // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 12) + VADDW V10, V15, V10 + VADDW V11, V12, V11 + VADDW V8, V13, V8 + VADDW V9, V14, V9 + VXORV V5, V10, V5 + VXORV V6, V11, V6 + VXORV V7, V8, V7 + VXORV V4, V9, V4 + VROTRW $20, V5, V5 + VROTRW $20, V6, V6 + VROTRW $20, V7, V7 + VROTRW $20, V4, V4 + + // V0..V3 += V5..V7, V4 + // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 8) + VADDW V5, V0, V0 + VADDW V6, V1, V1 + VADDW V7, V2, V2 + VADDW V4, V3, V3 + VXORV V15, V0, V15 + VXORV 
V12, V1, V12 + VXORV V13, V2, V13 + VXORV V14, V3, V14 + VROTRW $24, V15, V15 + VROTRW $24, V12, V12 + VROTRW $24, V13, V13 + VROTRW $24, V14, V14 + + // V10,V11,V8,V9 += V15,V12,V13,V14 + // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 7) + VADDW V15, V10, V10 + VADDW V12, V11, V11 + VADDW V13, V8, V8 + VADDW V14, V9, V9 + VXORV V5, V10, V5 + VXORV V6, V11, V6 + VXORV V7, V8, V7 + VXORV V4, V9, V4 + VROTRW $25, V5, V5 + VROTRW $25, V6, V6 + VROTRW $25, V7, V7 + VROTRW $25, V4, V4 + + SUBV $1, R15 // one column round + one diagonal round completed + BNE R15, R0, chacha + + // load original constants + // VLDREPL.W $0, R10, V16 + WORD $0x30200150 + // VLDREPL.W $1, R10, V17 + WORD $0x30200551 + // VLDREPL.W $2, R10, V18 + WORD $0x30200952 + // VLDREPL.W $3, R10, V19 + WORD $0x30200d53 + + // load original keys + // VLDREPL.W $0, R7, V20 + WORD $0x302000f4 + // VLDREPL.W $1, R7, V21 + WORD $0x302004f5 + // VLDREPL.W $2, R7, V22 + WORD $0x302008f6 + // VLDREPL.W $3, R7, V23 + WORD $0x30200cf7 + // VLDREPL.W $4, R7, V24 + WORD $0x302010f8 + // VLDREPL.W $5, R7, V25 + WORD $0x302014f9 + // VLDREPL.W $6, R7, V26 + WORD $0x302018fa + // VLDREPL.W $7, R7, V27 + WORD $0x30201cfb + + // add back the initial state to generate the key stream + VADDW V30, V12, V12 // add the lane offsets to the counter word now, because V30 is overwritten by the nonce load below + VADDW V16, V0, V0 + VADDW V17, V1, V1 + VADDW V18, V2, V2 + VADDW V19, V3, V3 + + // load original counter + nonce + // VLDREPL.W $0, R9, V28 + WORD $0x3020013c + // VLDREPL.W $0, R8, V29 + WORD $0x3020011d + // VLDREPL.W $1, R8, V30 + WORD $0x3020051e + // VLDREPL.W $2, R8, V31 + WORD $0x3020091f + + VADDW V20, V4, V4 + VADDW V21, V5, V5 + VADDW V22, V6, V6 + VADDW V23, V7, V7 + VADDW V24, V8, V8 + VADDW V25, V9, V9 + VADDW V26, V10, V10 + VADDW V27, V11, V11 + VADDW V28, V12, V12 + VADDW V29, V13, V13 + VADDW V30, V14, V14 + VADDW V31, V15, V15 + + // shuffle: word interleaves followed by doubleword interleaves regroup the state so that each vector can be XORed with 16 contiguous src bytes + VILVLW V0, V1, V16 + VILVHW V0, V1, V17 + VILVLW V2, V3, V18 + VILVHW V2, V3, V19 + VILVLW V4, V5 ,V20 + VILVHW V4, V5, V21 + VILVLW V6, V7, 
V22 + VILVHW V6, V7, V23 + VILVLW V8, V9, V24 + VILVHW V8, V9, V25 + VILVLW V10, V11, V26 + VILVHW V10, V11, V27 + VILVLW V12, V13, V28 + VILVHW V12, V13, V29 + VILVLW V14, V15, V30 + VILVHW V14, V15, V31 + VILVLV V16, V18, V0 + VILVHV V16, V18, V4 + VILVLV V17, V19, V8 + VILVHV V17, V19, V12 + + // load src data from R5 (interleaved with the remaining shuffle steps) + VMOVQ 0(R5), V16 + VMOVQ 16(R5), V17 + VMOVQ 32(R5), V18 + VMOVQ 48(R5), V19 + + VILVLV V20, V22, V1 + VILVHV V20, V22, V5 + VILVLV V21, V23, V9 + VILVHV V21, V23, V13 + + VMOVQ 64(R5), V20 + VMOVQ 80(R5), V21 + VMOVQ 96(R5), V22 + VMOVQ 112(R5), V23 + + VILVLV V24, V26, V2 + VILVHV V24, V26, V6 + VILVLV V25, V27, V10 + VILVHV V25, V27, V14 + + VMOVQ 128(R5), V24 + VMOVQ 144(R5), V25 + VMOVQ 160(R5), V26 + VMOVQ 176(R5), V27 + + VILVLV V28, V30, V3 + VILVHV V28, V30, V7 + VILVLV V29, V31, V11 + VILVHV V29, V31, V15 + + VMOVQ 192(R5), V28 + VMOVQ 208(R5), V29 + VMOVQ 224(R5), V30 + VMOVQ 240(R5), V31 + + VXORV V0, V16, V16 + VXORV V1, V17, V17 + VXORV V2, V18, V18 + VXORV V3, V19, V19 + + VMOVQ V16, 0(R4) + VMOVQ V17, 16(R4) + VMOVQ V18, 32(R4) + VMOVQ V19, 48(R4) + + VXORV V4, V20, V20 + VXORV V5, V21, V21 + VXORV V6, V22, V22 + VXORV V7, V23, V23 + + VMOVQ V20, 64(R4) + VMOVQ V21, 80(R4) + VMOVQ V22, 96(R4) + VMOVQ V23, 112(R4) + + VXORV V8, V24, V24 + VXORV V9, V25, V25 + VXORV V10, V26, V26 + VXORV V11, V27, V27 + + VMOVQ V24, 128(R4) + VMOVQ V25, 144(R4) + VMOVQ V26, 160(R4) + VMOVQ V27, 176(R4) + + VXORV V12, V28, V28 + VXORV V13, V29, V29 + VXORV V14, V30, V30 + VXORV V15, V31, V31 + + VMOVQ V28, 192(R4) + VMOVQ V29, 208(R4) + VMOVQ V30, 224(R4) + VMOVQ V31, 240(R4) + + ADD $4, R12, R12 + MOVW R12, (R9) // update counter: store the value advanced by the 4 blocks just processed + + ADDV $256, R4, R4 + ADDV $256, R5, R5 + SUBV $256, R6, R6 + BNE R6, R0, loop // NOTE(review): exits only when R6 reaches exactly 0, so len(src) must be a nonzero multiple of 256 - confirm the caller guarantees this + + RET diff --git a/chacha20/chacha_noasm.go b/chacha20/chacha_noasm.go index c709b72..3853cc0 100644 --- a/chacha20/chacha_noasm.go +++ b/chacha20/chacha_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // 
license that can be found in the LICENSE file. -//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego +//go:build (!arm64 && !loong64 && !s390x && !ppc64 && !ppc64le) || !gc || purego package chacha20 |
