aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- chacha20/chacha_loong64.go 22
-rw-r--r-- chacha20/chacha_loong64.s 374
-rw-r--r-- chacha20/chacha_noasm.go 2
3 files changed, 397 insertions, 1 deletion
diff --git a/chacha20/chacha_loong64.go b/chacha20/chacha_loong64.go
new file mode 100644
index 0000000..0d9547e
--- /dev/null
+++ b/chacha20/chacha_loong64.go
@@ -0,0 +1,22 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+package chacha20
+
+import "golang.org/x/sys/cpu"
+
+const bufSize = 256 // equals the 256 bytes xorKeyStreamVX handles per loop pass in chacha_loong64.s
+
+//go:noescape
+func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) // body is TEXT ·xorKeyStreamVX in chacha_loong64.s
+
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+	if !cpu.Loong64.HasLSX { // CPU reports no LSX: take the generic Go implementation
+		c.xorKeyStreamBlocksGeneric(dst, src)
+		return
+	}
+	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) // LSX assembly path
+}
diff --git a/chacha20/chacha_loong64.s b/chacha20/chacha_loong64.s
new file mode 100644
index 0000000..831ebf4
--- /dev/null
+++ b/chacha20/chacha_loong64.s
@@ -0,0 +1,374 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// derived from chacha_arm64.s
+
+//go:build gc && !purego
+
+#include "textflag.h"
+
+DATA ·constants+0x00(SB)/4, $0x61707865 // ASCII "expa" when stored little-endian
+DATA ·constants+0x04(SB)/4, $0x3320646e // ASCII "nd 3" when stored little-endian
+DATA ·constants+0x08(SB)/4, $0x79622d32 // ASCII "2-by" when stored little-endian
+DATA ·constants+0x0c(SB)/4, $0x6b206574 // ASCII "te k" when stored little-endian
+GLOBL ·constants(SB), NOPTR|RODATA, $32 // declared as 32 bytes; the DATA lines above initialize the first 16
+
+DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 // counter offset for lane 0
+DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 // counter offset for lane 1
+DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 // counter offset for lane 2
+DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 // counter offset for lane 3
+GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 // declared as 32 bytes; the DATA lines above initialize the first 16
+
+#define NUM_ROUNDS 10
+
+// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
+TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 // XORs src with ChaCha20 keystream into dst, 256 bytes per loop pass, and stores the advanced counter
+ MOVV dst+0(FP), R4 // R4 = dst data pointer
+ MOVV src+24(FP), R5 // R5 = src data pointer
+ MOVV src_len+32(FP), R6 // R6 = len(src), counted down by 256 per pass
+ MOVV key+48(FP), R7 // R7 = key pointer
+ MOVV nonce+56(FP), R8 // R8 = nonce pointer
+ MOVV counter+64(FP), R9 // R9 = counter pointer
+
+ MOVV $·constants(SB), R10 // R10 = address of the constants table
+ MOVV $·incRotMatrix(SB), R11 // R11 = address of the per-lane counter offsets
+
+ MOVW (R9), R12 // R12 = scalar copy of the 32-bit counter
+
+loop:
+ MOVV $NUM_ROUNDS, R15 // R15 = passes left through the chacha loop below
+ // load the four 32-bit lane offsets from incRotMatrix; they are added to the counter below
+ VMOVQ (R11), V30
+
+ // load constants (each VLDREPL.W is emitted as a raw WORD; the comment above it names the intended instruction)
+ // VLDREPL.W $0, R10, V0
+ WORD $0x30200140
+ // VLDREPL.W $1, R10, V1
+ WORD $0x30200541
+ // VLDREPL.W $2, R10, V2
+ WORD $0x30200942
+ // VLDREPL.W $3, R10, V3
+ WORD $0x30200d43
+
+ // load the eight key words into V4..V11
+ // VLDREPL.W $0, R7, V4
+ WORD $0x302000e4
+ // VLDREPL.W $1, R7, V5
+ WORD $0x302004e5
+ // VLDREPL.W $2, R7, V6
+ WORD $0x302008e6
+ // VLDREPL.W $3, R7, V7
+ WORD $0x30200ce7
+ // VLDREPL.W $4, R7, V8
+ WORD $0x302010e8
+ // VLDREPL.W $5, R7, V9
+ WORD $0x302014e9
+ // VLDREPL.W $6, R7, V10
+ WORD $0x302018ea
+ // VLDREPL.W $7, R7, V11
+ WORD $0x30201ceb
+
+ // load the counter into V12 and the three nonce words into V13..V15
+ // VLDREPL.W $0, R9, V12
+ WORD $0x3020012c
+
+ // VLDREPL.W $0, R8, V13
+ WORD $0x3020010d
+ // VLDREPL.W $1, R8, V14
+ WORD $0x3020050e
+ // VLDREPL.W $2, R8, V15
+ WORD $0x3020090f
+
+ // add the per-lane offsets from V30 to the counter in V12
+ VADDW V30, V12, V12
+
+chacha:
+ // column round: V0..V3 += V4..V7
+ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
+ VADDW V0, V4, V0
+ VADDW V1, V5, V1
+ VADDW V2, V6, V2
+ VADDW V3, V7, V3
+ VXORV V12, V0, V12
+ VXORV V13, V1, V13
+ VXORV V14, V2, V14
+ VXORV V15, V3, V15
+ VROTRW $16, V12, V12 // rotate right by 16 == rotate left by 16 on 32-bit lanes
+ VROTRW $16, V13, V13
+ VROTRW $16, V14, V14
+ VROTRW $16, V15, V15
+
+ // V8..V11 += V12..V15
+ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
+ VADDW V8, V12, V8
+ VADDW V9, V13, V9
+ VADDW V10, V14, V10
+ VADDW V11, V15, V11
+ VXORV V4, V8, V4
+ VXORV V5, V9, V5
+ VXORV V6, V10, V6
+ VXORV V7, V11, V7
+ VROTRW $20, V4, V4 // rotate right by 20 == rotate left by 12
+ VROTRW $20, V5, V5
+ VROTRW $20, V6, V6
+ VROTRW $20, V7, V7
+
+ // V0..V3 += V4..V7
+ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
+ VADDW V0, V4, V0
+ VADDW V1, V5, V1
+ VADDW V2, V6, V2
+ VADDW V3, V7, V3
+ VXORV V12, V0, V12
+ VXORV V13, V1, V13
+ VXORV V14, V2, V14
+ VXORV V15, V3, V15
+ VROTRW $24, V12, V12 // rotate right by 24 == rotate left by 8
+ VROTRW $24, V13, V13
+ VROTRW $24, V14, V14
+ VROTRW $24, V15, V15
+
+ // V8..V11 += V12..V15
+ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
+ VADDW V12, V8, V8
+ VADDW V13, V9, V9
+ VADDW V14, V10, V10
+ VADDW V15, V11, V11
+ VXORV V4, V8, V4
+ VXORV V5, V9, V5
+ VXORV V6, V10, V6
+ VXORV V7, V11, V7
+ VROTRW $25, V4, V4 // rotate right by 25 == rotate left by 7
+ VROTRW $25, V5, V5
+ VROTRW $25, V6, V6
+ VROTRW $25, V7, V7
+
+ // diagonal round: V0..V3 += V5..V7, V4
+ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
+ VADDW V0, V5, V0
+ VADDW V1, V6, V1
+ VADDW V2, V7, V2
+ VADDW V3, V4, V3
+ VXORV V15, V0, V15
+ VXORV V12, V1, V12
+ VXORV V13, V2, V13
+ VXORV V14, V3, V14
+ VROTRW $16, V15, V15
+ VROTRW $16, V12, V12
+ VROTRW $16, V13, V13
+ VROTRW $16, V14, V14
+
+ // V10,V11,V8,V9 += V15,V12,V13,V14
+ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 12)
+ VADDW V10, V15, V10
+ VADDW V11, V12, V11
+ VADDW V8, V13, V8
+ VADDW V9, V14, V9
+ VXORV V5, V10, V5
+ VXORV V6, V11, V6
+ VXORV V7, V8, V7
+ VXORV V4, V9, V4
+ VROTRW $20, V5, V5
+ VROTRW $20, V6, V6
+ VROTRW $20, V7, V7
+ VROTRW $20, V4, V4
+
+ // V0..V3 += V5..V7, V4
+ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 8)
+ VADDW V5, V0, V0
+ VADDW V6, V1, V1
+ VADDW V7, V2, V2
+ VADDW V4, V3, V3
+ VXORV V15, V0, V15
+ VXORV V12, V1, V12
+ VXORV V13, V2, V13
+ VXORV V14, V3, V14
+ VROTRW $24, V15, V15
+ VROTRW $24, V12, V12
+ VROTRW $24, V13, V13
+ VROTRW $24, V14, V14
+
+ // V10,V11,V8,V9 += V15,V12,V13,V14
+ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 7)
+ VADDW V15, V10, V10
+ VADDW V12, V11, V11
+ VADDW V13, V8, V8
+ VADDW V14, V9, V9
+ VXORV V5, V10, V5
+ VXORV V6, V11, V6
+ VXORV V7, V8, V7
+ VXORV V4, V9, V4
+ VROTRW $25, V5, V5
+ VROTRW $25, V6, V6
+ VROTRW $25, V7, V7
+ VROTRW $25, V4, V4
+
+ SUBV $1, R15 // one column+diagonal pass finished
+ BNE R15, R0, chacha // repeat until R15 reaches zero
+
+ // reload the original constants into V16..V19
+ // VLDREPL.W $0, R10, V16
+ WORD $0x30200150
+ // VLDREPL.W $1, R10, V17
+ WORD $0x30200551
+ // VLDREPL.W $2, R10, V18
+ WORD $0x30200952
+ // VLDREPL.W $3, R10, V19
+ WORD $0x30200d53
+
+ // reload the original key words into V20..V27
+ // VLDREPL.W $0, R7, V20
+ WORD $0x302000f4
+ // VLDREPL.W $1, R7, V21
+ WORD $0x302004f5
+ // VLDREPL.W $2, R7, V22
+ WORD $0x302008f6
+ // VLDREPL.W $3, R7, V23
+ WORD $0x30200cf7
+ // VLDREPL.W $4, R7, V24
+ WORD $0x302010f8
+ // VLDREPL.W $5, R7, V25
+ WORD $0x302014f9
+ // VLDREPL.W $6, R7, V26
+ WORD $0x302018fa
+ // VLDREPL.W $7, R7, V27
+ WORD $0x30201cfb
+
+ // add back the initial state to generate the key stream
+ VADDW V30, V12, V12 // add the per-lane counter offsets now: V30 is overwritten by the nonce reload below
+ VADDW V16, V0, V0
+ VADDW V17, V1, V1
+ VADDW V18, V2, V2
+ VADDW V19, V3, V3
+
+ // reload the original counter and nonce into V28..V31
+ // VLDREPL.W $0, R9, V28
+ WORD $0x3020013c
+ // VLDREPL.W $0, R8, V29
+ WORD $0x3020011d
+ // VLDREPL.W $1, R8, V30
+ WORD $0x3020051e
+ // VLDREPL.W $2, R8, V31
+ WORD $0x3020091f
+
+ VADDW V20, V4, V4
+ VADDW V21, V5, V5
+ VADDW V22, V6, V6
+ VADDW V23, V7, V7
+ VADDW V24, V8, V8
+ VADDW V25, V9, V9
+ VADDW V26, V10, V10
+ VADDW V27, V11, V11
+ VADDW V28, V12, V12
+ VADDW V29, V13, V13
+ VADDW V30, V14, V14
+ VADDW V31, V15, V15
+
+ // shuffle: regroup the state so V0..V3 cover output bytes 0-63, V4..V7 bytes 64-127, V8..V11 bytes 128-191, V12..V15 bytes 192-255
+ VILVLW V0, V1, V16
+ VILVHW V0, V1, V17
+ VILVLW V2, V3, V18
+ VILVHW V2, V3, V19
+ VILVLW V4, V5 ,V20
+ VILVHW V4, V5, V21
+ VILVLW V6, V7, V22
+ VILVHW V6, V7, V23
+ VILVLW V8, V9, V24
+ VILVHW V8, V9, V25
+ VILVLW V10, V11, V26
+ VILVHW V10, V11, V27
+ VILVLW V12, V13, V28
+ VILVHW V12, V13, V29
+ VILVLW V14, V15, V30
+ VILVHW V14, V15, V31
+ VILVLV V16, V18, V0
+ VILVHV V16, V18, V4
+ VILVLV V17, V19, V8
+ VILVHV V17, V19, V12
+
+ // load 256 bytes of src through R5, interleaved with the remaining 64-bit shuffles
+ VMOVQ 0(R5), V16
+ VMOVQ 16(R5), V17
+ VMOVQ 32(R5), V18
+ VMOVQ 48(R5), V19
+
+ VILVLV V20, V22, V1
+ VILVHV V20, V22, V5
+ VILVLV V21, V23, V9
+ VILVHV V21, V23, V13
+
+ VMOVQ 64(R5), V20
+ VMOVQ 80(R5), V21
+ VMOVQ 96(R5), V22
+ VMOVQ 112(R5), V23
+
+ VILVLV V24, V26, V2
+ VILVHV V24, V26, V6
+ VILVLV V25, V27, V10
+ VILVHV V25, V27, V14
+
+ VMOVQ 128(R5), V24
+ VMOVQ 144(R5), V25
+ VMOVQ 160(R5), V26
+ VMOVQ 176(R5), V27
+
+ VILVLV V28, V30, V3
+ VILVHV V28, V30, V7
+ VILVLV V29, V31, V11
+ VILVHV V29, V31, V15
+
+ VMOVQ 192(R5), V28
+ VMOVQ 208(R5), V29
+ VMOVQ 224(R5), V30
+ VMOVQ 240(R5), V31
+
+ VXORV V0, V16, V16 // keystream XOR src, 16 bytes per vector
+ VXORV V1, V17, V17
+ VXORV V2, V18, V18
+ VXORV V3, V19, V19
+
+ VMOVQ V16, 0(R4) // store the result through the dst pointer
+ VMOVQ V17, 16(R4)
+ VMOVQ V18, 32(R4)
+ VMOVQ V19, 48(R4)
+
+ VXORV V4, V20, V20
+ VXORV V5, V21, V21
+ VXORV V6, V22, V22
+ VXORV V7, V23, V23
+
+ VMOVQ V20, 64(R4)
+ VMOVQ V21, 80(R4)
+ VMOVQ V22, 96(R4)
+ VMOVQ V23, 112(R4)
+
+ VXORV V8, V24, V24
+ VXORV V9, V25, V25
+ VXORV V10, V26, V26
+ VXORV V11, V27, V27
+
+ VMOVQ V24, 128(R4)
+ VMOVQ V25, 144(R4)
+ VMOVQ V26, 160(R4)
+ VMOVQ V27, 176(R4)
+
+ VXORV V12, V28, V28
+ VXORV V13, V29, V29
+ VXORV V14, V30, V30
+ VXORV V15, V31, V31
+
+ VMOVQ V28, 192(R4)
+ VMOVQ V29, 208(R4)
+ VMOVQ V30, 224(R4)
+ VMOVQ V31, 240(R4)
+
+ ADD $4, R12, R12 // four counter values (lanes +0..+3) were used this pass
+ MOVW R12, (R9) // store the advanced counter back through the counter pointer
+
+ ADDV $256, R4, R4 // advance dst by 256 bytes
+ ADDV $256, R5, R5 // advance src by 256 bytes
+ SUBV $256, R6, R6 // 256 fewer bytes remain
+ BNE R6, R0, loop // NOTE(review): exits only when R6 hits exactly zero; assumes len(src) is a nonzero multiple of 256 - confirm against callers
+
+ RET
diff --git a/chacha20/chacha_noasm.go b/chacha20/chacha_noasm.go
index c709b72..3853cc0 100644
--- a/chacha20/chacha_noasm.go
+++ b/chacha20/chacha_noasm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
+//go:build (!arm64 && !loong64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
package chacha20