aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Michael Pratt <mpratt@google.com> 2025-04-15 02:22:27 -0700
committer: Gopher Robot <gobot@golang.org> 2025-04-16 06:50:29 -0700
commit: 51f005cfd443a6a8fe542c8c2e58ed138f1cbbe2 (patch)
tree: bc311661103a3a43c79c1cb88f06d0f8a9c1aa06
parent: 7c358664da2071dd8c46274b0e0ba68b11f796cd (diff)
download: go-x-crypto-51f005cfd443a6a8fe542c8c2e58ed138f1cbbe2.tar.xz
Revert "salsa20: add loong64 SIMD implementation"
This reverts CL 663375.

Reason for revert: Does not build on 1.23 or 1.24

For golang/go#73354.

Change-Id: I251d598423b83c01cc2e04ddf6f49ae14095fa7c
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/665535
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
-rw-r--r-- salsa20/salsa/salsa20_amd64_test.go (renamed from salsa20/salsa/salsa20_test.go) 2
-rw-r--r-- salsa20/salsa/salsa20_loong64.go 29
-rw-r--r-- salsa20/salsa/salsa20_loong64.s 482
-rw-r--r-- salsa20/salsa/salsa20_noasm.go 2
4 files changed, 2 insertions, 513 deletions
diff --git a/salsa20/salsa/salsa20_test.go b/salsa20/salsa/salsa20_amd64_test.go
index d6b29a3..fe14604 100644
--- a/salsa20/salsa/salsa20_test.go
+++ b/salsa20/salsa/salsa20_amd64_test.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (amd64 || loong64) && !purego && gc
+//go:build amd64 && !purego && gc
package salsa
diff --git a/salsa20/salsa/salsa20_loong64.go b/salsa20/salsa/salsa20_loong64.go
deleted file mode 100644
index 8c7d867..0000000
--- a/salsa20/salsa/salsa20_loong64.go
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build loong64 && !purego && gc
-
-package salsa
-
-import "golang.org/x/sys/cpu"
-
-// XORKeyStreamVX is implemented in salsa20_loong64.s.
-//
-//go:noescape
-func XORKeyStreamVX(out, in *byte, n uint64, nonce, key *byte)
-
-// XORKeyStream crypts bytes from in to out using the given key and counters.
-// In and out must overlap entirely or not at all. Counter
-// contains the raw salsa20 counter bytes (both nonce and block counter).
-func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
- if len(in) == 0 {
- return
- }
- _ = out[len(in)-1]
- if cpu.Loong64.HasLSX {
- XORKeyStreamVX(&out[0], &in[0], uint64(len(in)), &counter[0], &key[0])
- } else {
- genericXORKeyStream(out, in, counter, key)
- }
-}
diff --git a/salsa20/salsa/salsa20_loong64.s b/salsa20/salsa/salsa20_loong64.s
deleted file mode 100644
index dc4b501..0000000
--- a/salsa20/salsa/salsa20_loong64.s
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !purego && gc
-
-#include "textflag.h"
-
-DATA ·constants+0x00(SB)/4, $0x61707865
-DATA ·constants+0x04(SB)/4, $0x3320646e
-DATA ·constants+0x08(SB)/4, $0x79622d32
-DATA ·constants+0x0c(SB)/4, $0x6b206574
-GLOBL ·constants(SB), NOPTR|RODATA, $32
-
-#define NUM_ROUNDS 10
-
-// func XORKeyStreamVX(out, in *byte, n uint64, counter, key *byte)
-TEXT ·XORKeyStreamVX(SB), $0-40
- MOVV out+0(FP), R4
- MOVV in+8(FP), R5
- MOVV n+16(FP), R6
- MOVV counter+24(FP), R7
- MOVV key+32(FP), R8
- MOVV $·constants(SB), R10
-
- BGE R0, R6, ret
-
- MOVV 8(R7), R9 // counter[8:16]
-
-loop256:
- MOVV $NUM_ROUNDS, R15
- VXORV V30, V30, V30 // V30 = 0
-
- // load contants
- // VLDREPL.W $0, R10, V0
- WORD $0x30200140
- // VLDREPL.W $1, R10, V5
- WORD $0x30200545
- // VLDREPL.W $2, R10, V10
- WORD $0x3020094a
- // VLDREPL.W $3, R10, V15
- WORD $0x30200d4f
-
- // load keys
- // VLDREPL.W $0, R8, V1
- WORD $0x30200101
- // VLDREPL.W $1, R8, V2
- WORD $0x30200502
- // VLDREPL.W $2, R8, V3
- WORD $0x30200903
- // VLDREPL.W $3, R8, V4
- WORD $0x30200d04
- // VLDREPL.W $4, R8, V11
- WORD $0x3020110b
- // VLDREPL.W $5, R8, V12
- WORD $0x3020150c
- // VLDREPL.W $6, R8, V13
- WORD $0x3020190d
- // VLDREPL.W $7, R8, V14
- WORD $0x30201d0e
-
- // load and update counter
- // VLDREPL.W $0, R7, V6
- WORD $0x302000e6
- // VLDREPL.W $1, R7, V7
- WORD $0x302004e7
-
- ADDV $1, R9, R11
- ADDV $2, R9, R12
- ADDV $3, R9, R13
- VMOVQ R9, V8.W[0]
- VMOVQ R11, V8.W[1]
- VMOVQ R12, V8.W[2]
- VMOVQ R13, V8.W[3]
- SRLV $32, R9, R14
- SRLV $32, R11, R11
- SRLV $32, R12, R12
- SRLV $32, R13, R13
- VMOVQ R14, V9.W[0]
- VMOVQ R11, V9.W[1]
- VMOVQ R12, V9.W[2]
- VMOVQ R13, V9.W[3]
-
- // backup V8 and V9
- VADDV V8, V30, V24 // V21 = V8
- VADDV V9, V30, V25 // V22 = V9
-
-salsa20_256:
- VADDW V0, V12, V26
- VADDW V5, V1, V27
- VADDW V10, V6, V28
- VADDW V15, V11, V29
- VROTRW $25, V26, V26
- VROTRW $25, V27, V27
- VROTRW $25, V28, V28
- VROTRW $25, V29, V29
- VXORV V4, V26, V4
- VXORV V9, V27, V9
- VXORV V14, V28, V14
- VXORV V3, V29, V3
- VADDW V4, V0, V26
- VADDW V9, V5, V27
- VADDW V14, V10, V28
- VADDW V3, V15, V29
- VROTRW $23, V26, V26
- VROTRW $23, V27, V27
- VROTRW $23, V28, V28
- VROTRW $23, V29, V29
- VXORV V8, V26, V8
- VXORV V13, V27, V13
- VXORV V2, V28, V2
- VXORV V7, V29, V7
- VADDW V8, V4, V26
- VADDW V13, V9, V27
- VADDW V2, V14, V28
- VADDW V7, V3, V29
- VROTRW $19, V26, V26
- VROTRW $19, V27, V27
- VROTRW $19, V28, V28
- VROTRW $19, V29, V29
- VXORV V12, V26, V12
- VXORV V1, V27, V1
- VXORV V6, V28, V6
- VXORV V11, V29, V11
- VADDW V12, V8, V26
- VADDW V1, V13, V27
- VADDW V6, V2, V28
- VADDW V11, V7, V29
- VROTRW $14, V26, V26
- VROTRW $14, V27, V27
- VROTRW $14, V28, V28
- VROTRW $14, V29, V29
- VXORV V0, V26, V0
- VXORV V5, V27, V5
- VXORV V10, V28, V10
- VXORV V15, V29, V15
-
- VADDW V0, V3, V26
- VADDW V5, V4, V27
- VADDW V10, V9, V28
- VADDW V15, V14, V29
- VROTRW $25, V26, V26
- VROTRW $25, V27, V27
- VROTRW $25, V28, V28
- VROTRW $25, V29, V29
- VXORV V1, V26, V1
- VXORV V6, V27, V6
- VXORV V11, V28, V11
- VXORV V12, V29, V12
- VADDW V1, V0, V26
- VADDW V6, V5, V27
- VADDW V11, V10, V28
- VADDW V12, V15, V29
- VROTRW $23, V26, V26
- VROTRW $23, V27, V27
- VROTRW $23, V28, V28
- VROTRW $23, V29, V29
- VXORV V2, V26, V2
- VXORV V7, V27, V7
- VXORV V8, V28, V8
- VXORV V13, V29, V13
- VADDW V2, V1, V26
- VADDW V7, V6, V27
- VADDW V8, V11, V28
- VADDW V13, V12, V29
- VROTRW $19, V26, V26
- VROTRW $19, V27, V27
- VROTRW $19, V28, V28
- VROTRW $19, V29, V29
- VXORV V3, V26, V3
- VXORV V4, V27, V4
- VXORV V9, V28, V9
- VXORV V14, V29, V14
- VADDW V3, V2, V26
- VADDW V4, V7, V27
- VADDW V9, V8, V28
- VADDW V14, V13, V29
- VROTRW $14, V26, V26
- VROTRW $14, V27, V27
- VROTRW $14, V28, V28
- VROTRW $14, V29, V29
- VXORV V0, V26, V0
- VXORV V5, V27, V5
- VXORV V10, V28, V10
- VXORV V15, V29, V15
-
- SUBV $1, R15
- BNE R15, R0, salsa20_256
-
- // load origin contants
- // VLDREPL.W $0, R10, V16
- WORD $0x30200150
- // VLDREPL.W $1, R10, V21
- WORD $0x30200555
- // VLDREPL.W $2, R10, V26
- WORD $0x3020095a
- // VLDREPL.W $3, R10, V31
- WORD $0x30200d5f
-
- // load origin keys
- // VLDREPL.W $0, R8, V17
- WORD $0x30200111
- // VLDREPL.W $1, R8, V18
- WORD $0x30200512
- // VLDREPL.W $2, R8, V19
- WORD $0x30200913
- // VLDREPL.W $3, R8, V20
- WORD $0x30200d14
- // VLDREPL.W $4, R8, V27
- WORD $0x3020111b
- // VLDREPL.W $5, R8, V28
- WORD $0x3020151c
- // VLDREPL.W $6, R8, V29
- WORD $0x3020191d
- // VLDREPL.W $7, R8, V30
- WORD $0x30201d1e
-
- // load origin counter
- // VLDREPL.W $0, R7, V22
- WORD $0x302000f6
- // VLDREPL.W $1, R7, V23
- WORD $0x302004f7
-
- // add back the initial state to generate the key stream
- VADDW V0, V16, V0
- VADDW V1, V17, V1
- VADDW V2, V18, V2
- VADDW V3, V19, V3
- VADDW V4, V20, V4
- VADDW V5, V21, V5
- VADDW V6, V22, V6
- VADDW V7, V23, V7
- VADDW V8, V24, V8
- VADDW V9, V25, V9
- VADDW V10, V26, V10
- VADDW V11, V27, V11
- VADDW V12, V28, V12
- VADDW V13, V29, V13
- VADDW V14, V30, V14
- VADDW V15, V31, V15
-
- // shuffle
- VILVLW V0, V1, V16
- VILVHW V0, V1, V17
- VILVLW V2, V3, V18
- VILVHW V2, V3, V19
- VILVLW V4, V5 ,V20
- VILVHW V4, V5, V21
- VILVLW V6, V7, V22
- VILVHW V6, V7, V23
- VILVLW V8, V9, V24
- VILVHW V8, V9, V25
- VILVLW V10, V11, V26
- VILVHW V10, V11, V27
- VILVLW V12, V13, V28
- VILVHW V12, V13, V29
- VILVLW V14, V15, V30
- VILVHW V14, V15, V31
- VILVLV V16, V18, V0
- VILVHV V16, V18, V4
- VILVLV V17, V19, V8
- VILVHV V17, V19, V12
- VILVLV V20, V22, V1
- VILVHV V20, V22, V5
- VILVLV V21, V23, V9
- VILVHV V21, V23, V13
- VILVLV V24, V26, V2
- VILVHV V24, V26, V6
- VILVLV V25, V27, V10
- VILVHV V25, V27, V14
- VILVLV V28, V30, V3
- VILVHV V28, V30, V7
- VILVLV V29, V31, V11
- VILVHV V29, V31, V15
-
- SGTU $256, R6, R11
- BNE R11, R0, less_than_256
-
- // load src data from R5
- VMOVQ 0(R5), V16
- VMOVQ 16(R5), V17
- VMOVQ 32(R5), V18
- VMOVQ 48(R5), V19
- VMOVQ 64(R5), V20
- VMOVQ 80(R5), V21
- VMOVQ 96(R5), V22
- VMOVQ 112(R5), V23
- VMOVQ 128(R5), V24
- VMOVQ 144(R5), V25
- VMOVQ 160(R5), V26
- VMOVQ 176(R5), V27
- VMOVQ 192(R5), V28
- VMOVQ 208(R5), V29
- VMOVQ 224(R5), V30
- VMOVQ 240(R5), V31
-
- VXORV V0, V16, V16
- VXORV V1, V17, V17
- VXORV V2, V18, V18
- VXORV V3, V19, V19
- VXORV V4, V20, V20
- VXORV V5, V21, V21
- VXORV V6, V22, V22
- VXORV V7, V23, V23
- VXORV V8, V24, V24
- VXORV V9, V25, V25
- VXORV V10, V26, V26
- VXORV V11, V27, V27
- VXORV V12, V28, V28
- VXORV V13, V29, V29
- VXORV V14, V30, V30
- VXORV V15, V31, V31
-
- VMOVQ V16, 0(R4)
- VMOVQ V17, 16(R4)
- VMOVQ V18, 32(R4)
- VMOVQ V19, 48(R4)
- VMOVQ V20, 64(R4)
- VMOVQ V21, 80(R4)
- VMOVQ V22, 96(R4)
- VMOVQ V23, 112(R4)
- VMOVQ V24, 128(R4)
- VMOVQ V25, 144(R4)
- VMOVQ V26, 160(R4)
- VMOVQ V27, 176(R4)
- VMOVQ V28, 192(R4)
- VMOVQ V29, 208(R4)
- VMOVQ V30, 224(R4)
- VMOVQ V31, 240(R4)
-
- ADDV $4, R9, R9 // update counter
-
- SUBV $256, R6, R6
- ADDV $256, R4, R4
- ADDV $256, R5, R5
- SGTU $256, R6, R11
- BEQ R11, R0, loop256
- BEQ R6, R0, ret
-
-less_than_256:
- VXORV V30, V30, V30 // V30=0
- SGTU $128, R6, R11
- BNE R11, R0, less_than_128
- SUBV $128, R6
- VMOVQ (R5), V16
- VMOVQ 16(R5), V17
- VMOVQ 32(R5), V18
- VMOVQ 48(R5), V19
- VMOVQ 64(R5), V20
- VMOVQ 80(R5), V21
- VMOVQ 96(R5), V22
- VMOVQ 112(R5), V23
- VXORV V0, V16, V16
- VXORV V1, V17, V17
- VXORV V2, V18, V18
- VXORV V3, V19, V19
- VXORV V4, V20, V20
- VXORV V5, V21, V21
- VXORV V6, V22, V22
- VXORV V7, V23, V23
- VMOVQ V16, (R4)
- VMOVQ V17, 16(R4)
- VMOVQ V18, 32(R4)
- VMOVQ V19, 48(R4)
- VMOVQ V20, 64(R4)
- VMOVQ V21, 80(R4)
- VMOVQ V22, 96(R4)
- VMOVQ V23, 112(R4)
- BEQ R6, R0, ret
- ADDV $128, R5, R5
- ADDV $128, R4, R4
- VADDV V8, V30, V0
- VADDV V9, V30, V1
- VADDV V10, V30, V2
- VADDV V11, V30, V3
- VADDV V12, V30, V4
- VADDV V13, V30, V5
- VADDV V14, V30, V6
- VADDV V15, V30, V7
-
-less_than_128:
- SGTU $64, R6, R11
- BNE R11, R0, less_than_64
- SUBV $64, R6
- VMOVQ (R5), V16
- VMOVQ 16(R5), V17
- VMOVQ 32(R5), V18
- VMOVQ 48(R5), V19
- VXORV V0, V16, V16
- VXORV V1, V17, V17
- VXORV V2, V18, V18
- VXORV V3, V19, V19
- VMOVQ V16, (R4)
- VMOVQ V17, 16(R4)
- VMOVQ V18, 32(R4)
- VMOVQ V19, 48(R4)
- BEQ R6, R0, ret
- ADDV $64, R5
- ADDV $64, R4
- VADDV V4, V30, V0
- VADDV V5, V30, V1
- VADDV V6, V30, V2
- VADDV V7, V30, V3
-
-less_than_64:
- SGTU $32, R6, R11
- BNE R11, R0, less_than_32
- SUBV $32, R6
- VMOVQ (R5), V16
- VMOVQ 16(R5), V17
- VXORV V0, V16, V16
- VXORV V1, V17, V17
- VMOVQ V16, (R4)
- VMOVQ V17, 16(R4)
- BEQ R6, R0, ret
- ADDV $32, R5
- ADDV $32, R4
- VADDV V2, V30, V0
- VADDV V3, V30, V1
-
-less_than_32:
- SGTU $16, R6, R11
- BNE R11, R0, less_than_16
- SUBV $16, R6
- VMOVQ (R5), V16
- VXORV V16, V0, V16
- VMOVQ V16, (R4)
- BEQ R6, R0, ret
- ADDV $16, R5
- ADDV $16, R4
- VADDV V1, V30, V0
-
-less_than_16:
- SGTU $8, R6, R11
- BNE R11, R0, less_than_8
- SUBV $8, R6
- VMOVQ V0.V[0], R11
- VMOVQ V0.V[1], R13
- MOVV (R5), R12
- XOR R11, R12, R12
- MOVV R12, (R4)
- BEQ R6, R0, ret
- ADDV $8, R5
- ADDV $8, R4
- VMOVQ R13, V0.V[0]
-
-less_than_8:
- SGTU $4, R6, R11
- BNE R11, R0, less_than_4
- SUBV $4, R6
- VMOVQ V0.W[0], R11
- VMOVQ V0.W[1], R13
- MOVWU (R5), R12
- XOR R11, R12, R12
- MOVWU R12, (R4)
- BEQ R6, R0, ret
- ADDV $4, R5
- ADDV $4, R4
- VMOVQ R13, V0.W[0]
-
-less_than_4:
- SGTU $2, R6, R11
- BNE R11, R0, less_than_2
- SUBV $2, R6
- VMOVQ V0.H[0], R11
- VMOVQ V0.H[1], R13
- MOVHU (R5), R12
- XOR R11, R12, R12
- MOVHU R12, (R4)
- BEQ R6, R0, ret
- ADDV $2, R5
- ADDV $2, R4
- VMOVQ R13, V0.H[0]
-
-less_than_2:
- VMOVQ V0.B[0], R11
- MOVBU (R5), R12
- XOR R11, R12, R12
- MOVBU R12, (R4)
-
-ret:
- RET
diff --git a/salsa20/salsa/salsa20_noasm.go b/salsa20/salsa/salsa20_noasm.go
index 64e262d..9448760 100644
--- a/salsa20/salsa/salsa20_noasm.go
+++ b/salsa20/salsa/salsa20_noasm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (!amd64 && !loong64) || purego || !gc
+//go:build !amd64 || purego || !gc
package salsa