diff options
| author | Michael Pratt <mpratt@google.com> | 2025-04-15 02:22:27 -0700 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2025-04-16 06:50:29 -0700 |
| commit | 51f005cfd443a6a8fe542c8c2e58ed138f1cbbe2 (patch) | |
| tree | bc311661103a3a43c79c1cb88f06d0f8a9c1aa06 | |
| parent | 7c358664da2071dd8c46274b0e0ba68b11f796cd (diff) | |
| download | go-x-crypto-51f005cfd443a6a8fe542c8c2e58ed138f1cbbe2.tar.xz | |
Revert "salsa20: add loong64 SIMD implementation"
This reverts CL 663375.
Reason for revert: Does not build with Go 1.23 or Go 1.24.
For golang/go#73354.
Change-Id: I251d598423b83c01cc2e04ddf6f49ae14095fa7c
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/665535
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
| -rw-r--r-- | salsa20/salsa/salsa20_amd64_test.go (renamed from salsa20/salsa/salsa20_test.go) | 2 | ||||
| -rw-r--r-- | salsa20/salsa/salsa20_loong64.go | 29 | ||||
| -rw-r--r-- | salsa20/salsa/salsa20_loong64.s | 482 | ||||
| -rw-r--r-- | salsa20/salsa/salsa20_noasm.go | 2 |
4 files changed, 2 insertions, 513 deletions
diff --git a/salsa20/salsa/salsa20_test.go b/salsa20/salsa/salsa20_amd64_test.go index d6b29a3..fe14604 100644 --- a/salsa20/salsa/salsa20_test.go +++ b/salsa20/salsa/salsa20_amd64_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (amd64 || loong64) && !purego && gc +//go:build amd64 && !purego && gc package salsa diff --git a/salsa20/salsa/salsa20_loong64.go b/salsa20/salsa/salsa20_loong64.go deleted file mode 100644 index 8c7d867..0000000 --- a/salsa20/salsa/salsa20_loong64.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build loong64 && !purego && gc - -package salsa - -import "golang.org/x/sys/cpu" - -// XORKeyStreamVX is implemented in salsa20_loong64.s. -// -//go:noescape -func XORKeyStreamVX(out, in *byte, n uint64, nonce, key *byte) - -// XORKeyStream crypts bytes from in to out using the given key and counters. -// In and out must overlap entirely or not at all. Counter -// contains the raw salsa20 counter bytes (both nonce and block counter). -func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) { - if len(in) == 0 { - return - } - _ = out[len(in)-1] - if cpu.Loong64.HasLSX { - XORKeyStreamVX(&out[0], &in[0], uint64(len(in)), &counter[0], &key[0]) - } else { - genericXORKeyStream(out, in, counter, key) - } -} diff --git a/salsa20/salsa/salsa20_loong64.s b/salsa20/salsa/salsa20_loong64.s deleted file mode 100644 index dc4b501..0000000 --- a/salsa20/salsa/salsa20_loong64.s +++ /dev/null @@ -1,482 +0,0 @@ -// Copyright 2025 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -//go:build !purego && gc - -#include "textflag.h" - -DATA ·constants+0x00(SB)/4, $0x61707865 -DATA ·constants+0x04(SB)/4, $0x3320646e -DATA ·constants+0x08(SB)/4, $0x79622d32 -DATA ·constants+0x0c(SB)/4, $0x6b206574 -GLOBL ·constants(SB), NOPTR|RODATA, $32 - -#define NUM_ROUNDS 10 - -// func XORKeyStreamVX(out, in *byte, n uint64, counter, key *byte) -TEXT ·XORKeyStreamVX(SB), $0-40 - MOVV out+0(FP), R4 - MOVV in+8(FP), R5 - MOVV n+16(FP), R6 - MOVV counter+24(FP), R7 - MOVV key+32(FP), R8 - MOVV $·constants(SB), R10 - - BGE R0, R6, ret - - MOVV 8(R7), R9 // counter[8:16] - -loop256: - MOVV $NUM_ROUNDS, R15 - VXORV V30, V30, V30 // V30 = 0 - - // load contants - // VLDREPL.W $0, R10, V0 - WORD $0x30200140 - // VLDREPL.W $1, R10, V5 - WORD $0x30200545 - // VLDREPL.W $2, R10, V10 - WORD $0x3020094a - // VLDREPL.W $3, R10, V15 - WORD $0x30200d4f - - // load keys - // VLDREPL.W $0, R8, V1 - WORD $0x30200101 - // VLDREPL.W $1, R8, V2 - WORD $0x30200502 - // VLDREPL.W $2, R8, V3 - WORD $0x30200903 - // VLDREPL.W $3, R8, V4 - WORD $0x30200d04 - // VLDREPL.W $4, R8, V11 - WORD $0x3020110b - // VLDREPL.W $5, R8, V12 - WORD $0x3020150c - // VLDREPL.W $6, R8, V13 - WORD $0x3020190d - // VLDREPL.W $7, R8, V14 - WORD $0x30201d0e - - // load and update counter - // VLDREPL.W $0, R7, V6 - WORD $0x302000e6 - // VLDREPL.W $1, R7, V7 - WORD $0x302004e7 - - ADDV $1, R9, R11 - ADDV $2, R9, R12 - ADDV $3, R9, R13 - VMOVQ R9, V8.W[0] - VMOVQ R11, V8.W[1] - VMOVQ R12, V8.W[2] - VMOVQ R13, V8.W[3] - SRLV $32, R9, R14 - SRLV $32, R11, R11 - SRLV $32, R12, R12 - SRLV $32, R13, R13 - VMOVQ R14, V9.W[0] - VMOVQ R11, V9.W[1] - VMOVQ R12, V9.W[2] - VMOVQ R13, V9.W[3] - - // backup V8 and V9 - VADDV V8, V30, V24 // V21 = V8 - VADDV V9, V30, V25 // V22 = V9 - -salsa20_256: - VADDW V0, V12, V26 - VADDW V5, V1, V27 - VADDW V10, V6, V28 - VADDW V15, V11, V29 - VROTRW $25, V26, V26 - VROTRW $25, V27, V27 - VROTRW $25, V28, V28 - VROTRW $25, V29, V29 - VXORV V4, V26, V4 - VXORV V9, V27, V9 - VXORV 
V14, V28, V14 - VXORV V3, V29, V3 - VADDW V4, V0, V26 - VADDW V9, V5, V27 - VADDW V14, V10, V28 - VADDW V3, V15, V29 - VROTRW $23, V26, V26 - VROTRW $23, V27, V27 - VROTRW $23, V28, V28 - VROTRW $23, V29, V29 - VXORV V8, V26, V8 - VXORV V13, V27, V13 - VXORV V2, V28, V2 - VXORV V7, V29, V7 - VADDW V8, V4, V26 - VADDW V13, V9, V27 - VADDW V2, V14, V28 - VADDW V7, V3, V29 - VROTRW $19, V26, V26 - VROTRW $19, V27, V27 - VROTRW $19, V28, V28 - VROTRW $19, V29, V29 - VXORV V12, V26, V12 - VXORV V1, V27, V1 - VXORV V6, V28, V6 - VXORV V11, V29, V11 - VADDW V12, V8, V26 - VADDW V1, V13, V27 - VADDW V6, V2, V28 - VADDW V11, V7, V29 - VROTRW $14, V26, V26 - VROTRW $14, V27, V27 - VROTRW $14, V28, V28 - VROTRW $14, V29, V29 - VXORV V0, V26, V0 - VXORV V5, V27, V5 - VXORV V10, V28, V10 - VXORV V15, V29, V15 - - VADDW V0, V3, V26 - VADDW V5, V4, V27 - VADDW V10, V9, V28 - VADDW V15, V14, V29 - VROTRW $25, V26, V26 - VROTRW $25, V27, V27 - VROTRW $25, V28, V28 - VROTRW $25, V29, V29 - VXORV V1, V26, V1 - VXORV V6, V27, V6 - VXORV V11, V28, V11 - VXORV V12, V29, V12 - VADDW V1, V0, V26 - VADDW V6, V5, V27 - VADDW V11, V10, V28 - VADDW V12, V15, V29 - VROTRW $23, V26, V26 - VROTRW $23, V27, V27 - VROTRW $23, V28, V28 - VROTRW $23, V29, V29 - VXORV V2, V26, V2 - VXORV V7, V27, V7 - VXORV V8, V28, V8 - VXORV V13, V29, V13 - VADDW V2, V1, V26 - VADDW V7, V6, V27 - VADDW V8, V11, V28 - VADDW V13, V12, V29 - VROTRW $19, V26, V26 - VROTRW $19, V27, V27 - VROTRW $19, V28, V28 - VROTRW $19, V29, V29 - VXORV V3, V26, V3 - VXORV V4, V27, V4 - VXORV V9, V28, V9 - VXORV V14, V29, V14 - VADDW V3, V2, V26 - VADDW V4, V7, V27 - VADDW V9, V8, V28 - VADDW V14, V13, V29 - VROTRW $14, V26, V26 - VROTRW $14, V27, V27 - VROTRW $14, V28, V28 - VROTRW $14, V29, V29 - VXORV V0, V26, V0 - VXORV V5, V27, V5 - VXORV V10, V28, V10 - VXORV V15, V29, V15 - - SUBV $1, R15 - BNE R15, R0, salsa20_256 - - // load origin contants - // VLDREPL.W $0, R10, V16 - WORD $0x30200150 - // VLDREPL.W $1, R10, V21 - WORD 
$0x30200555 - // VLDREPL.W $2, R10, V26 - WORD $0x3020095a - // VLDREPL.W $3, R10, V31 - WORD $0x30200d5f - - // load origin keys - // VLDREPL.W $0, R8, V17 - WORD $0x30200111 - // VLDREPL.W $1, R8, V18 - WORD $0x30200512 - // VLDREPL.W $2, R8, V19 - WORD $0x30200913 - // VLDREPL.W $3, R8, V20 - WORD $0x30200d14 - // VLDREPL.W $4, R8, V27 - WORD $0x3020111b - // VLDREPL.W $5, R8, V28 - WORD $0x3020151c - // VLDREPL.W $6, R8, V29 - WORD $0x3020191d - // VLDREPL.W $7, R8, V30 - WORD $0x30201d1e - - // load origin counter - // VLDREPL.W $0, R7, V22 - WORD $0x302000f6 - // VLDREPL.W $1, R7, V23 - WORD $0x302004f7 - - // add back the initial state to generate the key stream - VADDW V0, V16, V0 - VADDW V1, V17, V1 - VADDW V2, V18, V2 - VADDW V3, V19, V3 - VADDW V4, V20, V4 - VADDW V5, V21, V5 - VADDW V6, V22, V6 - VADDW V7, V23, V7 - VADDW V8, V24, V8 - VADDW V9, V25, V9 - VADDW V10, V26, V10 - VADDW V11, V27, V11 - VADDW V12, V28, V12 - VADDW V13, V29, V13 - VADDW V14, V30, V14 - VADDW V15, V31, V15 - - // shuffle - VILVLW V0, V1, V16 - VILVHW V0, V1, V17 - VILVLW V2, V3, V18 - VILVHW V2, V3, V19 - VILVLW V4, V5 ,V20 - VILVHW V4, V5, V21 - VILVLW V6, V7, V22 - VILVHW V6, V7, V23 - VILVLW V8, V9, V24 - VILVHW V8, V9, V25 - VILVLW V10, V11, V26 - VILVHW V10, V11, V27 - VILVLW V12, V13, V28 - VILVHW V12, V13, V29 - VILVLW V14, V15, V30 - VILVHW V14, V15, V31 - VILVLV V16, V18, V0 - VILVHV V16, V18, V4 - VILVLV V17, V19, V8 - VILVHV V17, V19, V12 - VILVLV V20, V22, V1 - VILVHV V20, V22, V5 - VILVLV V21, V23, V9 - VILVHV V21, V23, V13 - VILVLV V24, V26, V2 - VILVHV V24, V26, V6 - VILVLV V25, V27, V10 - VILVHV V25, V27, V14 - VILVLV V28, V30, V3 - VILVHV V28, V30, V7 - VILVLV V29, V31, V11 - VILVHV V29, V31, V15 - - SGTU $256, R6, R11 - BNE R11, R0, less_than_256 - - // load src data from R5 - VMOVQ 0(R5), V16 - VMOVQ 16(R5), V17 - VMOVQ 32(R5), V18 - VMOVQ 48(R5), V19 - VMOVQ 64(R5), V20 - VMOVQ 80(R5), V21 - VMOVQ 96(R5), V22 - VMOVQ 112(R5), V23 - VMOVQ 128(R5), V24 - 
VMOVQ 144(R5), V25 - VMOVQ 160(R5), V26 - VMOVQ 176(R5), V27 - VMOVQ 192(R5), V28 - VMOVQ 208(R5), V29 - VMOVQ 224(R5), V30 - VMOVQ 240(R5), V31 - - VXORV V0, V16, V16 - VXORV V1, V17, V17 - VXORV V2, V18, V18 - VXORV V3, V19, V19 - VXORV V4, V20, V20 - VXORV V5, V21, V21 - VXORV V6, V22, V22 - VXORV V7, V23, V23 - VXORV V8, V24, V24 - VXORV V9, V25, V25 - VXORV V10, V26, V26 - VXORV V11, V27, V27 - VXORV V12, V28, V28 - VXORV V13, V29, V29 - VXORV V14, V30, V30 - VXORV V15, V31, V31 - - VMOVQ V16, 0(R4) - VMOVQ V17, 16(R4) - VMOVQ V18, 32(R4) - VMOVQ V19, 48(R4) - VMOVQ V20, 64(R4) - VMOVQ V21, 80(R4) - VMOVQ V22, 96(R4) - VMOVQ V23, 112(R4) - VMOVQ V24, 128(R4) - VMOVQ V25, 144(R4) - VMOVQ V26, 160(R4) - VMOVQ V27, 176(R4) - VMOVQ V28, 192(R4) - VMOVQ V29, 208(R4) - VMOVQ V30, 224(R4) - VMOVQ V31, 240(R4) - - ADDV $4, R9, R9 // update counter - - SUBV $256, R6, R6 - ADDV $256, R4, R4 - ADDV $256, R5, R5 - SGTU $256, R6, R11 - BEQ R11, R0, loop256 - BEQ R6, R0, ret - -less_than_256: - VXORV V30, V30, V30 // V30=0 - SGTU $128, R6, R11 - BNE R11, R0, less_than_128 - SUBV $128, R6 - VMOVQ (R5), V16 - VMOVQ 16(R5), V17 - VMOVQ 32(R5), V18 - VMOVQ 48(R5), V19 - VMOVQ 64(R5), V20 - VMOVQ 80(R5), V21 - VMOVQ 96(R5), V22 - VMOVQ 112(R5), V23 - VXORV V0, V16, V16 - VXORV V1, V17, V17 - VXORV V2, V18, V18 - VXORV V3, V19, V19 - VXORV V4, V20, V20 - VXORV V5, V21, V21 - VXORV V6, V22, V22 - VXORV V7, V23, V23 - VMOVQ V16, (R4) - VMOVQ V17, 16(R4) - VMOVQ V18, 32(R4) - VMOVQ V19, 48(R4) - VMOVQ V20, 64(R4) - VMOVQ V21, 80(R4) - VMOVQ V22, 96(R4) - VMOVQ V23, 112(R4) - BEQ R6, R0, ret - ADDV $128, R5, R5 - ADDV $128, R4, R4 - VADDV V8, V30, V0 - VADDV V9, V30, V1 - VADDV V10, V30, V2 - VADDV V11, V30, V3 - VADDV V12, V30, V4 - VADDV V13, V30, V5 - VADDV V14, V30, V6 - VADDV V15, V30, V7 - -less_than_128: - SGTU $64, R6, R11 - BNE R11, R0, less_than_64 - SUBV $64, R6 - VMOVQ (R5), V16 - VMOVQ 16(R5), V17 - VMOVQ 32(R5), V18 - VMOVQ 48(R5), V19 - VXORV V0, V16, V16 - VXORV V1, 
V17, V17 - VXORV V2, V18, V18 - VXORV V3, V19, V19 - VMOVQ V16, (R4) - VMOVQ V17, 16(R4) - VMOVQ V18, 32(R4) - VMOVQ V19, 48(R4) - BEQ R6, R0, ret - ADDV $64, R5 - ADDV $64, R4 - VADDV V4, V30, V0 - VADDV V5, V30, V1 - VADDV V6, V30, V2 - VADDV V7, V30, V3 - -less_than_64: - SGTU $32, R6, R11 - BNE R11, R0, less_than_32 - SUBV $32, R6 - VMOVQ (R5), V16 - VMOVQ 16(R5), V17 - VXORV V0, V16, V16 - VXORV V1, V17, V17 - VMOVQ V16, (R4) - VMOVQ V17, 16(R4) - BEQ R6, R0, ret - ADDV $32, R5 - ADDV $32, R4 - VADDV V2, V30, V0 - VADDV V3, V30, V1 - -less_than_32: - SGTU $16, R6, R11 - BNE R11, R0, less_than_16 - SUBV $16, R6 - VMOVQ (R5), V16 - VXORV V16, V0, V16 - VMOVQ V16, (R4) - BEQ R6, R0, ret - ADDV $16, R5 - ADDV $16, R4 - VADDV V1, V30, V0 - -less_than_16: - SGTU $8, R6, R11 - BNE R11, R0, less_than_8 - SUBV $8, R6 - VMOVQ V0.V[0], R11 - VMOVQ V0.V[1], R13 - MOVV (R5), R12 - XOR R11, R12, R12 - MOVV R12, (R4) - BEQ R6, R0, ret - ADDV $8, R5 - ADDV $8, R4 - VMOVQ R13, V0.V[0] - -less_than_8: - SGTU $4, R6, R11 - BNE R11, R0, less_than_4 - SUBV $4, R6 - VMOVQ V0.W[0], R11 - VMOVQ V0.W[1], R13 - MOVWU (R5), R12 - XOR R11, R12, R12 - MOVWU R12, (R4) - BEQ R6, R0, ret - ADDV $4, R5 - ADDV $4, R4 - VMOVQ R13, V0.W[0] - -less_than_4: - SGTU $2, R6, R11 - BNE R11, R0, less_than_2 - SUBV $2, R6 - VMOVQ V0.H[0], R11 - VMOVQ V0.H[1], R13 - MOVHU (R5), R12 - XOR R11, R12, R12 - MOVHU R12, (R4) - BEQ R6, R0, ret - ADDV $2, R5 - ADDV $2, R4 - VMOVQ R13, V0.H[0] - -less_than_2: - VMOVQ V0.B[0], R11 - MOVBU (R5), R12 - XOR R11, R12, R12 - MOVBU R12, (R4) - -ret: - RET diff --git a/salsa20/salsa/salsa20_noasm.go b/salsa20/salsa/salsa20_noasm.go index 64e262d..9448760 100644 --- a/salsa20/salsa/salsa20_noasm.go +++ b/salsa20/salsa/salsa20_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !loong64) || purego || !gc +//go:build !amd64 || purego || !gc package salsa |
