From 18f770732fa01d5d5e1a529a5518d7b70f93d3c6 Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao Date: Mon, 7 Apr 2025 15:25:31 +0800 Subject: salsa20: add loong64 SIMD implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The performance gains on Loongson 3A6000 and 3A5000 are as follows: goos: linux goarch: loong64 pkg: golang.org/x/crypto/salsa20 cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | XOR1K 3175.0n ± 0% 435.4n ± 0% -86.29% (p=0.000 n=20) | bench.old | bench.new | | B/s | B/s vs base | XOR1K 307.6Mi ± 0% 2242.7Mi ± 0% +629.13% (p=0.000 n=20) goos: linux goarch: loong64 pkg: golang.org/x/crypto/salsa20 cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | XOR1K 4125.0n ± 0% 864.0n ± 0% -79.05% (p=0.000 n=20) | bench.old | bench.new | | B/s | B/s vs base | XOR1K 236.7Mi ± 0% 1130.3Mi ± 0% +377.41% (p=0.000 n=20) Change-Id: Ib37f603e6654f1e3837985fad4b6dee10b5af993 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/663375 Reviewed-by: Carlos Amedee Reviewed-by: abner chenc Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Auto-Submit: Carlos Amedee --- salsa20/salsa/salsa20_amd64_test.go | 31 --- salsa20/salsa/salsa20_loong64.go | 29 +++ salsa20/salsa/salsa20_loong64.s | 482 ++++++++++++++++++++++++++++++++++++ salsa20/salsa/salsa20_noasm.go | 2 +- salsa20/salsa/salsa20_test.go | 31 +++ 5 files changed, 543 insertions(+), 32 deletions(-) delete mode 100644 salsa20/salsa/salsa20_amd64_test.go create mode 100644 salsa20/salsa/salsa20_loong64.go create mode 100644 salsa20/salsa/salsa20_loong64.s create mode 100644 salsa20/salsa/salsa20_test.go diff --git a/salsa20/salsa/salsa20_amd64_test.go b/salsa20/salsa/salsa20_amd64_test.go deleted file mode 100644 index fe14604..0000000 --- a/salsa20/salsa/salsa20_amd64_test.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build amd64 && !purego && gc - -package salsa - -import ( - "bytes" - "testing" -) - -func TestCounterOverflow(t *testing.T) { - in := make([]byte, 4096) - key := &[32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2} - for n, counter := range []*[16]byte{ - &[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0}, // zero counter - &[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff}, // counter about to overflow 32 bits - &[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 0xff, 0xff, 0xff, 0xff}, // counter above 32 bits - } { - out := make([]byte, 4096) - XORKeyStream(out, in, counter, key) - outGeneric := make([]byte, 4096) - genericXORKeyStream(outGeneric, in, counter, key) - if !bytes.Equal(out, outGeneric) { - t.Errorf("%d: assembly and go implementations disagree", n) - } - } -} diff --git a/salsa20/salsa/salsa20_loong64.go b/salsa20/salsa/salsa20_loong64.go new file mode 100644 index 0000000..8c7d867 --- /dev/null +++ b/salsa20/salsa/salsa20_loong64.go @@ -0,0 +1,29 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build loong64 && !purego && gc + +package salsa + +import "golang.org/x/sys/cpu" + +// XORKeyStreamVX XORs n bytes of in with the Salsa20 key stream into out using LSX vector instructions; counter points to the 16 raw counter bytes (nonce then block counter) and key to the 32-byte key. It is implemented in salsa20_loong64.s, whose frame references use these parameter names. +// +//go:noescape +func XORKeyStreamVX(out, in *byte, n uint64, counter, key *byte) + +// XORKeyStream crypts bytes from in to out using the given key and counters. +// In and out must overlap entirely or not at all. Counter +// contains the raw salsa20 counter bytes (both nonce and block counter). 
+func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) { + if len(in) == 0 { + return + } + _ = out[len(in)-1] + if cpu.Loong64.HasLSX { + XORKeyStreamVX(&out[0], &in[0], uint64(len(in)), &counter[0], &key[0]) + } else { + genericXORKeyStream(out, in, counter, key) + } +} diff --git a/salsa20/salsa/salsa20_loong64.s b/salsa20/salsa/salsa20_loong64.s new file mode 100644 index 0000000..dc4b501 --- /dev/null +++ b/salsa20/salsa/salsa20_loong64.s @@ -0,0 +1,482 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego && gc + +#include "textflag.h" + +DATA ·constants+0x00(SB)/4, $0x61707865 +DATA ·constants+0x04(SB)/4, $0x3320646e +DATA ·constants+0x08(SB)/4, $0x79622d32 +DATA ·constants+0x0c(SB)/4, $0x6b206574 +GLOBL ·constants(SB), NOPTR|RODATA, $32 + +#define NUM_ROUNDS 10 + +// func XORKeyStreamVX(out, in *byte, n uint64, counter, key *byte) +TEXT ·XORKeyStreamVX(SB), $0-40 + MOVV out+0(FP), R4 + MOVV in+8(FP), R5 + MOVV n+16(FP), R6 + MOVV counter+24(FP), R7 + MOVV key+32(FP), R8 + MOVV $·constants(SB), R10 + + BGE R0, R6, ret + + MOVV 8(R7), R9 // counter[8:16] + +loop256: + MOVV $NUM_ROUNDS, R15 + VXORV V30, V30, V30 // V30 = 0 + + // load constants + // VLDREPL.W $0, R10, V0 + WORD $0x30200140 + // VLDREPL.W $1, R10, V5 + WORD $0x30200545 + // VLDREPL.W $2, R10, V10 + WORD $0x3020094a + // VLDREPL.W $3, R10, V15 + WORD $0x30200d4f + + // load keys + // VLDREPL.W $0, R8, V1 + WORD $0x30200101 + // VLDREPL.W $1, R8, V2 + WORD $0x30200502 + // VLDREPL.W $2, R8, V3 + WORD $0x30200903 + // VLDREPL.W $3, R8, V4 + WORD $0x30200d04 + // VLDREPL.W $4, R8, V11 + WORD $0x3020110b + // VLDREPL.W $5, R8, V12 + WORD $0x3020150c + // VLDREPL.W $6, R8, V13 + WORD $0x3020190d + // VLDREPL.W $7, R8, V14 + WORD $0x30201d0e + + // load and update counter + // VLDREPL.W $0, R7, V6 + WORD $0x302000e6 + // VLDREPL.W $1, R7, V7 + WORD 
$0x302004e7 + + ADDV $1, R9, R11 + ADDV $2, R9, R12 + ADDV $3, R9, R13 + VMOVQ R9, V8.W[0] + VMOVQ R11, V8.W[1] + VMOVQ R12, V8.W[2] + VMOVQ R13, V8.W[3] + SRLV $32, R9, R14 + SRLV $32, R11, R11 + SRLV $32, R12, R12 + SRLV $32, R13, R13 + VMOVQ R14, V9.W[0] + VMOVQ R11, V9.W[1] + VMOVQ R12, V9.W[2] + VMOVQ R13, V9.W[3] + + // backup V8 and V9 + VADDV V8, V30, V24 // V24 = V8 + VADDV V9, V30, V25 // V25 = V9 + +salsa20_256: + VADDW V0, V12, V26 + VADDW V5, V1, V27 + VADDW V10, V6, V28 + VADDW V15, V11, V29 + VROTRW $25, V26, V26 + VROTRW $25, V27, V27 + VROTRW $25, V28, V28 + VROTRW $25, V29, V29 + VXORV V4, V26, V4 + VXORV V9, V27, V9 + VXORV V14, V28, V14 + VXORV V3, V29, V3 + VADDW V4, V0, V26 + VADDW V9, V5, V27 + VADDW V14, V10, V28 + VADDW V3, V15, V29 + VROTRW $23, V26, V26 + VROTRW $23, V27, V27 + VROTRW $23, V28, V28 + VROTRW $23, V29, V29 + VXORV V8, V26, V8 + VXORV V13, V27, V13 + VXORV V2, V28, V2 + VXORV V7, V29, V7 + VADDW V8, V4, V26 + VADDW V13, V9, V27 + VADDW V2, V14, V28 + VADDW V7, V3, V29 + VROTRW $19, V26, V26 + VROTRW $19, V27, V27 + VROTRW $19, V28, V28 + VROTRW $19, V29, V29 + VXORV V12, V26, V12 + VXORV V1, V27, V1 + VXORV V6, V28, V6 + VXORV V11, V29, V11 + VADDW V12, V8, V26 + VADDW V1, V13, V27 + VADDW V6, V2, V28 + VADDW V11, V7, V29 + VROTRW $14, V26, V26 + VROTRW $14, V27, V27 + VROTRW $14, V28, V28 + VROTRW $14, V29, V29 + VXORV V0, V26, V0 + VXORV V5, V27, V5 + VXORV V10, V28, V10 + VXORV V15, V29, V15 + + VADDW V0, V3, V26 + VADDW V5, V4, V27 + VADDW V10, V9, V28 + VADDW V15, V14, V29 + VROTRW $25, V26, V26 + VROTRW $25, V27, V27 + VROTRW $25, V28, V28 + VROTRW $25, V29, V29 + VXORV V1, V26, V1 + VXORV V6, V27, V6 + VXORV V11, V28, V11 + VXORV V12, V29, V12 + VADDW V1, V0, V26 + VADDW V6, V5, V27 + VADDW V11, V10, V28 + VADDW V12, V15, V29 + VROTRW $23, V26, V26 + VROTRW $23, V27, V27 + VROTRW $23, V28, V28 + VROTRW $23, V29, V29 + VXORV V2, V26, V2 + VXORV V7, V27, V7 + VXORV V8, V28, V8 + VXORV V13, V29, V13 + VADDW V2, V1, V26 + 
VADDW V7, V6, V27 + VADDW V8, V11, V28 + VADDW V13, V12, V29 + VROTRW $19, V26, V26 + VROTRW $19, V27, V27 + VROTRW $19, V28, V28 + VROTRW $19, V29, V29 + VXORV V3, V26, V3 + VXORV V4, V27, V4 + VXORV V9, V28, V9 + VXORV V14, V29, V14 + VADDW V3, V2, V26 + VADDW V4, V7, V27 + VADDW V9, V8, V28 + VADDW V14, V13, V29 + VROTRW $14, V26, V26 + VROTRW $14, V27, V27 + VROTRW $14, V28, V28 + VROTRW $14, V29, V29 + VXORV V0, V26, V0 + VXORV V5, V27, V5 + VXORV V10, V28, V10 + VXORV V15, V29, V15 + + SUBV $1, R15 + BNE R15, R0, salsa20_256 + + // load original constants + // VLDREPL.W $0, R10, V16 + WORD $0x30200150 + // VLDREPL.W $1, R10, V21 + WORD $0x30200555 + // VLDREPL.W $2, R10, V26 + WORD $0x3020095a + // VLDREPL.W $3, R10, V31 + WORD $0x30200d5f + + // load original keys + // VLDREPL.W $0, R8, V17 + WORD $0x30200111 + // VLDREPL.W $1, R8, V18 + WORD $0x30200512 + // VLDREPL.W $2, R8, V19 + WORD $0x30200913 + // VLDREPL.W $3, R8, V20 + WORD $0x30200d14 + // VLDREPL.W $4, R8, V27 + WORD $0x3020111b + // VLDREPL.W $5, R8, V28 + WORD $0x3020151c + // VLDREPL.W $6, R8, V29 + WORD $0x3020191d + // VLDREPL.W $7, R8, V30 + WORD $0x30201d1e + + // load original counter + // VLDREPL.W $0, R7, V22 + WORD $0x302000f6 + // VLDREPL.W $1, R7, V23 + WORD $0x302004f7 + + // add back the initial state to generate the key stream + VADDW V0, V16, V0 + VADDW V1, V17, V1 + VADDW V2, V18, V2 + VADDW V3, V19, V3 + VADDW V4, V20, V4 + VADDW V5, V21, V5 + VADDW V6, V22, V6 + VADDW V7, V23, V7 + VADDW V8, V24, V8 + VADDW V9, V25, V9 + VADDW V10, V26, V10 + VADDW V11, V27, V11 + VADDW V12, V28, V12 + VADDW V13, V29, V13 + VADDW V14, V30, V14 + VADDW V15, V31, V15 + + // shuffle + VILVLW V0, V1, V16 + VILVHW V0, V1, V17 + VILVLW V2, V3, V18 + VILVHW V2, V3, V19 + VILVLW V4, V5 ,V20 + VILVHW V4, V5, V21 + VILVLW V6, V7, V22 + VILVHW V6, V7, V23 + VILVLW V8, V9, V24 + VILVHW V8, V9, V25 + VILVLW V10, V11, V26 + VILVHW V10, V11, V27 + VILVLW V12, V13, V28 + VILVHW V12, V13, V29 + VILVLW V14, V15, V30 + 
VILVHW V14, V15, V31 + VILVLV V16, V18, V0 + VILVHV V16, V18, V4 + VILVLV V17, V19, V8 + VILVHV V17, V19, V12 + VILVLV V20, V22, V1 + VILVHV V20, V22, V5 + VILVLV V21, V23, V9 + VILVHV V21, V23, V13 + VILVLV V24, V26, V2 + VILVHV V24, V26, V6 + VILVLV V25, V27, V10 + VILVHV V25, V27, V14 + VILVLV V28, V30, V3 + VILVHV V28, V30, V7 + VILVLV V29, V31, V11 + VILVHV V29, V31, V15 + + SGTU $256, R6, R11 + BNE R11, R0, less_than_256 + + // load src data from R5 + VMOVQ 0(R5), V16 + VMOVQ 16(R5), V17 + VMOVQ 32(R5), V18 + VMOVQ 48(R5), V19 + VMOVQ 64(R5), V20 + VMOVQ 80(R5), V21 + VMOVQ 96(R5), V22 + VMOVQ 112(R5), V23 + VMOVQ 128(R5), V24 + VMOVQ 144(R5), V25 + VMOVQ 160(R5), V26 + VMOVQ 176(R5), V27 + VMOVQ 192(R5), V28 + VMOVQ 208(R5), V29 + VMOVQ 224(R5), V30 + VMOVQ 240(R5), V31 + + VXORV V0, V16, V16 + VXORV V1, V17, V17 + VXORV V2, V18, V18 + VXORV V3, V19, V19 + VXORV V4, V20, V20 + VXORV V5, V21, V21 + VXORV V6, V22, V22 + VXORV V7, V23, V23 + VXORV V8, V24, V24 + VXORV V9, V25, V25 + VXORV V10, V26, V26 + VXORV V11, V27, V27 + VXORV V12, V28, V28 + VXORV V13, V29, V29 + VXORV V14, V30, V30 + VXORV V15, V31, V31 + + VMOVQ V16, 0(R4) + VMOVQ V17, 16(R4) + VMOVQ V18, 32(R4) + VMOVQ V19, 48(R4) + VMOVQ V20, 64(R4) + VMOVQ V21, 80(R4) + VMOVQ V22, 96(R4) + VMOVQ V23, 112(R4) + VMOVQ V24, 128(R4) + VMOVQ V25, 144(R4) + VMOVQ V26, 160(R4) + VMOVQ V27, 176(R4) + VMOVQ V28, 192(R4) + VMOVQ V29, 208(R4) + VMOVQ V30, 224(R4) + VMOVQ V31, 240(R4) + + ADDV $4, R9, R9 // update counter + + SUBV $256, R6, R6 + ADDV $256, R4, R4 + ADDV $256, R5, R5 + SGTU $256, R6, R11 + BEQ R11, R0, loop256 + BEQ R6, R0, ret + +less_than_256: + VXORV V30, V30, V30 // V30=0 + SGTU $128, R6, R11 + BNE R11, R0, less_than_128 + SUBV $128, R6 + VMOVQ (R5), V16 + VMOVQ 16(R5), V17 + VMOVQ 32(R5), V18 + VMOVQ 48(R5), V19 + VMOVQ 64(R5), V20 + VMOVQ 80(R5), V21 + VMOVQ 96(R5), V22 + VMOVQ 112(R5), V23 + VXORV V0, V16, V16 + VXORV V1, V17, V17 + VXORV V2, V18, V18 + VXORV V3, V19, V19 + VXORV V4, V20, 
V20 + VXORV V5, V21, V21 + VXORV V6, V22, V22 + VXORV V7, V23, V23 + VMOVQ V16, (R4) + VMOVQ V17, 16(R4) + VMOVQ V18, 32(R4) + VMOVQ V19, 48(R4) + VMOVQ V20, 64(R4) + VMOVQ V21, 80(R4) + VMOVQ V22, 96(R4) + VMOVQ V23, 112(R4) + BEQ R6, R0, ret + ADDV $128, R5, R5 + ADDV $128, R4, R4 + VADDV V8, V30, V0 + VADDV V9, V30, V1 + VADDV V10, V30, V2 + VADDV V11, V30, V3 + VADDV V12, V30, V4 + VADDV V13, V30, V5 + VADDV V14, V30, V6 + VADDV V15, V30, V7 + +less_than_128: + SGTU $64, R6, R11 + BNE R11, R0, less_than_64 + SUBV $64, R6 + VMOVQ (R5), V16 + VMOVQ 16(R5), V17 + VMOVQ 32(R5), V18 + VMOVQ 48(R5), V19 + VXORV V0, V16, V16 + VXORV V1, V17, V17 + VXORV V2, V18, V18 + VXORV V3, V19, V19 + VMOVQ V16, (R4) + VMOVQ V17, 16(R4) + VMOVQ V18, 32(R4) + VMOVQ V19, 48(R4) + BEQ R6, R0, ret + ADDV $64, R5 + ADDV $64, R4 + VADDV V4, V30, V0 + VADDV V5, V30, V1 + VADDV V6, V30, V2 + VADDV V7, V30, V3 + +less_than_64: + SGTU $32, R6, R11 + BNE R11, R0, less_than_32 + SUBV $32, R6 + VMOVQ (R5), V16 + VMOVQ 16(R5), V17 + VXORV V0, V16, V16 + VXORV V1, V17, V17 + VMOVQ V16, (R4) + VMOVQ V17, 16(R4) + BEQ R6, R0, ret + ADDV $32, R5 + ADDV $32, R4 + VADDV V2, V30, V0 + VADDV V3, V30, V1 + +less_than_32: + SGTU $16, R6, R11 + BNE R11, R0, less_than_16 + SUBV $16, R6 + VMOVQ (R5), V16 + VXORV V16, V0, V16 + VMOVQ V16, (R4) + BEQ R6, R0, ret + ADDV $16, R5 + ADDV $16, R4 + VADDV V1, V30, V0 + +less_than_16: + SGTU $8, R6, R11 + BNE R11, R0, less_than_8 + SUBV $8, R6 + VMOVQ V0.V[0], R11 + VMOVQ V0.V[1], R13 + MOVV (R5), R12 + XOR R11, R12, R12 + MOVV R12, (R4) + BEQ R6, R0, ret + ADDV $8, R5 + ADDV $8, R4 + VMOVQ R13, V0.V[0] + +less_than_8: + SGTU $4, R6, R11 + BNE R11, R0, less_than_4 + SUBV $4, R6 + VMOVQ V0.W[0], R11 + VMOVQ V0.W[1], R13 + MOVWU (R5), R12 + XOR R11, R12, R12 + MOVWU R12, (R4) + BEQ R6, R0, ret + ADDV $4, R5 + ADDV $4, R4 + VMOVQ R13, V0.W[0] + +less_than_4: + SGTU $2, R6, R11 + BNE R11, R0, less_than_2 + SUBV $2, R6 + VMOVQ V0.H[0], R11 + VMOVQ V0.H[1], R13 + MOVHU 
(R5), R12 + XOR R11, R12, R12 + MOVHU R12, (R4) + BEQ R6, R0, ret + ADDV $2, R5 + ADDV $2, R4 + VMOVQ R13, V0.H[0] + +less_than_2: + VMOVQ V0.B[0], R11 + MOVBU (R5), R12 + XOR R11, R12, R12 + MOVBU R12, (R4) + +ret: + RET diff --git a/salsa20/salsa/salsa20_noasm.go b/salsa20/salsa/salsa20_noasm.go index 9448760..64e262d 100644 --- a/salsa20/salsa/salsa20_noasm.go +++ b/salsa20/salsa/salsa20_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !amd64 || purego || !gc +//go:build (!amd64 && !loong64) || purego || !gc package salsa diff --git a/salsa20/salsa/salsa20_test.go b/salsa20/salsa/salsa20_test.go new file mode 100644 index 0000000..d6b29a3 --- /dev/null +++ b/salsa20/salsa/salsa20_test.go @@ -0,0 +1,31 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (amd64 || loong64) && !purego && gc + +package salsa + +import ( + "bytes" + "testing" +) + +func TestCounterOverflow(t *testing.T) { + in := make([]byte, 4096) + key := &[32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2} + for n, counter := range []*[16]byte{ + &[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0}, // zero counter + &[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff}, // counter about to overflow 32 bits + &[16]byte{0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 0xff, 0xff, 0xff, 0xff}, // counter above 32 bits + } { + out := make([]byte, 4096) + XORKeyStream(out, in, counter, key) + outGeneric := make([]byte, 4096) + genericXORKeyStream(outGeneric, in, counter, key) + if !bytes.Equal(out, outGeneric) { + t.Errorf("%d: assembly and go implementations disagree", n) + } + } +} -- cgit v1.3