diff options
| author | Xiaolin Zhao <zhaoxiaolin@loongson.cn> | 2024-12-23 10:24:07 +0800 |
|---|---|---|
| committer | abner chenc <chenguoqi@loongson.cn> | 2025-03-11 17:59:26 -0700 |
| commit | b369b723c8ad46b179f3a49d57bfc7d6a2740cdf (patch) | |
| tree | 689cd5cf5359689bb4d1cb4fbf0f96d84c1982d4 | |
| parent | 6b853fbea37a941d918ac0760a5492802df42b9b (diff) | |
| download | go-x-crypto-b369b723c8ad46b179f3a49d57bfc7d6a2740cdf.tar.xz | |
crypto/internal/poly1305: implement function update in assembly on loong64
The performance improvements on Loongson-3A5000 and Loongson-3A6000 are as follows:
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A5000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
64 122.8n ± 0% 100.0n ± 0% -18.57% (p=0.000 n=10)
1K 1152.0n ± 0% 732.2n ± 0% -36.44% (p=0.000 n=10)
2M 2.356m ± 0% 1.443m ± 0% -38.74% (p=0.000 n=10)
64Unaligned 122.7n ± 0% 101.5n ± 0% -17.24% (p=0.000 n=10)
1KUnaligned 1152.0n ± 0% 745.4n ± 0% -35.30% (p=0.000 n=10)
2MUnaligned 2.336m ± 0% 1.473m ± 0% -36.94% (p=0.000 n=10)
Write64 77.92n ± 0% 54.88n ± 0% -29.57% (p=0.000 n=10)
Write1K 1106.0n ± 0% 683.3n ± 0% -38.22% (p=0.000 n=10)
Write2M 2.356m ± 0% 1.444m ± 0% -38.72% (p=0.000 n=10)
Write64Unaligned 77.87n ± 0% 55.69n ± 0% -28.49% (p=0.000 n=10)
Write1KUnaligned 1106.0n ± 0% 708.1n ± 0% -35.97% (p=0.000 n=10)
Write2MUnaligned 2.335m ± 0% 1.471m ± 0% -37.01% (p=0.000 n=10)
geomean 6.373µ 4.272µ -32.96%
| bench.old | bench.new |
| B/s | B/s vs base |
64 497.1Mi ± 0% 610.3Mi ± 0% +22.78% (p=0.000 n=10)
1K 847.6Mi ± 0% 1333.7Mi ± 0% +57.35% (p=0.000 n=10)
2M 849.0Mi ± 0% 1385.9Mi ± 0% +63.24% (p=0.000 n=10)
64Unaligned 497.4Mi ± 0% 600.9Mi ± 0% +20.81% (p=0.000 n=10)
1KUnaligned 847.6Mi ± 0% 1310.1Mi ± 0% +54.57% (p=0.000 n=10)
2MUnaligned 856.3Mi ± 0% 1357.9Mi ± 0% +58.58% (p=0.000 n=10)
Write64 783.3Mi ± 0% 1112.2Mi ± 0% +41.99% (p=0.000 n=10)
Write1K 882.8Mi ± 0% 1429.1Mi ± 0% +61.88% (p=0.000 n=10)
Write2M 849.0Mi ± 0% 1385.4Mi ± 0% +63.18% (p=0.000 n=10)
Write64Unaligned 783.8Mi ± 0% 1096.1Mi ± 0% +39.85% (p=0.000 n=10)
Write1KUnaligned 882.8Mi ± 0% 1379.0Mi ± 0% +56.20% (p=0.000 n=10)
Write2MUnaligned 856.5Mi ± 0% 1359.9Mi ± 0% +58.76% (p=0.000 n=10)
geomean 772.2Mi 1.125Gi +49.18%
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A6000-HV @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
64 92.06n ± 0% 71.55n ± 0% -22.28% (p=0.000 n=10)
1K 998.4n ± 0% 607.7n ± 0% -39.13% (p=0.000 n=10)
2M 1.976m ± 0% 1.165m ± 0% -41.07% (p=0.000 n=10)
64Unaligned 92.05n ± 0% 71.55n ± 0% -22.27% (p=0.000 n=10)
1KUnaligned 998.3n ± 0% 607.6n ± 0% -39.13% (p=0.000 n=10)
2MUnaligned 1.975m ± 0% 1.222m ± 0% -38.11% (p=0.000 n=10)
Write64 65.24n ± 0% 45.23n ± 0% -30.67% (p=0.000 n=10)
Write1K 970.8n ± 0% 577.6n ± 0% -40.51% (p=0.000 n=10)
Write2M 1.965m ± 0% 1.163m ± 0% -40.81% (p=0.000 n=10)
Write64Unaligned 65.24n ± 0% 45.24n ± 0% -30.66% (p=0.000 n=10)
Write1KUnaligned 970.8n ± 0% 577.6n ± 0% -40.50% (p=0.000 n=10)
Write2MUnaligned 1.965m ± 0% 1.222m ± 0% -37.81% (p=0.000 n=10)
geomean 5.317µ 3.426µ -35.58%
| bench.old | bench.new |
| B/s | B/s vs base |
64 663.0Mi ± 0% 853.1Mi ± 0% +28.67% (p=0.000 n=10)
1K 978.1Mi ± 0% 1606.9Mi ± 0% +64.28% (p=0.000 n=10)
2M 1012.0Mi ± 0% 1717.4Mi ± 0% +69.70% (p=0.000 n=10)
64Unaligned 663.1Mi ± 0% 853.1Mi ± 0% +28.65% (p=0.000 n=10)
1KUnaligned 978.2Mi ± 0% 1607.1Mi ± 0% +64.29% (p=0.000 n=10)
2MUnaligned 1012.6Mi ± 0% 1636.2Mi ± 0% +61.58% (p=0.000 n=10)
Write64 935.5Mi ± 0% 1349.3Mi ± 0% +44.23% (p=0.000 n=10)
Write1K 1005.9Mi ± 0% 1690.9Mi ± 0% +68.09% (p=0.000 n=10)
Write2M 1017.7Mi ± 0% 1719.5Mi ± 0% +68.95% (p=0.000 n=10)
Write64Unaligned 935.6Mi ± 0% 1349.3Mi ± 0% +44.22% (p=0.000 n=10)
Write1KUnaligned 1006.0Mi ± 0% 1690.9Mi ± 0% +68.08% (p=0.000 n=10)
Write2MUnaligned 1017.7Mi ± 0% 1636.4Mi ± 0% +60.80% (p=0.000 n=10)
geomean 925.6Mi 1.403Gi +55.22%
Change-Id: If05a8bfc868b3e6f903ff169eed7a894af741f9b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/638455
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
| -rw-r--r-- | internal/poly1305/mac_noasm.go | 2 |
| -rw-r--r-- | internal/poly1305/sum_asm.go (renamed from internal/poly1305/sum_amd64.go) | 2 |
| -rw-r--r-- | internal/poly1305/sum_loong64.s | 123 |
| -rw-r--r-- | internal/poly1305/sum_ppc64x.go | 47 |
4 files changed, 125 insertions, 49 deletions
diff --git a/internal/poly1305/mac_noasm.go b/internal/poly1305/mac_noasm.go
index bd896bd..8d99551 100644
--- a/internal/poly1305/mac_noasm.go
+++ b/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
+//go:build (!amd64 && !loong64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
 
 package poly1305
 
diff --git a/internal/poly1305/sum_amd64.go b/internal/poly1305/sum_asm.go
index 164cd47..315b84a 100644
--- a/internal/poly1305/sum_amd64.go
+++ b/internal/poly1305/sum_asm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (amd64 || loong64 || ppc64 || ppc64le)
 
 package poly1305
 
diff --git a/internal/poly1305/sum_loong64.s b/internal/poly1305/sum_loong64.s
new file mode 100644
index 0000000..bc8361d
--- /dev/null
+++ b/internal/poly1305/sum_loong64.s
@@ -0,0 +1,123 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+// func update(state *macState, msg []byte)
+TEXT ·update(SB), $0-32
+	MOVV	state+0(FP), R4
+	MOVV	msg_base+8(FP), R5
+	MOVV	msg_len+16(FP), R6
+
+	MOVV	$0x10, R7
+
+	MOVV	(R4), R8	// h0
+	MOVV	8(R4), R9	// h1
+	MOVV	16(R4), R10	// h2
+	MOVV	24(R4), R11	// r0
+	MOVV	32(R4), R12	// r1
+
+	BLT	R6, R7, bytes_between_0_and_15
+
+loop:
+	MOVV	(R5), R14	// msg[0:8]
+	MOVV	8(R5), R16	// msg[8:16]
+	ADDV	R14, R8, R8	// h0 (x1 + y1 = z1', if z1' < x1 then z1' overflow)
+	ADDV	R16, R9, R27
+	SGTU	R14, R8, R24	// h0.carry
+	SGTU	R9, R27, R28
+	ADDV	R27, R24, R9	// h1
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24	// h1.carry
+	ADDV	$0x01, R24, R24
+	ADDV	R10, R24, R10	// h2
+
+	ADDV	$16, R5, R5	// msg = msg[16:]
+
+multiply:
+	MULV	R8, R11, R14	// h0r0.lo
+	MULHVU	R8, R11, R15	// h0r0.hi
+	MULV	R9, R11, R13	// h1r0.lo
+	MULHVU	R9, R11, R16	// h1r0.hi
+	ADDV	R13, R15, R15
+	SGTU	R13, R15, R24
+	ADDV	R24, R16, R16
+	MULV	R10, R11, R25
+	ADDV	R16, R25, R25
+	MULV	R8, R12, R13	// h0r1.lo
+	MULHVU	R8, R12, R16	// h0r1.hi
+	ADDV	R13, R15, R15
+	SGTU	R13, R15, R24
+	ADDV	R24, R16, R16
+	MOVV	R16, R8
+	MULV	R10, R12, R26	// h2r1
+	MULV	R9, R12, R13	// h1r1.lo
+	MULHVU	R9, R12, R16	// h1r1.hi
+	ADDV	R13, R25, R25
+	ADDV	R16, R26, R27
+	SGTU	R13, R25, R24
+	ADDV	R27, R24, R26
+	ADDV	R8, R25, R25
+	SGTU	R8, R25, R24
+	ADDV	R24, R26, R26
+	AND	$3, R25, R10
+	AND	$-4, R25, R17
+	ADDV	R17, R14, R8
+	ADDV	R26, R15, R27
+	SGTU	R17, R8, R24
+	SGTU	R26, R27, R28
+	ADDV	R27, R24, R9
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24
+	ADDV	R24, R10, R10
+	SLLV	$62, R26, R27
+	SRLV	$2, R25, R28
+	SRLV	$2, R26, R26
+	OR	R27, R28, R25
+	ADDV	R25, R8, R8
+	ADDV	R26, R9, R27
+	SGTU	R25, R8, R24
+	SGTU	R26, R27, R28
+	ADDV	R27, R24, R9
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24
+	ADDV	R24, R10, R10
+
+	SUBV	$16, R6, R6
+	BGE	R6, R7, loop
+
+bytes_between_0_and_15:
+	BEQ	R6, R0, done
+	MOVV	$1, R14
+	XOR	R15, R15
+	ADDV	R6, R5, R5
+
+flush_buffer:
+	MOVBU	-1(R5), R25
+	SRLV	$56, R14, R24
+	SLLV	$8, R15, R28
+	SLLV	$8, R14, R14
+	OR	R24, R28, R15
+	XOR	R25, R14, R14
+	SUBV	$1, R6, R6
+	SUBV	$1, R5, R5
+	BNE	R6, R0, flush_buffer
+
+	ADDV	R14, R8, R8
+	SGTU	R14, R8, R24
+	ADDV	R15, R9, R27
+	SGTU	R15, R27, R28
+	ADDV	R27, R24, R9
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24
+	ADDV	R10, R24, R10
+
+	MOVV	$16, R6
+	JMP	multiply
+
+done:
+	MOVV	R8, (R4)
+	MOVV	R9, 8(R4)
+	MOVV	R10, 16(R4)
+	RET
diff --git a/internal/poly1305/sum_ppc64x.go b/internal/poly1305/sum_ppc64x.go
deleted file mode 100644
index 1a1679a..0000000
--- a/internal/poly1305/sum_ppc64x.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build gc && !purego && (ppc64 || ppc64le)
-
-package poly1305
-
-//go:noescape
-func update(state *macState, msg []byte)
-
-// mac is a wrapper for macGeneric that redirects calls that would have gone to
-// updateGeneric to update.
-//
-// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
-// using function pointers would carry a major performance cost.
-type mac struct{ macGeneric }
-
-func (h *mac) Write(p []byte) (int, error) {
-	nn := len(p)
-	if h.offset > 0 {
-		n := copy(h.buffer[h.offset:], p)
-		if h.offset+n < TagSize {
-			h.offset += n
-			return nn, nil
-		}
-		p = p[n:]
-		h.offset = 0
-		update(&h.macState, h.buffer[:])
-	}
-	if n := len(p) - (len(p) % TagSize); n > 0 {
-		update(&h.macState, p[:n])
-		p = p[n:]
-	}
-	if len(p) > 0 {
-		h.offset += copy(h.buffer[h.offset:], p)
-	}
-	return nn, nil
-}
-
-func (h *mac) Sum(out *[16]byte) {
-	state := h.macState
-	if h.offset > 0 {
-		update(&state, h.buffer[:h.offset])
-	}
-	finalize(out, &state.h, &state.s)
-}
