aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJoel Sing <joel@sing.id.au>2025-02-12 23:41:35 +1100
committerJoel Sing <joel@sing.id.au>2025-08-08 01:35:29 -0700
commitd83b16fcb8de765f25cabbea63284406ea6dd091 (patch)
tree61431405abeeebae69b4083090bd7e0d2bd66fbb /src
parentdd3abf6bc5203df3c8a09ff151852fa8b9e7f2f4 (diff)
downloadgo-d83b16fcb8de765f25cabbea63284406ea6dd091.tar.xz
internal/bytealg: vector implementation of compare for riscv64
Provide a vector implementation of compare for riscv64, which is used when compiled with the rva23u64 profile, or when vector is detected to be available. Inputs that are 8 byte aligned will still be handled via the non-vector code if the length is less than or equal to 128 bytes. On a Banana Pi F3, with GORISCV64=rva23u64: │ compare.1 │ compare.2 │ │ sec/op │ sec/op vs base │ BytesCompare/1-8 24.36n ± 0% 24.15n ± 0% -0.84% (p=0.007 n=10) BytesCompare/2-8 26.75n ± 0% 26.97n ± 0% +0.82% (p=0.000 n=10) BytesCompare/4-8 27.63n ± 0% 27.80n ± 0% +0.60% (p=0.001 n=10) BytesCompare/8-8 35.91n ± 0% 35.19n ± 0% -2.01% (p=0.000 n=10) BytesCompare/16-8 53.22n ± 0% 24.04n ± 1% -54.82% (p=0.000 n=10) BytesCompare/32-8 25.12n ± 0% 26.09n ± 1% +3.86% (p=0.000 n=10) BytesCompare/64-8 32.52n ± 0% 33.43n ± 1% +2.78% (p=0.000 n=10) BytesCompare/128-8 46.59n ± 0% 48.22n ± 1% +3.50% (p=0.000 n=10) BytesCompare/256-8 74.25n ± 0% 50.18n ± 0% -32.42% (p=0.000 n=10) BytesCompare/512-8 129.85n ± 0% 83.12n ± 0% -35.98% (p=0.000 n=10) BytesCompare/1024-8 244.6n ± 0% 148.0n ± 1% -39.49% (p=0.000 n=10) BytesCompare/2048-8 465.9n ± 0% 282.8n ± 2% -39.30% (p=0.000 n=10) CompareBytesEqual-8 51.96n ± 0% 52.90n ± 1% +1.80% (p=0.000 n=10) CompareBytesToNil-8 15.77n ± 1% 15.68n ± 0% -0.57% (p=0.000 n=10) CompareBytesEmpty-8 14.21n ± 1% 14.20n ± 1% ~ (p=1.000 n=10) CompareBytesIdentical-8 14.20n ± 1% 15.07n ± 1% +6.20% (p=0.000 n=10) CompareBytesSameLength-8 31.38n ± 0% 30.52n ± 0% -2.74% (p=0.000 n=10) CompareBytesDifferentLength-8 31.38n ± 0% 30.53n ± 0% -2.71% (p=0.000 n=10) CompareBytesBigUnaligned/offset=1-8 2401.0µ ± 0% 437.6µ ± 0% -81.77% (p=0.000 n=10) CompareBytesBigUnaligned/offset=2-8 2376.8µ ± 0% 437.4µ ± 0% -81.60% (p=0.000 n=10) CompareBytesBigUnaligned/offset=3-8 2384.1µ ± 0% 437.5µ ± 0% -81.65% (p=0.000 n=10) CompareBytesBigUnaligned/offset=4-8 2377.7µ ± 0% 437.4µ ± 0% -81.60% (p=0.000 n=10) CompareBytesBigUnaligned/offset=5-8 2366.3µ ± 0% 437.5µ ± 0% -81.51% (p=0.000 n=10) 
CompareBytesBigUnaligned/offset=6-8 2357.3µ ± 0% 437.3µ ± 0% -81.45% (p=0.000 n=10) CompareBytesBigUnaligned/offset=7-8 2385.3µ ± 0% 437.6µ ± 0% -81.65% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=0-8 447.2µ ± 0% 464.8µ ± 0% +3.94% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=1-8 447.7µ ± 0% 453.1µ ± 0% +1.20% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=2-8 447.9µ ± 0% 453.0µ ± 0% +1.15% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=3-8 448.0µ ± 0% 452.5µ ± 0% +1.02% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=4-8 448.0µ ± 0% 452.1µ ± 0% +0.92% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=5-8 447.8µ ± 0% 452.8µ ± 0% +1.12% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=6-8 447.9µ ± 0% 452.4µ ± 0% +1.01% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=7-8 447.9µ ± 0% 452.8µ ± 0% +1.09% (p=0.000 n=10) CompareBytesBig-8 441.2µ ± 0% 461.8µ ± 0% +4.66% (p=0.000 n=10) CompareBytesBigIdentical-8 13.81n ± 0% 13.80n ± 0% ~ (p=0.519 n=10) geomean 3.980µ 2.651µ -33.40% │ compare.1 │ compare.2 │ │ B/s │ B/s vs base │ CompareBytesBigUnaligned/offset=1-8 416.5Mi ± 0% 2285.1Mi ± 0% +448.64% (p=0.000 n=10) CompareBytesBigUnaligned/offset=2-8 420.7Mi ± 0% 2286.4Mi ± 0% +443.43% (p=0.000 n=10) CompareBytesBigUnaligned/offset=3-8 419.5Mi ± 0% 2285.9Mi ± 0% +444.97% (p=0.000 n=10) CompareBytesBigUnaligned/offset=4-8 420.6Mi ± 0% 2286.1Mi ± 0% +443.57% (p=0.000 n=10) CompareBytesBigUnaligned/offset=5-8 422.6Mi ± 0% 2285.7Mi ± 0% +440.86% (p=0.000 n=10) CompareBytesBigUnaligned/offset=6-8 424.2Mi ± 0% 2286.8Mi ± 0% +439.07% (p=0.000 n=10) CompareBytesBigUnaligned/offset=7-8 419.2Mi ± 0% 2285.2Mi ± 0% +445.07% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=0-8 2.184Gi ± 0% 2.101Gi ± 0% -3.79% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=1-8 2.181Gi ± 0% 2.155Gi ± 0% -1.18% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=2-8 2.180Gi ± 0% 2.156Gi ± 0% -1.13% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=3-8 2.180Gi ± 0% 2.158Gi 
± 0% -1.01% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=4-8 2.180Gi ± 0% 2.160Gi ± 0% -0.91% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=5-8 2.181Gi ± 0% 2.157Gi ± 0% -1.11% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=6-8 2.181Gi ± 0% 2.159Gi ± 0% -1.00% (p=0.000 n=10) CompareBytesBigBothUnaligned/offset=7-8 2.180Gi ± 0% 2.157Gi ± 0% -1.08% (p=0.000 n=10) CompareBytesBig-8 2.213Gi ± 0% 2.115Gi ± 0% -4.45% (p=0.000 n=10) CompareBytesBigIdentical-8 69.06Ti ± 0% 69.09Ti ± 0% ~ (p=0.315 n=10) geomean 2.022Gi 4.022Gi +98.95% Change-Id: Id3012faf8d353eb1be0e1fb01b78ac43fa4c7e8b Reviewed-on: https://go-review.googlesource.com/c/go/+/646737 Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Mark Freeman <markfreeman@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Diffstat (limited to 'src')
-rw-r--r--src/internal/bytealg/compare_riscv64.s47
1 files changed, 44 insertions, 3 deletions
diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
index 6388fcd209..3b1523dfbf 100644
--- a/src/internal/bytealg/compare_riscv64.s
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"
@@ -35,6 +36,46 @@ TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
MIN X11, X13, X5
BEQZ X5, cmp_len
+ MOV $16, X6
+ BLT X5, X6, check8_unaligned
+
+#ifndef hasV
+ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X6
+ BEQZ X6, compare_scalar
+#endif
+
+ // Use vector if not 8 byte aligned.
+ OR X10, X12, X6
+ AND $7, X6
+ BNEZ X6, vector_loop
+
+ // Use scalar if 8 byte aligned and <= 128 bytes.
+ SUB $128, X5, X6
+ BLEZ X6, compare_scalar_aligned
+
+ PCALIGN $16
+vector_loop:
+ VSETVLI X5, E8, M8, TA, MA, X6
+ VLE8V (X10), V8
+ VLE8V (X12), V16
+ VMSNEVV V8, V16, V0
+ VFIRSTM V0, X7
+ BGEZ X7, vector_not_eq
+ ADD X6, X10
+ ADD X6, X12
+ SUB X6, X5
+ BNEZ X5, vector_loop
+ JMP cmp_len
+
+vector_not_eq:
+ // Load first differing bytes in X8/X9.
+ ADD X7, X10
+ ADD X7, X12
+ MOVBU (X10), X8
+ MOVBU (X12), X9
+ JMP cmp
+
+compare_scalar:
MOV $32, X6
BLT X5, X6, check8_unaligned
@@ -57,9 +98,9 @@ align:
ADD $1, X12
BNEZ X7, align
-check32:
- // X6 contains $32
- BLT X5, X6, compare16
+compare_scalar_aligned:
+ MOV $32, X6
+ BLT X5, X6, check16
compare32:
MOV 0(X10), X15
MOV 0(X12), X16