about | summary | refs | log | tree | commit | diff
path: root/src/internal/bytealg
diff options
context:
space:
mode:
author: Joel Sing <joel@sing.id.au> 2025-02-08 01:03:23 +1100
committer: Joel Sing <joel@sing.id.au> 2025-08-06 06:23:02 -0700
commit: 3406a617d9643902e932ca99c6f276fa7b19e030 (patch)
tree: 6cca19be9e00c0a288ae46060c98809db484e33f /src/internal/bytealg
parent: 75ea2d05c01903a69dbdcd15e64b934da73c84ea (diff)
download: go-3406a617d9643902e932ca99c6f276fa7b19e030.tar.xz
internal/bytealg: vector implementation of indexbyte for riscv64
Provide a vector implementation of indexbyte for riscv64, which is used
when compiled with the rva23u64 profile, or when vector is detected to
be available. Inputs that are smaller than 24 bytes will continue to
use the non-vector path.

On a Banana Pi F3, with GORISCV64=rva23u64:

                 │ indexbyte.1  │             indexbyte.2              │
                 │    sec/op    │    sec/op     vs base                │
IndexByte/10-8     52.68n ±  0%   47.26n ±  0%  -10.30% (p=0.000 n=10)
IndexByte/32-8     68.62n ±  0%   47.02n ±  0%  -31.49% (p=0.000 n=10)
IndexByte/4K-8    2217.0n ±  0%   420.4n ±  0%  -81.04% (p=0.000 n=10)
IndexByte/4M-8    2624.4µ ±  0%   767.5µ ±  0%  -70.75% (p=0.000 n=10)
IndexByte/64M-8    68.08m ± 10%   47.84m ± 45%  -29.73% (p=0.004 n=10)
geomean            17.03µ         8.073µ        -52.59%

                 │ indexbyte.1  │              indexbyte.2               │
                 │     B/s      │      B/s       vs base                 │
IndexByte/10-8     181.0Mi ± 0%    201.8Mi ±  0%   +11.48% (p=0.000 n=10)
IndexByte/32-8     444.7Mi ± 0%    649.1Mi ±  0%   +45.97% (p=0.000 n=10)
IndexByte/4K-8     1.721Gi ± 0%    9.076Gi ±  0%  +427.51% (p=0.000 n=10)
IndexByte/4M-8     1.488Gi ± 0%    5.089Gi ±  0%  +241.93% (p=0.000 n=10)
IndexByte/64M-8    940.3Mi ± 9%   1337.8Mi ± 31%   +42.27% (p=0.004 n=10)
geomean            727.1Mi         1.498Gi        +110.94%

Change-Id: If7b0dbef38d76fa7a2021e4ecaed668a1d4b9783
Reviewed-on: https://go-review.googlesource.com/c/go/+/648856
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Diffstat (limited to 'src/internal/bytealg')
-rw-r--r-- src/internal/bytealg/indexbyte_riscv64.s 60
1 files changed, 41 insertions, 19 deletions
diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s
index fde00da0ea..527ae6d35e 100644
--- a/src/internal/bytealg/indexbyte_riscv64.s
+++ b/src/internal/bytealg/indexbyte_riscv64.s
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"
@@ -11,12 +12,14 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
// X12 = b_cap (unused)
// X13 = byte to find
AND $0xff, X13, X12 // x12 byte to look for
- MOV X10, X13 // store base for later
SLTI $24, X11, X14
- ADD X10, X11 // end
- BEQZ X14, bigBody
+ BNEZ X14, small
+ JMP indexByteBig<>(SB)
+small:
+ MOV X10, X13 // store base for later
+ ADD X10, X11 // end
SUB $1, X10
loop:
ADD $1, X10
@@ -31,21 +34,19 @@ notfound:
MOV $-1, X10
RET
-bigBody:
- JMP indexByteBig<>(SB)
-
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
// X10 = b_base
// X11 = b_len
// X12 = byte to find
-
AND $0xff, X12 // x12 byte to look for
- MOV X10, X13 // store base for later
SLTI $24, X11, X14
- ADD X10, X11 // end
- BEQZ X14, bigBody
+ BNEZ X14, small
+ JMP indexByteBig<>(SB)
+small:
+ MOV X10, X13 // store base for later
+ ADD X10, X11 // end
SUB $1, X10
loop:
ADD $1, X10
@@ -60,20 +61,41 @@ notfound:
MOV $-1, X10
RET
-bigBody:
- JMP indexByteBig<>(SB)
-
TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0
- // On entry
+ // On entry:
// X10 = b_base
- // X11 = end
+ // X11 = b_len (at least 16 bytes)
// X12 = byte to find
- // X13 = b_base
- // X11 is at least 16 bytes > X10
-
- // On exit
+ // On exit:
// X10 = index of first instance of sought byte, if found, or -1 otherwise
+ MOV X10, X13 // store base for later
+
+#ifndef hasV
+ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5
+ BEQZ X5, indexbyte_scalar
+#endif
+
+ PCALIGN $16
+vector_loop:
+ VSETVLI X11, E8, M8, TA, MA, X5
+ VLE8V (X10), V8
+ VMSEQVX X12, V8, V0
+ VFIRSTM V0, X6
+ BGEZ X6, vector_found
+ ADD X5, X10
+ SUB X5, X11
+ BNEZ X11, vector_loop
+ JMP notfound
+
+vector_found:
+ SUB X13, X10
+ ADD X6, X10
+ RET
+
+indexbyte_scalar:
+ ADD X10, X11 // end
+
// Process the first few bytes until we get to an 8 byte boundary
// No need to check for end here as we have at least 16 bytes in
// the buffer.