aboutsummaryrefslogtreecommitdiff
path: root/src/internal/bytealg
diff options
context:
space:
mode:
authorlimeidan <limeidan@loongson.cn>2025-04-23 11:04:15 +0800
committerabner chenc <chenguoqi@loongson.cn>2025-05-08 19:13:30 -0700
commit7bc714d60173af7e783fc69e1bc532ab4aab11ba (patch)
treedb2673250bac98e9e22ae9b5614847bedc484c90 /src/internal/bytealg
parentd65c209b4b5bc99e7e5587817b79e0850cff32cb (diff)
downloadgo-7bc714d60173af7e783fc69e1bc532ab4aab11ba.tar.xz
internal/bytealg: optimize the function indexbyte using SIMD on loong64
goos: linux goarch: loong64 pkg: bytes cpu: Loongson-3C5000 @ 2200.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ IndexByte/10 19.32n ± 0% 11.84n ± 0% -38.72% (p=0.000 n=10) IndexByte/32 49.34n ± 0% 14.11n ± 0% -71.40% (p=0.000 n=10) IndexByte/4K 5608.0n ± 0% 138.8n ± 0% -97.52% (p=0.000 n=10) IndexByte/4M 3822.8µ ± 0% 119.4µ ± 0% -96.88% (p=0.000 n=10) IndexByte/64M 61.826m ± 1% 3.812m ± 0% -93.83% (p=0.000 n=10) geomean 16.61µ 1.602µ -90.35% goos: linux goarch: loong64 pkg: bytes cpu: Loongson-3A6000-HV @ 2500.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ IndexByte/10 6.809n ± 0% 5.804n ± 0% -14.75% (p=0.000 n=10) IndexByte/32 16.015n ± 0% 6.404n ± 0% -60.01% (p=0.000 n=10) IndexByte/4K 1651.00n ± 0% 52.83n ± 0% -96.80% (p=0.000 n=10) IndexByte/4M 1680.76µ ± 0% 91.10µ ± 0% -94.58% (p=0.000 n=10) IndexByte/64M 26.878m ± 0% 2.010m ± 27% -92.52% (p=0.000 n=10) geomean 6.054µ 815.0n -86.54% Change-Id: Ib75b997249708f921c6717eba43543c6650bf376 Reviewed-on: https://go-review.googlesource.com/c/go/+/668055 Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Diffstat (limited to 'src/internal/bytealg')
-rw-r--r--src/internal/bytealg/indexbyte_loong64.s296
1 files changed, 268 insertions, 28 deletions
diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s
index c9591b3cda..3618501063 100644
--- a/src/internal/bytealg/indexbyte_loong64.s
+++ b/src/internal/bytealg/indexbyte_loong64.s
@@ -5,48 +5,288 @@
#include "go_asm.h"
#include "textflag.h"
+// input:
+// R4 = b_base
+// R5 = b_len
+// R6 = b_cap (unused)
+// R7 = byte to find
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
- // R4 = b_base
- // R5 = b_len
- // R6 = b_cap (unused)
- // R7 = byte to find
AND $0xff, R7
+ JMP indexbytebody<>(SB)
+
+// input:
+// R4 = s_base
+// R5 = s_len
+// R6 = byte to find
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
+ AND $0xff, R6, R7 // byte to find
+ JMP indexbytebody<>(SB)
+
+// input:
+// R4: b_base
+// R5: len
+// R7: byte to find
+TEXT indexbytebody<>(SB),NOSPLIT,$0
+ BEQ R5, notfound // len == 0
+
MOVV R4, R6 // store base for later
- ADDV R4, R5 // end
- ADDV $-1, R4
+ ADDV R4, R5, R8 // end
+
+ MOVV $32, R9
+ BGE R5, R9, lasx
+tail:
+ MOVV $8, R9
+ BLT R5, R9, lt_8
+generic8_loop:
+ MOVV (R4), R10
+
+ AND $0xff, R10, R11
+ BEQ R7, R11, found
+
+ BSTRPICKV $15, R10, $8, R11
+ BEQ R7, R11, byte_1th
+
+ BSTRPICKV $23, R10, $16, R11
+ BEQ R7, R11, byte_2th
+
+ BSTRPICKV $31, R10, $24, R11
+ BEQ R7, R11, byte_3th
- PCALIGN $16
-loop:
+ BSTRPICKV $39, R10, $32, R11
+ BEQ R7, R11, byte_4th
+
+ BSTRPICKV $47, R10, $40, R11
+ BEQ R7, R11, byte_5th
+
+ BSTRPICKV $55, R10, $48, R11
+ BEQ R7, R11, byte_6th
+
+ BSTRPICKV $63, R10, $56, R11
+ BEQ R7, R11, byte_7th
+
+ ADDV $8, R4
+ ADDV $-8, R5
+ BGE R5, R9, generic8_loop
+
+lt_8:
+ BEQ R4, R8, notfound
+ MOVBU (R4), R10
+ BEQ R7, R10, found
ADDV $1, R4
- BEQ R4, R5, notfound
- MOVBU (R4), R8
- BNE R7, R8, loop
+ JMP lt_8
- SUBV R6, R4 // remove base
+byte_1th:
+ ADDV $1, R4
+ SUBV R6, R4
RET
-notfound:
- MOVV $-1, R4
+byte_2th:
+ ADDV $2, R4
+ SUBV R6, R4
RET
-TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
- // R4 = s_base
- // R5 = s_len
- // R6 = byte to find
- MOVV R4, R7 // store base for later
- ADDV R4, R5 // end
- ADDV $-1, R4
+byte_3th:
+ ADDV $3, R4
+ SUBV R6, R4
+ RET
- PCALIGN $16
-loop:
- ADDV $1, R4
- BEQ R4, R5, notfound
- MOVBU (R4), R8
- BNE R6, R8, loop
+byte_4th:
+ ADDV $4, R4
+ SUBV R6, R4
+ RET
+
+byte_5th:
+ ADDV $5, R4
+ SUBV R6, R4
+ RET
+
+byte_6th:
+ ADDV $6, R4
+ SUBV R6, R4
+ RET
+
+byte_7th:
+ ADDV $7, R4
- SUBV R7, R4 // remove base
+found:
+ SUBV R6, R4
RET
notfound:
MOVV $-1, R4
RET
+
+lasx:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
+ BEQ R9, lsx
+ XVMOVQ R7, X0.B32
+
+ MOVV $128, R9
+ BLT R5, R9, lasx32_loop
+lasx128_loop:
+ XVMOVQ 0(R4), X1
+ XVMOVQ 32(R4), X2
+ XVMOVQ 64(R4), X3
+ XVMOVQ 96(R4), X4
+
+ XVSEQB X1, X0, X1
+ XVSETNEV X1, FCC0
+ BFPT lasx_found_add_0
+
+ XVSEQB X2, X0, X1
+ XVSETNEV X1, FCC0
+ BFPT lasx_found_add_32
+
+ XVSEQB X3, X0, X1
+ XVSETNEV X1, FCC0
+ BFPT lasx_found_add_64
+
+ XVSEQB X4, X0, X1
+ XVSETNEV X1, FCC0
+ BFPT lasx_found_add_96
+
+ ADDV $128, R4
+ ADDV $-128, R5
+ BGE R5, R9, lasx128_loop
+
+ BEQ R5, notfound
+
+ MOVV $32, R9
+ BLT R5, R9, tail
+lasx32_loop:
+ XVMOVQ 0(R4), X1
+
+ XVSEQB X1, X0, X1
+ XVSETNEV X1, FCC0
+ BFPT lasx_found_add_0
+
+ ADDV $32, R4
+ ADDV $-32, R5
+ BGE R5, R9, lasx32_loop
+
+ BEQ R5, notfound
+
+ JMP tail
+
+lasx_found_add_0:
+ MOVV R0, R11
+ JMP lasx_index_cal
+
+lasx_found_add_32:
+ MOVV $32, R11
+ JMP lasx_index_cal
+
+lasx_found_add_64:
+ MOVV $64, R11
+ JMP lasx_index_cal
+
+lasx_found_add_96:
+ MOVV $96, R11
+ JMP lasx_index_cal
+
+lasx_index_cal:
+ MOVV $64, R9
+ XVMOVQ X1.V[0], R10
+ CTZV R10, R10
+ BNE R10, R9, index_cal
+ ADDV $8, R11
+
+ XVMOVQ X1.V[1], R10
+ CTZV R10, R10
+ BNE R10, R9, index_cal
+ ADDV $8, R11
+
+ XVMOVQ X1.V[2], R10
+ CTZV R10, R10
+ BNE R10, R9, index_cal
+ ADDV $8, R11
+
+ XVMOVQ X1.V[3], R10
+ CTZV R10, R10
+ JMP index_cal
+
+lsx:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
+ BEQ R9, tail
+ VMOVQ R7, V0.B16
+
+ MOVV $64, R9
+ BLT R5, R9, lsx16_loop
+lsx64_loop:
+ VMOVQ 0(R4), V1
+ VMOVQ 16(R4), V2
+ VMOVQ 32(R4), V3
+ VMOVQ 48(R4), V4
+
+ VSEQB V1, V0, V1
+ VSETNEV V1, FCC0
+ BFPT lsx_found_add_0
+
+ VSEQB V2, V0, V1
+ VSETNEV V1, FCC0
+ BFPT lsx_found_add_16
+
+ VSEQB V3, V0, V1
+ VSETNEV V1, FCC0
+ BFPT lsx_found_add_32
+
+ VSEQB V4, V0, V1
+ VSETNEV V1, FCC0
+ BFPT lsx_found_add_48
+
+ ADDV $64, R4
+ ADDV $-64, R5
+ BGE R5, R9, lsx64_loop
+
+ BEQ R5, notfound
+
+ MOVV $16, R9
+ BLT R5, R9, tail
+lsx16_loop:
+ VMOVQ 0(R4), V1
+
+ VSEQB V1, V0, V1
+ VSETNEV V1, FCC0
+ BFPT lsx_found_add_0
+
+ ADDV $16, R4
+ ADDV $-16, R5
+ BGE R5, R9, lsx16_loop
+
+ BEQ R5, notfound
+
+ JMP tail
+
+lsx_found_add_0:
+ MOVV R0, R11
+ JMP lsx_index_cal
+
+lsx_found_add_16:
+ MOVV $16, R11
+ JMP lsx_index_cal
+
+lsx_found_add_32:
+ MOVV $32, R11
+ JMP lsx_index_cal
+
+lsx_found_add_48:
+ MOVV $48, R11
+ JMP lsx_index_cal
+
+lsx_index_cal:
+ MOVV $64, R9
+
+ VMOVQ V1.V[0], R10
+ CTZV R10, R10
+ BNE R10, R9, index_cal
+ ADDV $8, R11
+
+ VMOVQ V1.V[1], R10
+ CTZV R10, R10
+ JMP index_cal
+
+index_cal:
+ SRLV $3, R10
+ ADDV R11, R10
+ ADDV R10, R4
+ JMP found