diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/internal/bytealg/index_generic.go | 2 | ||||
| -rw-r--r-- | src/internal/bytealg/index_loong64.go | 30 | ||||
| -rw-r--r-- | src/internal/bytealg/index_loong64.s | 303 | ||||
| -rw-r--r-- | src/internal/bytealg/index_native.go | 2 |
4 files changed, 335 insertions, 2 deletions
diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go index a59e32938e..643bb59ab1 100644 --- a/src/internal/bytealg/index_generic.go +++ b/src/internal/bytealg/index_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 +//go:build !amd64 && !arm64 && !loong64 && !s390x && !ppc64le && !ppc64 package bytealg diff --git a/src/internal/bytealg/index_loong64.go b/src/internal/bytealg/index_loong64.go new file mode 100644 index 0000000000..ad574d66fa --- /dev/null +++ b/src/internal/bytealg/index_loong64.go @@ -0,0 +1,30 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import "internal/cpu" + +// Empirical data shows that using Index can get better +// performance when len(s) <= 16. +const MaxBruteForce = 16 + +func init() { + // If SIMD is supported, optimize the cases where the substring length is up to 64 bytes; + // otherwise, optimize the cases where the length is up to 32 bytes. + if cpu.Loong64.HasLASX || cpu.Loong64.HasLSX { + MaxLen = 64 + } else { + MaxLen = 32 + } +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_loong64.s b/src/internal/bytealg/index_loong64.s new file mode 100644 index 0000000000..1016db738d --- /dev/null +++ b/src/internal/bytealg/index_loong64.s @@ -0,0 +1,303 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56 + MOVV R7, R6 // R6 = separator pointer + MOVV R8, R7 // R7 = separator length + JMP indexbody<>(SB) + +TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40 + JMP indexbody<>(SB) + +// input: +// R4 = string +// R5 = length +// R6 = separator pointer +// R7 = separator length (2 <= len <= 64) +TEXT indexbody<>(SB),NOSPLIT,$0 + // main idea is to load 'sep' into separate register(s) + // to avoid reloading it again and again + // for subsequent substring comparisons + SUBV R7, R5, R8 + ADDV R4, R8 // R8 contains the start of last substring for comparison + ADDV $1, R4, R9 // store base for later + + MOVV $8, R5 + BGE R7, R5, len_gt_or_eq_8 +len_2_7: + AND $0x4, R7, R5 + BNE R5, len_4_7 + +len_2_3: + AND $0x1, R7, R5 + BNE R5, len_3 + +len_2: + MOVHU (R6), R10 +loop_2: + BLT R8, R4, not_found + MOVHU (R4), R11 + ADDV $1, R4 + BNE R10, R11, loop_2 + JMP found + +len_3: + MOVHU (R6), R10 + MOVBU 2(R6), R11 +loop_3: + BLT R8, R4, not_found + MOVHU (R4), R12 + ADDV $1, R4 + BNE R10, R12, loop_3 + MOVBU 1(R4), R13 + BNE R11, R13, loop_3 + JMP found + +len_4_7: + AND $0x2, R7, R5 + BNE R5, len_6_7 + AND $0x1, R7, R5 + BNE R5, len_5 +len_4: + MOVWU (R6), R10 +loop_4: + BLT R8, R4, not_found + MOVWU (R4), R11 + ADDV $1, R4 + BNE R10, R11, loop_4 + JMP found + +len_5: + MOVWU (R6), R10 + MOVBU 4(R6), R11 +loop_5: + BLT R8, R4, not_found + MOVWU (R4), R12 + ADDV $1, R4 + BNE R10, R12, loop_5 + MOVBU 3(R4), R13 + BNE R11, R13, loop_5 + JMP found + +len_6_7: + AND $0x1, R7, R5 + BNE R5, len_7 +len_6: + MOVWU (R6), R10 + MOVHU 4(R6), R11 +loop_6: + BLT R8, R4, not_found + MOVWU (R4), R12 + ADDV $1, R4 + BNE R10, R12, loop_6 + MOVHU 3(R4), R13 + BNE R11, R13, loop_6 + JMP found + +len_7: + MOVWU (R6), R10 + MOVWU 3(R6), R11 +loop_7: + BLT R8, R4, not_found + MOVWU (R4), R12 + ADDV $1, R4 + BNE R10, R12, loop_7 + MOVWU 2(R4), R13 + BNE R11, R13, loop_7 + JMP found + +len_gt_or_eq_8: + 
BEQ R5, R7, len_8 + MOVV $17, R5 + BGE R7, R5, len_gt_or_eq_17 + JMP len_9_16 +len_8: + MOVV (R6), R10 +loop_8: + BLT R8, R4, not_found + MOVV (R4), R11 + ADDV $1, R4 + BNE R10, R11, loop_8 + JMP found + +len_9_16: + MOVV (R6), R10 + SUBV $8, R7 + MOVV (R6)(R7), R11 + SUBV $1, R7 +loop_9_16: + BLT R8, R4, not_found + MOVV (R4), R12 + ADDV $1, R4 + BNE R10, R12, loop_9_16 + MOVV (R4)(R7), R13 + BNE R11, R13, loop_9_16 + JMP found + +len_gt_or_eq_17: + MOVV $25, R5 + BGE R7, R5, len_gt_or_eq_25 +len_17_24: + MOVV 0(R6), R10 + MOVV 8(R6), R11 + SUBV $8, R7 + MOVV (R6)(R7), R12 + SUBV $1, R7 +loop_17_24: + BLT R8, R4, not_found + MOVV (R4), R13 + ADDV $1, R4 + BNE R10, R13, loop_17_24 + MOVV 7(R4), R14 + BNE R11, R14, loop_17_24 + MOVV (R4)(R7), R15 + BNE R12, R15, loop_17_24 + JMP found + +len_gt_or_eq_25: + MOVV $33, R5 + BGE R7, R5, len_gt_or_eq_33 + MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R10 + BNE R10, lsx_len_25_32 +len_25_32: + MOVV 0(R6), R10 + MOVV 8(R6), R11 + MOVV 16(R6), R12 + SUBV $8, R7 + MOVV (R6)(R7), R13 + SUBV $1, R7 +loop_25_32: + BLT R8, R4, not_found + MOVV (R4), R14 + ADDV $1, R4 + BNE R10, R14, loop_25_32 + MOVV 7(R4), R15 + BNE R11, R15, loop_25_32 + MOVV 15(R4), R16 + BNE R12, R16, loop_25_32 + MOVV (R4)(R7), R17 + BNE R13, R17, loop_25_32 + JMP found + + // On loong64, LSX is included if LASX is supported. 
+lasx_len_25_32: +lsx_len_25_32: + VMOVQ 0(R6), V0 + SUBV $16, R7 + VMOVQ (R6)(R7), V1 + SUBV $1, R7 +lsx_loop_25_32: + BLT R8, R4, not_found + VMOVQ (R4), V2 + ADDV $1, R4 + VSEQV V0, V2, V2 + VSETANYEQV V2, FCC0 + BFPT FCC0, lsx_loop_25_32 + + VMOVQ (R4)(R7), V3 + VSEQV V1, V3, V3 + VSETANYEQV V3, FCC1 + BFPT FCC1, lsx_loop_25_32 + JMP found + +len_gt_or_eq_33: + MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R10 + MOVV $49, R5 + BGE R7, R5, len_gt_or_eq_49 +len_33_48: + BNE R10, lasx_len_33_48 + JMP lsx_len_33_48 + +len_gt_or_eq_49: +len_49_64: + BNE R10, lasx_len_49_64 + JMP lsx_len_49_64 + +lsx_len_33_48: + VMOVQ 0(R6), V0 + VMOVQ 16(R6), V1 + SUBV $16, R7 + VMOVQ (R6)(R7), V2 + SUBV $1, R7 +lsx_loop_33_48: + BLT R8, R4, not_found + VMOVQ 0(R4), V3 + ADDV $1, R4 + VSEQV V0, V3, V3 + VSETANYEQV V3, FCC0 + BFPT FCC0, lsx_loop_33_48 + + VMOVQ 15(R4), V4 + VSEQV V1, V4, V4 + VSETANYEQV V4, FCC1 + BFPT FCC1, lsx_loop_33_48 + + VMOVQ (R4)(R7), V5 + VSEQV V2, V5, V5 + VSETANYEQV V5, FCC2 + BFPT FCC2, lsx_loop_33_48 + JMP found + +lsx_len_49_64: + VMOVQ 0(R6), V0 + VMOVQ 16(R6), V1 + VMOVQ 32(R6), V2 + SUBV $16, R7 + VMOVQ (R6)(R7), V3 + SUBV $1, R7 +lsx_loop_49_64: + BLT R8, R4, not_found + VMOVQ 0(R4), V4 + ADDV $1, R4 + VSEQV V0, V4, V4 + VSETANYEQV V4, FCC0 + BFPT FCC0, lsx_loop_49_64 + + VMOVQ 15(R4), V5 + VSEQV V1, V5, V5 + VSETANYEQV V5, FCC1 + BFPT FCC1, lsx_loop_49_64 + + VMOVQ 31(R4), V6 + VSEQV V2, V6, V6 + VSETANYEQV V6, FCC2 + BFPT FCC2, lsx_loop_49_64 + + VMOVQ (R4)(R7), V7 + VSEQV V3, V7, V7 + VSETANYEQV V7, FCC3 + BFPT FCC3, lsx_loop_49_64 + JMP found + +lasx_len_33_48: +lasx_len_49_64: +lasx_len_33_64: + XVMOVQ (R6), X0 + SUBV $32, R7 + XVMOVQ (R6)(R7), X1 + SUBV $1, R7 +lasx_loop_33_64: + BLT R8, R4, not_found + XVMOVQ (R4), X2 + ADDV $1, R4 + XVSEQV X0, X2, X3 + XVSETANYEQV X3, FCC0 + BFPT FCC0, lasx_loop_33_64 + + XVMOVQ (R4)(R7), X4 + XVSEQV X1, X4, X5 + XVSETANYEQV X5, FCC1 + BFPT FCC1, lasx_loop_33_64 + JMP found + +found: + SUBV R9, 
R4 + RET + +not_found: + MOVV $-1, R4 + RET diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go index 59c93f9d12..f917c7a92a 100644 --- a/src/internal/bytealg/index_native.go +++ b/src/internal/bytealg/index_native.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build amd64 || arm64 || s390x || ppc64le || ppc64 +//go:build amd64 || arm64 || loong64 || s390x || ppc64le || ppc64 package bytealg |
