diff options
| author | Keith Randall <khr@golang.org> | 2018-03-04 09:47:47 -0800 |
|---|---|---|
| committer | Keith Randall <khr@golang.org> | 2018-03-04 19:49:44 +0000 |
| commit | ee58eccc565c0871d3f16fd702fd8649a3fb61ea (patch) | |
| tree | 837073b78954dc987cf575ff478faf6fdb8afb0e /src/internal/bytealg | |
| parent | f6332bb84ad87e958290ae23b29a2b13a41ee2a2 (diff) | |
| download | go-ee58eccc565c0871d3f16fd702fd8649a3fb61ea.tar.xz | |
internal/bytealg: move short string Index implementations into bytealg
Also move the arm64 CountByte implementation while we're here.
Fixes #19792
Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e
Reviewed-on: https://go-review.googlesource.com/98518
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/internal/bytealg')
| -rw-r--r-- | src/internal/bytealg/bytealg.go | 22 | ||||
| -rw-r--r-- | src/internal/bytealg/count_arm64.s | 90 | ||||
| -rw-r--r-- | src/internal/bytealg/count_generic.go | 2 | ||||
| -rw-r--r-- | src/internal/bytealg/count_native.go | 2 | ||||
| -rw-r--r-- | src/internal/bytealg/equal_native.go | 16 | ||||
| -rw-r--r-- | src/internal/bytealg/index_amd64.go | 26 | ||||
| -rw-r--r-- | src/internal/bytealg/index_amd64.s | 274 | ||||
| -rw-r--r-- | src/internal/bytealg/index_arm64.go | 23 | ||||
| -rw-r--r-- | src/internal/bytealg/index_arm64.s | 149 | ||||
| -rw-r--r-- | src/internal/bytealg/index_generic.go | 29 | ||||
| -rw-r--r-- | src/internal/bytealg/index_native.go | 19 | ||||
| -rw-r--r-- | src/internal/bytealg/index_s390x.go | 31 | ||||
| -rw-r--r-- | src/internal/bytealg/index_s390x.s | 216 |
13 files changed, 881 insertions, 18 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go new file mode 100644 index 0000000000..1ab7c30f4e --- /dev/null +++ b/src/internal/bytealg/bytealg.go @@ -0,0 +1,22 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import ( + "internal/cpu" + "unsafe" +) + +// Offsets into internal/cpu records for use in assembly. +const ( + x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2) + x86_HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) + x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) + s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX) +) + +// MaxLen is the maximum length of the string to be searched for (argument b) in Index. +var MaxLen int diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s new file mode 100644 index 0000000000..8cd703d943 --- /dev/null +++ b/src/internal/bytealg/count_arm64.s @@ -0,0 +1,90 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count(SB),NOSPLIT,$0-40 + MOVD b_base+0(FP), R0 + MOVD b_len+8(FP), R2 + MOVBU c+24(FP), R1 + MOVD $ret+32(FP), R8 + B countbytebody<>(SB) + +TEXT ·CountString(SB),NOSPLIT,$0-32 + MOVD s_base+0(FP), R0 + MOVD s_len+8(FP), R2 + MOVBU c+16(FP), R1 + MOVD $ret+24(FP), R8 + B countbytebody<>(SB) + +// input: +// R0: data +// R2: data len +// R1: byte to find +// R8: address to put result +TEXT countbytebody<>(SB),NOSPLIT,$0 + // R11 = count of byte to search + MOVD $0, R11 + // short path to handle 0-byte case + CBZ R2, done + CMP $0x20, R2 + // jump directly to tail if length < 32 + BLO tail + ANDS $0x1f, R0, R9 + BEQ chunk + // Work with not 32-byte aligned head + BIC $0x1f, R0, R3 + ADD $0x20, R3 +head_loop: + MOVBU.P 1(R0), R5 + CMP R5, R1 + CINC EQ, R11, R11 + SUB $1, R2, R2 + CMP R0, R3 + BNE head_loop + // Work with 32-byte aligned chunks +chunk: + BIC $0x1f, R2, R9 + // The first chunk can also be the last + CBZ R9, tail + // R3 = end of 32-byte chunks + ADD R0, R9, R3 + MOVD $1, R5 + VMOV R5, V5.B16 + // R2 = length of tail + SUB R9, R2, R2 + // Duplicate R1 (byte to search) to 16 1-byte elements of V0 + VMOV R1, V0.B16 + // Clear the low 64-bit element of V7 and V8 + VEOR V7.B8, V7.B8, V7.B8 + VEOR V8.B8, V8.B8, V8.B8 + // Count the target byte in 32-byte chunk +chunk_loop: + VLD1.P (R0), [V1.B16, V2.B16] + CMP R0, R3 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + // Clear the higher 7 bits + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + // Count lanes match the requested byte + VADDP V4.B16, V3.B16, V6.B16 // 32B->16B + VUADDLV V6.B16, V7 + // Accumulate the count in low 64-bit element of V8 when inside the loop + VADD V7, V8 + BNE chunk_loop + VMOV V8.D[0], R6 + ADD R6, R11, R11 + CBZ R2, done +tail: + // Work with tail shorter than 32 bytes + MOVBU.P 1(R0), R5 + SUB $1, R2, R2 + CMP R5, R1 + CINC EQ, R11, R11 + CBNZ R2, tail +done: + MOVD R11, (R8) + RET diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go index acc5a79827..a763b3bc61 100644 --- a/src/internal/bytealg/count_generic.go +++ b/src/internal/bytealg/count_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64 +// +build !amd64,!arm64 package bytealg diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go index e6d3b066aa..a62c4cb5c0 100644 --- a/src/internal/bytealg/count_native.go +++ b/src/internal/bytealg/count_native.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 +// +build amd64 arm64 package bytealg diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go index 55d184a58b..b5c453086c 100644 --- a/src/internal/bytealg/equal_native.go +++ b/src/internal/bytealg/equal_native.go @@ -4,24 +4,8 @@ package bytealg -import ( - "internal/cpu" - "unsafe" -) - // Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly. -// Because equal_native.go is unconditional, it's a good place to compute asm constants. -// TODO: find a better way to do this? - -// Offsets into internal/cpu records for use in assembly. -const ( - x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2) - x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) - x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) - s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX) -) - //go:noescape func Equal(a, b []byte) bool diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go new file mode 100644 index 0000000000..c7a1941e5f --- /dev/null +++ b/src/internal/bytealg/index_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import "internal/cpu" + +const MaxBruteForce = 64 + +func init() { + if cpu.X86.HasAVX2 { + MaxLen = 63 + } else { + MaxLen = 31 + } +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s new file mode 100644 index 0000000000..f7297c0cab --- /dev/null +++ b/src/internal/bytealg/index_amd64.s @@ -0,0 +1,274 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+24(FP), BP + MOVQ b_len+32(FP), AX + MOVQ DI, R10 + LEAQ ret+48(FP), R11 + JMP indexbody<>(SB) + +TEXT ·IndexString(SB),NOSPLIT,$0-40 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+16(FP), BP + MOVQ b_len+24(FP), AX + MOVQ DI, R10 + LEAQ ret+32(FP), R11 + JMP indexbody<>(SB) + +// AX: length of string, that we are searching for +// DX: length of string, in which we are searching +// DI: pointer to string, in which we are searching +// BP: pointer to string, that we are searching for +// R11: address, where to put return value +// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them +TEXT indexbody<>(SB),NOSPLIT,$0 + CMPQ AX, DX + JA fail + CMPQ DX, $16 + JAE sse42 +no_sse42: + CMPQ AX, $2 + JA _3_or_more + MOVW (BP), BP + LEAQ -1(DI)(DX*1), DX +loop2: + MOVW (DI), SI + CMPW SI,BP + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop2 + JMP fail +_3_or_more: + CMPQ AX, $3 + JA _4_or_more + MOVW 1(BP), BX + MOVW (BP), BP + LEAQ -2(DI)(DX*1), DX +loop3: + MOVW (DI), SI + CMPW SI,BP + JZ partial_success3 + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +partial_success3: + MOVW 1(DI), SI + CMPW SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +_4_or_more: + CMPQ AX, $4 + JA _5_or_more + MOVL (BP), BP + LEAQ -3(DI)(DX*1), DX +loop4: + MOVL (DI), SI + CMPL SI,BP + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop4 + JMP fail +_5_or_more: + CMPQ AX, $7 + JA _8_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVL -4(BP)(AX*1), BX + MOVL (BP), BP +loop5to7: + MOVL (DI), SI + CMPL SI,BP + JZ partial_success5to7 + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +partial_success5to7: + MOVL -4(AX)(DI*1), SI + CMPL SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +_8_or_more: + CMPQ AX, $8 + JA _9_or_more + MOVQ (BP), BP + LEAQ -7(DI)(DX*1), DX +loop8: + MOVQ (DI), SI + CMPQ SI,BP + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop8 + JMP fail +_9_or_more: + CMPQ AX, $15 + JA _16_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVQ -8(BP)(AX*1), BX + MOVQ (BP), BP +loop9to15: + MOVQ (DI), SI + CMPQ SI,BP + JZ partial_success9to15 + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +partial_success9to15: + MOVQ -8(AX)(DI*1), SI + CMPQ SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +_16_or_more: + CMPQ AX, $16 + JA _17_or_more + MOVOU (BP), X1 + LEAQ -15(DI)(DX*1), DX +loop16: + MOVOU (DI), X2 + PCMPEQB X1, X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop16 + JMP fail +_17_or_more: + CMPQ AX, $31 + JA _32_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVOU -16(BP)(AX*1), X0 + MOVOU (BP), X1 +loop17to31: + MOVOU (DI), X2 + PCMPEQB X1,X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE partial_success17to31 + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +partial_success17to31: + MOVOU -16(AX)(DI*1), X3 + PCMPEQB X0, X3 + PMOVMSKB X3, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 +// So no need to check cpuid +_32_or_more: + CMPQ AX, $32 + JA _33_to_63 + VMOVDQU (BP), Y1 + LEAQ -31(DI)(DX*1), DX +loop32: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop32 + JMP fail_avx2 +_33_to_63: + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + VMOVDQU -32(BP)(AX*1), Y0 + VMOVDQU (BP), Y1 +loop33to63: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE partial_success33to63 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 + JMP fail_avx2 +partial_success33to63: + VMOVDQU -32(AX)(DI*1), Y3 + VPCMPEQB Y0, Y3, Y4 + VPMOVMSKB Y4, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 +fail_avx2: + VZEROUPPER +fail: + MOVQ $-1, (R11) + RET +success_avx2: + VZEROUPPER + JMP success +sse42: + CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1 + JNE no_sse42 + CMPQ AX, $12 + // PCMPESTRI is slower than normal compare, + // so using it makes sense only if we advance 4+ bytes per compare + // This value was determined experimentally and is the ~same + // on Nehalem (first with SSE42) and Haswell. + JAE _9_or_more + LEAQ 16(BP), SI + TESTW $0xff0, SI + JEQ no_sse42 + MOVOU (BP), X1 + LEAQ -15(DI)(DX*1), SI + MOVQ $16, R9 + SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 +loop_sse42: + // 0x0c means: unsigned byte compare (bits 0,1 are 00) + // for equality (bits 2,3 are 11) + // result is not masked or inverted (bits 4,5 are 00) + // and corresponds to first matching byte (bit 6 is 0) + PCMPESTRI $0x0c, (DI), X1 + // CX == 16 means no match, + // CX > R9 means partial match at the end of the string, + // otherwise sep is at offset CX from X1 start + CMPQ CX, R9 + JBE sse42_success + ADDQ R9, DI + CMPQ DI, SI + JB loop_sse42 + PCMPESTRI $0x0c, -1(SI), X1 + CMPQ CX, R9 + JA fail + LEAQ -1(SI), DI +sse42_success: + ADDQ CX, DI +success: + SUBQ R10, DI + MOVQ DI, (R11) + RET diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go new file mode 100644 index 0000000000..0f87ae106c --- /dev/null +++ b/src/internal/bytealg/index_arm64.go @@ -0,0 +1,23 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +// Empirical data shows that using IndexShortStr can get better +// performance when len(s) <= 16. +const MaxBruteForce = 16 + +func init() { + // 8 bytes can be completely loaded into 1 register. + MaxLen = 8 +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to IndexShortStr. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 16 characters, plus a few slop to start. + return 4 + n>>4 +} diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s new file mode 100644 index 0000000000..8cffcd10b5 --- /dev/null +++ b/src/internal/bytealg/index_arm64.s @@ -0,0 +1,149 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+24(FP), R2 + MOVD b_len+32(FP), R3 + MOVD $ret+48(FP), R9 + B indexbody<>(SB) + +TEXT ·IndexString(SB),NOSPLIT,$0-40 + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+16(FP), R2 + MOVD b_len+24(FP), R3 + MOVD $ret+32(FP), R9 + B indexbody<>(SB) + +// input: +// R0: haystack +// R1: length of haystack +// R2: needle +// R3: length of needle (2 <= len <= 8) +// R9: address to put result +TEXT indexbody<>(SB),NOSPLIT,$0-56 + // main idea is to load 'sep' into separate register(s) + // to avoid repeatedly re-load it again and again + // for sebsequent substring comparisons + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+24(FP), R2 + MOVD b_len+32(FP), R3 + SUB R3, R1, R4 + // R4 contains the start of last substring for comparsion + ADD R0, R4, R4 + ADD $1, R0, R8 + TBZ $3, R3, len_2_7 +len_8: + // R5 contains 8-byte sep + MOVD (R2), R5 +loop_8: + // R6 contains substring for comparison + MOVD.P 1(R0), R6 + CMP R5, R6 + BEQ found + CMP R4, R0 + BLS loop_8 + JMP not_found +len_2_7: + TBZ $2, R3, len_2_3 + TBZ $1, R3, len_4_5 + TBZ $0, R3, len_6 +len_7: + // R5 and R6 contain 7-byte sep + MOVWU (R2), R5 + // 1-byte overlap with R5 + MOVWU 3(R2), R6 +loop_7: + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE not_equal_7 + MOVWU 2(R0), R3 + CMP R6, R3 + BEQ found +not_equal_7: + CMP R4, R0 + BLS loop_7 + JMP not_found +len_6: + // R5 and R6 contain 6-byte sep + MOVWU (R2), R5 + MOVHU 4(R2), R6 +loop_6: + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE not_equal_6 + MOVHU 3(R0), R3 + CMP R6, R3 + BEQ found +not_equal_6: + CMP R4, R0 + BLS loop_6 + JMP not_found +len_4_5: + TBZ $0, R3, len_4 +len_5: + // R5 and R7 contain 5-byte sep + MOVWU (R2), R5 + MOVBU 4(R2), R7 +loop_5: + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE not_equal_5 + MOVBU 3(R0), R3 + CMP R7, R3 + BEQ found +not_equal_5: + CMP R4, R0 + BLS loop_5 + JMP not_found +len_4: + // R5 contains 4-byte sep + MOVWU (R2), R5 +loop_4: + MOVWU.P 1(R0), R6 + CMP R5, R6 + BEQ found + CMP R4, R0 + BLS loop_4 + JMP not_found +len_2_3: + TBZ $0, R3, len_2 +len_3: + // R6 and R7 contain 3-byte sep + MOVHU (R2), R6 + MOVBU 2(R2), R7 +loop_3: + MOVHU.P 1(R0), R3 + CMP R6, R3 + BNE not_equal_3 + MOVBU 1(R0), R3 + CMP R7, R3 + BEQ found +not_equal_3: + CMP R4, R0 + BLS loop_3 + JMP not_found +len_2: + // R5 contains 2-byte sep + MOVHU (R2), R5 +loop_2: + MOVHU.P 1(R0), R6 + CMP R5, R6 + BEQ found + CMP R4, R0 + BLS loop_2 +not_found: + MOVD $-1, R0 + MOVD R0, (R9) + RET +found: + SUB R8, R0, R0 + MOVD R0, (R9) + RET diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go new file mode 100644 index 0000000000..98e859f925 --- /dev/null +++ b/src/internal/bytealg/index_generic.go @@ -0,0 +1,29 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64,!arm64,!s390x + +package bytealg + +const MaxBruteForce = 0 + +// Index returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func Index(a, b []byte) int { + panic("unimplemented") +} + +// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func IndexString(a, b string) int { + panic("unimplemented") +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + panic("unimplemented") +} diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go new file mode 100644 index 0000000000..fde4214245 --- /dev/null +++ b/src/internal/bytealg/index_native.go @@ -0,0 +1,19 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 arm64 s390x + +package bytealg + +//go:noescape + +// Index returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func Index(a, b []byte) int + +//go:noescape + +// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func IndexString(a, b string) int diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go new file mode 100644 index 0000000000..9340cf1135 --- /dev/null +++ b/src/internal/bytealg/index_s390x.go @@ -0,0 +1,31 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import "internal/cpu" + +const MaxBruteForce = 64 + +func init() { + // Note: we're kind of lucky that this flag is available at this point. + // The runtime sets HasVX when processing auxv records, and that happens + // to happen *before* running the init functions of packages that + // the runtime depends on. + // TODO: it would really be nicer for internal/cpu to figure out this + // flag by itself. Then we wouldn't need to depend on quirks of + // early startup initialization order. + if cpu.S390X.HasVX { + MaxLen = 64 + } +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s new file mode 100644 index 0000000000..491d5bcfd2 --- /dev/null +++ b/src/internal/bytealg/index_s390x.s @@ -0,0 +1,216 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// Caller must confirm availability of vx facility before calling. +TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56 + LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s) + LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep) + MOVD $ret+48(FP), R5 + BR indexbody<>(SB) + +// Caller must confirm availability of vx facility before calling. +TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40 + LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s) + LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep) + MOVD $ret+32(FP), R5 + BR indexbody<>(SB) + +// s: string we are searching +// sep: string to search for +// R1=&s[0], R2=len(s) +// R3=&sep[0], R4=len(sep) +// R5=&ret (int) +// Caller must confirm availability of vx facility before calling. +TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0 + CMPBGT R4, R2, notfound + ADD R1, R2 + SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index) + CMPBEQ R4, $0, notfound + SUB $1, R4 // R4=len(sep)-1 for use as VLL index + VLL R4, (R3), V0 // contains first 16 bytes of sep + MOVD R1, R7 +index2plus: + CMPBNE R4, $1, index3plus + MOVD $15(R7), R9 + CMPBGE R9, R2, index2to16 + VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... + VONE V16 + VREPH $0, V0, V1 + CMPBGE R9, R2, index2to16 +index2loop: + VL 0(R7), V2 // 16 bytes, even indices + VL 1(R7), V4 // 16 bytes, odd indices + VCEQH V1, V2, V5 // compare even indices + VCEQH V1, V4, V6 // compare odd indices + VSEL V5, V6, V31, V7 // merge even and odd indices + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index3plus: + CMPBNE R4, $2, index4plus + ADD $15, R7, R9 + CMPBGE R9, R2, index2to16 + MOVD $1, R0 + VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... + VONE V16 + VREPH $0, V0, V1 + VREPB $2, V0, V8 +index3loop: + VL (R7), V2 // load 16-bytes into V2 + VLL R0, 16(R7), V3 // load 2-bytes into V3 + VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1 + VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2 + VCEQH V1, V2, V5 // compare 2-byte even indices + VCEQH V1, V4, V6 // compare 2-byte odd indices + VCEQB V8, V9, V10 // compare last bytes + VSEL V5, V6, V31, V7 // merge even and odd indices + VN V7, V10, V7 // AND indices with last byte + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index4plus: + CMPBNE R4, $3, index5plus + ADD $15, R7, R9 + CMPBGE R9, R2, index2to16 + MOVD $2, R0 + VGBM $0x8888, V29 // 0xff000000ff000000... + VGBM $0x2222, V30 // 0x0000ff000000ff00... + VGBM $0xcccc, V31 // 0xffff0000ffff0000... + VONE V16 + VREPF $0, V0, V1 +index4loop: + VL (R7), V2 // load 16-bytes into V2 + VLL R0, 16(R7), V3 // load 3-bytes into V3 + VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1 + VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1 + VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1 + VCEQF V1, V2, V5 // compare index 0, 4, ... + VCEQF V1, V4, V6 // compare index 1, 5, ... + VCEQF V1, V9, V11 // compare index 2, 6, ... + VCEQF V1, V10, V12 // compare index 3, 7, ... + VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ... + VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ... + VSEL V13, V14, V31, V7 // final merge + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index5plus: + CMPBGT R4, $15, index17plus +index2to16: + CMPBGT R7, R2, notfound + MOVD $1(R7), R8 + CMPBGT R8, R2, index2to16tail +index2to16loop: + // unrolled 2x + VLL R4, (R7), V1 + VLL R4, 1(R7), V2 + VCEQGS V0, V1, V3 + BEQ found + MOVD $1(R7), R7 + VCEQGS V0, V2, V4 + BEQ found + MOVD $1(R7), R7 + CMPBLT R7, R2, index2to16loop + CMPBGT R7, R2, notfound +index2to16tail: + VLL R4, (R7), V1 + VCEQGS V0, V1, V2 + BEQ found + BR notfound + +index17plus: + CMPBGT R4, $31, index33plus + SUB $16, R4, R0 + VLL R0, 16(R3), V1 + VONE V7 +index17to32loop: + VL (R7), V2 + VLL R0, 16(R7), V3 + VCEQG V0, V2, V4 + VCEQG V1, V3, V5 + VN V4, V5, V6 + VCEQGS V6, V7, V8 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index17to32loop + BR notfound + +index33plus: + CMPBGT R4, $47, index49plus + SUB $32, R4, R0 + VL 16(R3), V1 + VLL R0, 32(R3), V2 + VONE V11 +index33to48loop: + VL (R7), V3 + VL 16(R7), V4 + VLL R0, 32(R7), V5 + VCEQG V0, V3, V6 + VCEQG V1, V4, V7 + VCEQG V2, V5, V8 + VN V6, V7, V9 + VN V8, V9, V10 + VCEQGS V10, V11, V12 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index33to48loop + BR notfound + +index49plus: + CMPBGT R4, $63, index65plus + SUB $48, R4, R0 + VL 16(R3), V1 + VL 32(R3), V2 + VLL R0, 48(R3), V3 + VONE V15 +index49to64loop: + VL (R7), V4 + VL 16(R7), V5 + VL 32(R7), V6 + VLL R0, 48(R7), V7 + VCEQG V0, V4, V8 + VCEQG V1, V5, V9 + VCEQG V2, V6, V10 + VCEQG V3, V7, V11 + VN V8, V9, V12 + VN V10, V11, V13 + VN V12, V13, V14 + VCEQGS V14, V15, V16 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index49to64loop +notfound: + MOVD $-1, (R5) + RET + +index65plus: + // not implemented + MOVD $0, (R0) + RET + +foundV17: // index is in doubleword V17[0] + VLGVG $0, V17, R8 + ADD R8, R7 +found: + SUB R1, R7 + MOVD R7, (R5) + RET |
