internal/bytealg: move short string Index implementations into bytealg

Also move the arm64 CountByte implementation while we're here. Fixes #19792 Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e Reviewed-on: https://go-review.googlesource.com/98518 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
author: Keith Randall <khr@golang.org> 2018-03-04 09:47:47 -0800
committer: Keith Randall <khr@golang.org> 2018-03-04 19:49:44 +0000
commit: ee58eccc565c0871d3f16fd702fd8649a3fb61ea (patch)
tree: 837073b78954dc987cf575ff478faf6fdb8afb0e /src/internal/bytealg
parent: f6332bb84ad87e958290ae23b29a2b13a41ee2a2 (diff)
download: go-ee58eccc565c0871d3f16fd702fd8649a3fb61ea.tar.xz
13 files changed, 881 insertions, 18 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
new file mode 100644
index 0000000000..1ab7c30f4e
--- /dev/null
+++ b/src/internal/bytealg/bytealg.go
@@ -0,0 +1,22 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+	"internal/cpu"
+	"unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+	x86_HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
+	x86_HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
+	x86_HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
+	x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+	s390x_HasVX   = unsafe.Offsetof(cpu.S390X.HasVX)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+var MaxLen int
diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s
new file mode 100644
index 0000000000..8cd703d943
--- /dev/null
+++ b/src/internal/bytealg/count_arm64.s
@@ -0,0 +1,90 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	countbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R2: data len
+//   R1: byte to find
+//   R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	// R11 = count of byte to search
+	MOVD	$0, R11
+	// short path to handle 0-byte case
+	CBZ	R2, done
+	CMP	$0x20, R2
+	// jump directly to tail if length < 32
+	BLO	tail
+	ANDS	$0x1f, R0, R9
+	BEQ	chunk
+	// Work with not 32-byte aligned head
+	BIC	$0x1f, R0, R3
+	ADD	$0x20, R3
+head_loop:
+	MOVBU.P	1(R0), R5
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	SUB	$1, R2, R2
+	CMP	R0, R3
+	BNE	head_loop
+	// Work with 32-byte aligned chunks
+chunk:
+	BIC	$0x1f, R2, R9
+	// The first chunk can also be the last
+	CBZ	R9, tail
+	// R3 = end of 32-byte chunks
+	ADD	R0, R9, R3
+	MOVD	$1, R5
+	VMOV	R5, V5.B16
+	// R2 = length of tail
+	SUB	R9, R2, R2
+	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
+	VMOV	R1, V0.B16
+	// Clear the low 64-bit element of V7 and V8
+	VEOR	V7.B8, V7.B8, V7.B8
+	VEOR	V8.B8, V8.B8, V8.B8
+	// Count the target byte in 32-byte chunk
+chunk_loop:
+	VLD1.P	(R0), [V1.B16, V2.B16]
+	CMP	R0, R3
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// Clear the higher 7 bits
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	// Count lanes match the requested byte
+	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
+	VUADDLV	V6.B16, V7
+	// Accumulate the count in low 64-bit element of V8 when inside the loop
+	VADD	V7, V8
+	BNE	chunk_loop
+	VMOV	V8.D[0], R6
+	ADD	R6, R11, R11
+	CBZ	R2, done
+tail:
+	// Work with tail shorter than 32 bytes
+	MOVBU.P	1(R0), R5
+	SUB	$1, R2, R2
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	CBNZ	R2, tail
+done:
+	MOVD	R11, (R8)
+	RET
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
index acc5a79827..a763b3bc61 100644
--- a/src/internal/bytealg/count_generic.go
+++ b/src/internal/bytealg/count_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64
+// +build !amd64,!arm64
 
 package bytealg
 
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
index e6d3b066aa..a62c4cb5c0 100644
--- a/src/internal/bytealg/count_native.go
+++ b/src/internal/bytealg/count_native.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64
+// +build amd64 arm64
 
 package bytealg
 
diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go
index 55d184a58b..b5c453086c 100644
--- a/src/internal/bytealg/equal_native.go
+++ b/src/internal/bytealg/equal_native.go
@@ -4,24 +4,8 @@
 
 package bytealg
 
-import (
-	"internal/cpu"
-	"unsafe"
-)
-
 // Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.
 
-// Because equal_native.go is unconditional, it's a good place to compute asm constants.
-// TODO: find a better way to do this?
-
-// Offsets into internal/cpu records for use in assembly.
-const (
-	x86_HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
-	x86_HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
-	x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
-	s390x_HasVX   = unsafe.Offsetof(cpu.S390X.HasVX)
-)
-
 //go:noescape
 func Equal(a, b []byte) bool
 
diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go
new file mode 100644
index 0000000000..c7a1941e5f
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	if cpu.X86.HasAVX2 {
+		MaxLen = 63
+	} else {
+		MaxLen = 31
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
new file mode 100644
index 0000000000..f7297c0cab
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.s
@@ -0,0 +1,274 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+24(FP), BP
+	MOVQ b_len+32(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+48(FP), R11
+	JMP  indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+16(FP), BP
+	MOVQ b_len+24(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+32(FP), R11
+	JMP  indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// BP: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+	CMPQ AX, DX
+	JA fail
+	CMPQ DX, $16
+	JAE sse42
+no_sse42:
+	CMPQ AX, $2
+	JA   _3_or_more
+	MOVW (BP), BP
+	LEAQ -1(DI)(DX*1), DX
+loop2:
+	MOVW (DI), SI
+	CMPW SI,BP
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop2
+	JMP fail
+_3_or_more:
+	CMPQ AX, $3
+	JA   _4_or_more
+	MOVW 1(BP), BX
+	MOVW (BP), BP
+	LEAQ -2(DI)(DX*1), DX
+loop3:
+	MOVW (DI), SI
+	CMPW SI,BP
+	JZ   partial_success3
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+partial_success3:
+	MOVW 1(DI), SI
+	CMPW SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+_4_or_more:
+	CMPQ AX, $4
+	JA   _5_or_more
+	MOVL (BP), BP
+	LEAQ -3(DI)(DX*1), DX
+loop4:
+	MOVL (DI), SI
+	CMPL SI,BP
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop4
+	JMP fail
+_5_or_more:
+	CMPQ AX, $7
+	JA   _8_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVL -4(BP)(AX*1), BX
+	MOVL (BP), BP
+loop5to7:
+	MOVL (DI), SI
+	CMPL SI,BP
+	JZ   partial_success5to7
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+partial_success5to7:
+	MOVL -4(AX)(DI*1), SI
+	CMPL SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+_8_or_more:
+	CMPQ AX, $8
+	JA   _9_or_more
+	MOVQ (BP), BP
+	LEAQ -7(DI)(DX*1), DX
+loop8:
+	MOVQ (DI), SI
+	CMPQ SI,BP
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop8
+	JMP fail
+_9_or_more:
+	CMPQ AX, $15
+	JA   _16_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVQ -8(BP)(AX*1), BX
+	MOVQ (BP), BP
+loop9to15:
+	MOVQ (DI), SI
+	CMPQ SI,BP
+	JZ   partial_success9to15
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+partial_success9to15:
+	MOVQ -8(AX)(DI*1), SI
+	CMPQ SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+_16_or_more:
+	CMPQ AX, $16
+	JA   _17_or_more
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), DX
+loop16:
+	MOVOU (DI), X2
+	PCMPEQB X1, X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop16
+	JMP fail
+_17_or_more:
+	CMPQ AX, $31
+	JA   _32_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVOU -16(BP)(AX*1), X0
+	MOVOU (BP), X1
+loop17to31:
+	MOVOU (DI), X2
+	PCMPEQB X1,X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   partial_success17to31
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+partial_success17to31:
+	MOVOU -16(AX)(DI*1), X3
+	PCMPEQB X0, X3
+	PMOVMSKB X3, SI
+	CMPQ  SI, $0xffff
+	JE success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+	CMPQ AX, $32
+	JA   _33_to_63
+	VMOVDQU (BP), Y1
+	LEAQ -31(DI)(DX*1), DX
+loop32:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop32
+	JMP fail_avx2
+_33_to_63:
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	VMOVDQU -32(BP)(AX*1), Y0
+	VMOVDQU (BP), Y1
+loop33to63:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   partial_success33to63
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+	JMP fail_avx2
+partial_success33to63:
+	VMOVDQU -32(AX)(DI*1), Y3
+	VPCMPEQB Y0, Y3, Y4
+	VPMOVMSKB Y4, SI
+	CMPL  SI, $0xffffffff
+	JE success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+fail_avx2:
+	VZEROUPPER
+fail:
+	MOVQ $-1, (R11)
+	RET
+success_avx2:
+	VZEROUPPER
+	JMP success
+sse42:
+	CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1
+	JNE no_sse42
+	CMPQ AX, $12
+	// PCMPESTRI is slower than normal compare,
+	// so using it makes sense only if we advance 4+ bytes per compare
+	// This value was determined experimentally and is the ~same
+	// on Nehalem (first with SSE42) and Haswell.
+	JAE _9_or_more
+	LEAQ 16(BP), SI
+	TESTW $0xff0, SI
+	JEQ no_sse42
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), SI
+	MOVQ $16, R9
+	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
+	// for equality (bits 2,3 are 11)
+	// result is not masked or inverted (bits 4,5 are 00)
+	// and corresponds to first matching byte (bit 6 is 0)
+	PCMPESTRI $0x0c, (DI), X1
+	// CX == 16 means no match,
+	// CX > R9 means partial match at the end of the string,
+	// otherwise sep is at offset CX from X1 start
+	CMPQ CX, R9
+	JBE sse42_success
+	ADDQ R9, DI
+	CMPQ DI, SI
+	JB loop_sse42
+	PCMPESTRI $0x0c, -1(SI), X1
+	CMPQ CX, R9
+	JA fail
+	LEAQ -1(SI), DI
+sse42_success:
+	ADDQ CX, DI
+success:
+	SUBQ R10, DI
+	MOVQ DI, (R11)
+	RET
diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go
new file mode 100644
index 0000000000..0f87ae106c
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.go
@@ -0,0 +1,23 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using IndexShortStr can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+	// 8 bytes can be completely loaded into 1 register.
+	MaxLen = 8
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to IndexShortStr.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 16 characters, plus a few slop to start.
+	return 4 + n>>4
+}
diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s
new file mode 100644
index 0000000000..8cffcd10b5
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.s
@@ -0,0 +1,149 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+24(FP), R2
+	MOVD	b_len+32(FP), R3
+	MOVD	$ret+48(FP), R9
+	B	indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+16(FP), R2
+	MOVD	b_len+24(FP), R3
+	MOVD	$ret+32(FP), R9
+	B	indexbody<>(SB)
+
+// input:
+//   R0: haystack
+//   R1: length of haystack
+//   R2: needle
+//   R3: length of needle (2 <= len <= 8)
+//   R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
+	// main idea is to load 'sep' into separate register(s)
+	// to avoid repeatedly re-load it again and again
+	// for sebsequent substring comparisons
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+24(FP), R2
+	MOVD	b_len+32(FP), R3
+	SUB	R3, R1, R4
+	// R4 contains the start of last substring for comparsion
+	ADD	R0, R4, R4
+	ADD	$1, R0, R8
+	TBZ	$3, R3, len_2_7
+len_8:
+	// R5 contains 8-byte sep
+	MOVD	(R2), R5
+loop_8:
+	// R6 contains substring for comparison
+	MOVD.P	1(R0), R6
+	CMP	R5, R6
+	BEQ	found
+	CMP	R4, R0
+	BLS	loop_8
+	JMP	not_found
+len_2_7:
+	TBZ	$2, R3, len_2_3
+	TBZ	$1, R3, len_4_5
+	TBZ	$0, R3, len_6
+len_7:
+	// R5 and R6 contain 7-byte sep
+	MOVWU	(R2), R5
+	// 1-byte overlap with R5
+	MOVWU	3(R2), R6
+loop_7:
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	not_equal_7
+	MOVWU	2(R0), R3
+	CMP	R6, R3
+	BEQ	found
+not_equal_7:
+	CMP	R4, R0
+	BLS	loop_7
+	JMP	not_found
+len_6:
+	// R5 and R6 contain 6-byte sep
+	MOVWU	(R2), R5
+	MOVHU	4(R2), R6
+loop_6:
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	not_equal_6
+	MOVHU	3(R0), R3
+	CMP	R6, R3
+	BEQ	found
+not_equal_6:
+	CMP	R4, R0
+	BLS	loop_6
+	JMP	not_found
+len_4_5:
+	TBZ	$0, R3, len_4
+len_5:
+	// R5 and R7 contain 5-byte sep
+	MOVWU	(R2), R5
+	MOVBU	4(R2), R7
+loop_5:
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	not_equal_5
+	MOVBU	3(R0), R3
+	CMP	R7, R3
+	BEQ	found
+not_equal_5:
+	CMP	R4, R0
+	BLS	loop_5
+	JMP	not_found
+len_4:
+	// R5 contains 4-byte sep
+	MOVWU	(R2), R5
+loop_4:
+	MOVWU.P	1(R0), R6
+	CMP	R5, R6
+	BEQ	found
+	CMP	R4, R0
+	BLS	loop_4
+	JMP	not_found
+len_2_3:
+	TBZ	$0, R3, len_2
+len_3:
+	// R6 and R7 contain 3-byte sep
+	MOVHU	(R2), R6
+	MOVBU	2(R2), R7
+loop_3:
+	MOVHU.P	1(R0), R3
+	CMP	R6, R3
+	BNE	not_equal_3
+	MOVBU	1(R0), R3
+	CMP	R7, R3
+	BEQ	found
+not_equal_3:
+	CMP	R4, R0
+	BLS	loop_3
+	JMP	not_found
+len_2:
+	// R5 contains 2-byte sep
+	MOVHU	(R2), R5
+loop_2:
+	MOVHU.P	1(R0), R6
+	CMP	R5, R6
+	BEQ	found
+	CMP	R4, R0
+	BLS	loop_2
+not_found:
+	MOVD	$-1, R0
+	MOVD	R0, (R9)
+	RET
+found:
+	SUB	R8, R0, R0
+	MOVD	R0, (R9)
+	RET
diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go
new file mode 100644
index 0000000000..98e859f925
--- /dev/null
+++ b/src/internal/bytealg/index_generic.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64,!s390x
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+	panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+	panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	panic("unimplemented")
+}
diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go
new file mode 100644
index 0000000000..fde4214245
--- /dev/null
+++ b/src/internal/bytealg/index_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 s390x
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go
new file mode 100644
index 0000000000..9340cf1135
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	// Note: we're kind of lucky that this flag is available at this point.
+	// The runtime sets HasVX when processing auxv records, and that happens
+	// to happen *before* running the init functions of packages that
+	// the runtime depends on.
+	// TODO: it would really be nicer for internal/cpu to figure out this
+	// flag by itself. Then we wouldn't need to depend on quirks of
+	// early startup initialization order.
+	if cpu.S390X.HasVX {
+		MaxLen = 64
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s
new file mode 100644
index 0000000000..491d5bcfd2
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.s
@@ -0,0 +1,216 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+48(FP), R5
+	BR	indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+32(FP), R5
+	BR	indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBGT	R4, R2, notfound
+	ADD	R1, R2
+	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+	CMPBEQ	R4, $0, notfound
+	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
+	VLL	R4, (R3), V0 // contains first 16 bytes of sep
+	MOVD	R1, R7
+index2plus:
+	CMPBNE	R4, $1, index3plus
+	MOVD	$15(R7), R9
+	CMPBGE	R9, R2, index2to16
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	CMPBGE	R9, R2, index2to16
+index2loop:
+	VL	0(R7), V2          // 16 bytes, even indices
+	VL	1(R7), V4          // 16 bytes, odd indices
+	VCEQH	V1, V2, V5         // compare even indices
+	VCEQH	V1, V4, V6         // compare odd indices
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index3plus:
+	CMPBNE	R4, $2, index4plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$1, R0
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	VREPB	$2, V0, V8
+index3loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 2-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
+	VCEQH	V1, V2, V5         // compare 2-byte even indices
+	VCEQH	V1, V4, V6         // compare 2-byte odd indices
+	VCEQB	V8, V9, V10        // compare last bytes
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VN	V7, V10, V7        // AND indices with last byte
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index4plus:
+	CMPBNE	R4, $3, index5plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$2, R0
+	VGBM	$0x8888, V29       // 0xff000000ff000000...
+	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
+	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
+	VONE	V16
+	VREPF	$0, V0, V1
+index4loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 3-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<1
+	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<1
+	VCEQF	V1, V2, V5         // compare index 0, 4, ...
+	VCEQF	V1, V4, V6         // compare index 1, 5, ...
+	VCEQF	V1, V9, V11        // compare index 2, 6, ...
+	VCEQF	V1, V10, V12       // compare index 3, 7, ...
+	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
+	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+	VSEL	V13, V14, V31, V7  // final merge
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index5plus:
+	CMPBGT	R4, $15, index17plus
+index2to16:
+	CMPBGT	R7, R2, notfound
+	MOVD	$1(R7), R8
+	CMPBGT	R8, R2, index2to16tail
+index2to16loop:
+	// unrolled 2x
+	VLL	R4, (R7), V1
+	VLL	R4, 1(R7), V2
+	VCEQGS	V0, V1, V3
+	BEQ	found
+	MOVD	$1(R7), R7
+	VCEQGS	V0, V2, V4
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLT	R7, R2, index2to16loop
+	CMPBGT	R7, R2, notfound
+index2to16tail:
+	VLL	R4, (R7), V1
+	VCEQGS	V0, V1, V2
+	BEQ	found
+	BR	notfound
+
+index17plus:
+	CMPBGT	R4, $31, index33plus
+	SUB	$16, R4, R0
+	VLL	R0, 16(R3), V1
+	VONE	V7
+index17to32loop:
+	VL	(R7), V2
+	VLL	R0, 16(R7), V3
+	VCEQG	V0, V2, V4
+	VCEQG	V1, V3, V5
+	VN	V4, V5, V6
+	VCEQGS	V6, V7, V8
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index17to32loop
+	BR	notfound
+
+index33plus:
+	CMPBGT	R4, $47, index49plus
+	SUB	$32, R4, R0
+	VL	16(R3), V1
+	VLL	R0, 32(R3), V2
+	VONE	V11
+index33to48loop:
+	VL	(R7), V3
+	VL	16(R7), V4
+	VLL	R0, 32(R7), V5
+	VCEQG	V0, V3, V6
+	VCEQG	V1, V4, V7
+	VCEQG	V2, V5, V8
+	VN	V6, V7, V9
+	VN	V8, V9, V10
+	VCEQGS	V10, V11, V12
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index33to48loop
+	BR	notfound
+
+index49plus:
+	CMPBGT	R4, $63, index65plus
+	SUB	$48, R4, R0
+	VL	16(R3), V1
+	VL	32(R3), V2
+	VLL	R0, 48(R3), V3
+	VONE	V15
+index49to64loop:
+	VL	(R7), V4
+	VL	16(R7), V5
+	VL	32(R7), V6
+	VLL	R0, 48(R7), V7
+	VCEQG	V0, V4, V8
+	VCEQG	V1, V5, V9
+	VCEQG	V2, V6, V10
+	VCEQG	V3, V7, V11
+	VN	V8, V9, V12
+	VN	V10, V11, V13
+	VN	V12, V13, V14
+	VCEQGS	V14, V15, V16
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index49to64loop
+notfound:
+	MOVD	$-1, (R5)
+	RET
+
+index65plus:
+	// not implemented
+	MOVD	$0, (R0)
+	RET
+
+foundV17: // index is in doubleword V17[0]
+	VLGVG	$0, V17, R8
+	ADD	R8, R7
+found:
+	SUB	R1, R7
+	MOVD	R7, (R5)
+	RET
author	Keith Randall <khr@golang.org>	2018-03-04 09:47:47 -0800
committer	Keith Randall <khr@golang.org>	2018-03-04 19:49:44 +0000
commit	ee58eccc565c0871d3f16fd702fd8649a3fb61ea (patch)
tree	837073b78954dc987cf575ff478faf6fdb8afb0e /src/internal/bytealg
parent	f6332bb84ad87e958290ae23b29a2b13a41ee2a2 (diff)
download	go-ee58eccc565c0871d3f16fd702fd8649a3fb61ea.tar.xz