aboutsummaryrefslogtreecommitdiff
path: root/src/internal
diff options
context:
space:
mode:
authorKeith Randall <khr@golang.org>2018-03-04 09:47:47 -0800
committerKeith Randall <khr@golang.org>2018-03-04 19:49:44 +0000
commitee58eccc565c0871d3f16fd702fd8649a3fb61ea (patch)
tree837073b78954dc987cf575ff478faf6fdb8afb0e /src/internal
parentf6332bb84ad87e958290ae23b29a2b13a41ee2a2 (diff)
downloadgo-ee58eccc565c0871d3f16fd702fd8649a3fb61ea.tar.xz
internal/bytealg: move short string Index implementations into bytealg
Also move the arm64 CountByte implementation while we're here. Fixes #19792 Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e Reviewed-on: https://go-review.googlesource.com/98518 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/internal')
-rw-r--r--src/internal/bytealg/bytealg.go22
-rw-r--r--src/internal/bytealg/count_arm64.s90
-rw-r--r--src/internal/bytealg/count_generic.go2
-rw-r--r--src/internal/bytealg/count_native.go2
-rw-r--r--src/internal/bytealg/equal_native.go16
-rw-r--r--src/internal/bytealg/index_amd64.go26
-rw-r--r--src/internal/bytealg/index_amd64.s274
-rw-r--r--src/internal/bytealg/index_arm64.go23
-rw-r--r--src/internal/bytealg/index_arm64.s149
-rw-r--r--src/internal/bytealg/index_generic.go29
-rw-r--r--src/internal/bytealg/index_native.go19
-rw-r--r--src/internal/bytealg/index_s390x.go31
-rw-r--r--src/internal/bytealg/index_s390x.s216
13 files changed, 881 insertions, 18 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
new file mode 100644
index 0000000000..1ab7c30f4e
--- /dev/null
+++ b/src/internal/bytealg/bytealg.go
@@ -0,0 +1,22 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+ "internal/cpu"
+ "unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+ x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
+ x86_HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
+ x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
+ x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+ s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+var MaxLen int
diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s
new file mode 100644
index 0000000000..8cd703d943
--- /dev/null
+++ b/src/internal/bytealg/count_arm64.s
@@ -0,0 +1,90 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+ MOVD b_base+0(FP), R0
+ MOVD b_len+8(FP), R2
+ MOVBU c+24(FP), R1
+ MOVD $ret+32(FP), R8
+ B countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+ MOVD s_base+0(FP), R0
+ MOVD s_len+8(FP), R2
+ MOVBU c+16(FP), R1
+ MOVD $ret+24(FP), R8
+ B countbytebody<>(SB)
+
+// input:
+// R0: data
+// R2: data len
+// R1: byte to find
+// R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+ // R11 = count of byte to search
+ MOVD $0, R11
+ // short path to handle 0-byte case
+ CBZ R2, done
+ CMP $0x20, R2
+ // jump directly to tail if length < 32
+ BLO tail
+ ANDS $0x1f, R0, R9
+ BEQ chunk
+ // Work with not 32-byte aligned head
+ BIC $0x1f, R0, R3
+ ADD $0x20, R3
+head_loop:
+ MOVBU.P 1(R0), R5
+ CMP R5, R1
+ CINC EQ, R11, R11
+ SUB $1, R2, R2
+ CMP R0, R3
+ BNE head_loop
+ // Work with 32-byte aligned chunks
+chunk:
+ BIC $0x1f, R2, R9
+ // The first chunk can also be the last
+ CBZ R9, tail
+ // R3 = end of 32-byte chunks
+ ADD R0, R9, R3
+ MOVD $1, R5
+ VMOV R5, V5.B16
+ // R2 = length of tail
+ SUB R9, R2, R2
+ // Duplicate R1 (byte to search) to 16 1-byte elements of V0
+ VMOV R1, V0.B16
+ // Clear the low 64-bit element of V7 and V8
+ VEOR V7.B8, V7.B8, V7.B8
+ VEOR V8.B8, V8.B8, V8.B8
+ // Count the target byte in 32-byte chunk
+chunk_loop:
+ VLD1.P (R0), [V1.B16, V2.B16]
+ CMP R0, R3
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ // Clear the higher 7 bits
+ VAND V5.B16, V3.B16, V3.B16
+ VAND V5.B16, V4.B16, V4.B16
+ // Count lanes match the requested byte
+ VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
+ VUADDLV V6.B16, V7
+ // Accumulate the count in low 64-bit element of V8 when inside the loop
+ VADD V7, V8
+ BNE chunk_loop
+ VMOV V8.D[0], R6
+ ADD R6, R11, R11
+ CBZ R2, done
+tail:
+ // Work with tail shorter than 32 bytes
+ MOVBU.P 1(R0), R5
+ SUB $1, R2, R2
+ CMP R5, R1
+ CINC EQ, R11, R11
+ CBNZ R2, tail
+done:
+ MOVD R11, (R8)
+ RET
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
index acc5a79827..a763b3bc61 100644
--- a/src/internal/bytealg/count_generic.go
+++ b/src/internal/bytealg/count_generic.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64
+// +build !amd64,!arm64
package bytealg
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
index e6d3b066aa..a62c4cb5c0 100644
--- a/src/internal/bytealg/count_native.go
+++ b/src/internal/bytealg/count_native.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64
+// +build amd64 arm64
package bytealg
diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go
index 55d184a58b..b5c453086c 100644
--- a/src/internal/bytealg/equal_native.go
+++ b/src/internal/bytealg/equal_native.go
@@ -4,24 +4,8 @@
package bytealg
-import (
- "internal/cpu"
- "unsafe"
-)
-
// Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.
-// Because equal_native.go is unconditional, it's a good place to compute asm constants.
-// TODO: find a better way to do this?
-
-// Offsets into internal/cpu records for use in assembly.
-const (
- x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
- x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
- x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
- s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
-)
-
//go:noescape
func Equal(a, b []byte) bool
diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go
new file mode 100644
index 0000000000..c7a1941e5f
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ if cpu.X86.HasAVX2 {
+ MaxLen = 63
+ } else {
+ MaxLen = 31
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
new file mode 100644
index 0000000000..f7297c0cab
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.s
@@ -0,0 +1,274 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+24(FP), BP
+ MOVQ b_len+32(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+48(FP), R11
+ JMP indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+16(FP), BP
+ MOVQ b_len+24(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+32(FP), R11
+ JMP indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// BP: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+ CMPQ AX, DX
+ JA fail
+ CMPQ DX, $16
+ JAE sse42
+no_sse42:
+ CMPQ AX, $2
+ JA _3_or_more
+ MOVW (BP), BP
+ LEAQ -1(DI)(DX*1), DX
+loop2:
+ MOVW (DI), SI
+ CMPW SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop2
+ JMP fail
+_3_or_more:
+ CMPQ AX, $3
+ JA _4_or_more
+ MOVW 1(BP), BX
+ MOVW (BP), BP
+ LEAQ -2(DI)(DX*1), DX
+loop3:
+ MOVW (DI), SI
+ CMPW SI,BP
+ JZ partial_success3
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+partial_success3:
+ MOVW 1(DI), SI
+ CMPW SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+_4_or_more:
+ CMPQ AX, $4
+ JA _5_or_more
+ MOVL (BP), BP
+ LEAQ -3(DI)(DX*1), DX
+loop4:
+ MOVL (DI), SI
+ CMPL SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop4
+ JMP fail
+_5_or_more:
+ CMPQ AX, $7
+ JA _8_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVL -4(BP)(AX*1), BX
+ MOVL (BP), BP
+loop5to7:
+ MOVL (DI), SI
+ CMPL SI,BP
+ JZ partial_success5to7
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+partial_success5to7:
+ MOVL -4(AX)(DI*1), SI
+ CMPL SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+_8_or_more:
+ CMPQ AX, $8
+ JA _9_or_more
+ MOVQ (BP), BP
+ LEAQ -7(DI)(DX*1), DX
+loop8:
+ MOVQ (DI), SI
+ CMPQ SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop8
+ JMP fail
+_9_or_more:
+ CMPQ AX, $15
+ JA _16_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVQ -8(BP)(AX*1), BX
+ MOVQ (BP), BP
+loop9to15:
+ MOVQ (DI), SI
+ CMPQ SI,BP
+ JZ partial_success9to15
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+partial_success9to15:
+ MOVQ -8(AX)(DI*1), SI
+ CMPQ SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+_16_or_more:
+ CMPQ AX, $16
+ JA _17_or_more
+ MOVOU (BP), X1
+ LEAQ -15(DI)(DX*1), DX
+loop16:
+ MOVOU (DI), X2
+ PCMPEQB X1, X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop16
+ JMP fail
+_17_or_more:
+ CMPQ AX, $31
+ JA _32_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVOU -16(BP)(AX*1), X0
+ MOVOU (BP), X1
+loop17to31:
+ MOVOU (DI), X2
+ PCMPEQB X1,X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE partial_success17to31
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+partial_success17to31:
+ MOVOU -16(AX)(DI*1), X3
+ PCMPEQB X0, X3
+ PMOVMSKB X3, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+ CMPQ AX, $32
+ JA _33_to_63
+ VMOVDQU (BP), Y1
+ LEAQ -31(DI)(DX*1), DX
+loop32:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop32
+ JMP fail_avx2
+_33_to_63:
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ VMOVDQU -32(BP)(AX*1), Y0
+ VMOVDQU (BP), Y1
+loop33to63:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE partial_success33to63
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+ JMP fail_avx2
+partial_success33to63:
+ VMOVDQU -32(AX)(DI*1), Y3
+ VPCMPEQB Y0, Y3, Y4
+ VPMOVMSKB Y4, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+fail_avx2:
+ VZEROUPPER
+fail:
+ MOVQ $-1, (R11)
+ RET
+success_avx2:
+ VZEROUPPER
+ JMP success
+sse42:
+ CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1
+ JNE no_sse42
+ CMPQ AX, $12
+ // PCMPESTRI is slower than normal compare,
+ // so using it makes sense only if we advance 4+ bytes per compare
+ // This value was determined experimentally and is the ~same
+ // on Nehalem (first with SSE42) and Haswell.
+ JAE _9_or_more
+ LEAQ 16(BP), SI
+ TESTW $0xff0, SI
+ JEQ no_sse42
+ MOVOU (BP), X1
+ LEAQ -15(DI)(DX*1), SI
+ MOVQ $16, R9
+ SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+ // 0x0c means: unsigned byte compare (bits 0,1 are 00)
+ // for equality (bits 2,3 are 11)
+ // result is not masked or inverted (bits 4,5 are 00)
+ // and corresponds to first matching byte (bit 6 is 0)
+ PCMPESTRI $0x0c, (DI), X1
+ // CX == 16 means no match,
+ // CX > R9 means partial match at the end of the string,
+ // otherwise sep is at offset CX from X1 start
+ CMPQ CX, R9
+ JBE sse42_success
+ ADDQ R9, DI
+ CMPQ DI, SI
+ JB loop_sse42
+ PCMPESTRI $0x0c, -1(SI), X1
+ CMPQ CX, R9
+ JA fail
+ LEAQ -1(SI), DI
+sse42_success:
+ ADDQ CX, DI
+success:
+ SUBQ R10, DI
+ MOVQ DI, (R11)
+ RET
diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go
new file mode 100644
index 0000000000..0f87ae106c
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.go
@@ -0,0 +1,23 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using IndexShortStr can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+ // 8 bytes can be completely loaded into 1 register.
+ MaxLen = 8
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to IndexShortStr.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 16 characters, plus a few slop to start.
+ return 4 + n>>4
+}
diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s
new file mode 100644
index 0000000000..8cffcd10b5
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.s
@@ -0,0 +1,149 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
+ MOVD $ret+48(FP), R9
+ B indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+16(FP), R2
+ MOVD b_len+24(FP), R3
+ MOVD $ret+32(FP), R9
+ B indexbody<>(SB)
+
+// input:
+// R0: haystack
+// R1: length of haystack
+// R2: needle
+// R3: length of needle (2 <= len <= 8)
+// R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
+ // main idea is to load 'sep' into separate register(s)
+ // to avoid repeatedly re-load it again and again
+ // for sebsequent substring comparisons
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
+ SUB R3, R1, R4
+ // R4 contains the start of last substring for comparsion
+ ADD R0, R4, R4
+ ADD $1, R0, R8
+ TBZ $3, R3, len_2_7
+len_8:
+ // R5 contains 8-byte sep
+ MOVD (R2), R5
+loop_8:
+ // R6 contains substring for comparison
+ MOVD.P 1(R0), R6
+ CMP R5, R6
+ BEQ found
+ CMP R4, R0
+ BLS loop_8
+ JMP not_found
+len_2_7:
+ TBZ $2, R3, len_2_3
+ TBZ $1, R3, len_4_5
+ TBZ $0, R3, len_6
+len_7:
+ // R5 and R6 contain 7-byte sep
+ MOVWU (R2), R5
+ // 1-byte overlap with R5
+ MOVWU 3(R2), R6
+loop_7:
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE not_equal_7
+ MOVWU 2(R0), R3
+ CMP R6, R3
+ BEQ found
+not_equal_7:
+ CMP R4, R0
+ BLS loop_7
+ JMP not_found
+len_6:
+ // R5 and R6 contain 6-byte sep
+ MOVWU (R2), R5
+ MOVHU 4(R2), R6
+loop_6:
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE not_equal_6
+ MOVHU 3(R0), R3
+ CMP R6, R3
+ BEQ found
+not_equal_6:
+ CMP R4, R0
+ BLS loop_6
+ JMP not_found
+len_4_5:
+ TBZ $0, R3, len_4
+len_5:
+ // R5 and R7 contain 5-byte sep
+ MOVWU (R2), R5
+ MOVBU 4(R2), R7
+loop_5:
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE not_equal_5
+ MOVBU 3(R0), R3
+ CMP R7, R3
+ BEQ found
+not_equal_5:
+ CMP R4, R0
+ BLS loop_5
+ JMP not_found
+len_4:
+ // R5 contains 4-byte sep
+ MOVWU (R2), R5
+loop_4:
+ MOVWU.P 1(R0), R6
+ CMP R5, R6
+ BEQ found
+ CMP R4, R0
+ BLS loop_4
+ JMP not_found
+len_2_3:
+ TBZ $0, R3, len_2
+len_3:
+ // R6 and R7 contain 3-byte sep
+ MOVHU (R2), R6
+ MOVBU 2(R2), R7
+loop_3:
+ MOVHU.P 1(R0), R3
+ CMP R6, R3
+ BNE not_equal_3
+ MOVBU 1(R0), R3
+ CMP R7, R3
+ BEQ found
+not_equal_3:
+ CMP R4, R0
+ BLS loop_3
+ JMP not_found
+len_2:
+ // R5 contains 2-byte sep
+ MOVHU (R2), R5
+loop_2:
+ MOVHU.P 1(R0), R6
+ CMP R5, R6
+ BEQ found
+ CMP R4, R0
+ BLS loop_2
+not_found:
+ MOVD $-1, R0
+ MOVD R0, (R9)
+ RET
+found:
+ SUB R8, R0, R0
+ MOVD R0, (R9)
+ RET
diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go
new file mode 100644
index 0000000000..98e859f925
--- /dev/null
+++ b/src/internal/bytealg/index_generic.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64,!s390x
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+ panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+ panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ panic("unimplemented")
+}
diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go
new file mode 100644
index 0000000000..fde4214245
--- /dev/null
+++ b/src/internal/bytealg/index_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 s390x
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go
new file mode 100644
index 0000000000..9340cf1135
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ // Note: we're kind of lucky that this flag is available at this point.
+ // The runtime sets HasVX when processing auxv records, and that happens
+ // to happen *before* running the init functions of packages that
+ // the runtime depends on.
+ // TODO: it would really be nicer for internal/cpu to figure out this
+ // flag by itself. Then we wouldn't need to depend on quirks of
+ // early startup initialization order.
+ if cpu.S390X.HasVX {
+ MaxLen = 64
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s
new file mode 100644
index 0000000000..491d5bcfd2
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.s
@@ -0,0 +1,216 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+48(FP), R5
+ BR indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+32(FP), R5
+ BR indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+ CMPBGT R4, R2, notfound
+ ADD R1, R2
+ SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+ CMPBEQ R4, $0, notfound
+ SUB $1, R4 // R4=len(sep)-1 for use as VLL index
+ VLL R4, (R3), V0 // contains first 16 bytes of sep
+ MOVD R1, R7
+index2plus:
+ CMPBNE R4, $1, index3plus
+ MOVD $15(R7), R9
+ CMPBGE R9, R2, index2to16
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ CMPBGE R9, R2, index2to16
+index2loop:
+ VL 0(R7), V2 // 16 bytes, even indices
+ VL 1(R7), V4 // 16 bytes, odd indices
+ VCEQH V1, V2, V5 // compare even indices
+ VCEQH V1, V4, V6 // compare odd indices
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index3plus:
+ CMPBNE R4, $2, index4plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $1, R0
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ VREPB $2, V0, V8
+index3loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 2-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VCEQH V1, V2, V5 // compare 2-byte even indices
+ VCEQH V1, V4, V6 // compare 2-byte odd indices
+ VCEQB V8, V9, V10 // compare last bytes
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VN V7, V10, V7 // AND indices with last byte
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index4plus:
+ CMPBNE R4, $3, index5plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $2, R0
+ VGBM $0x8888, V29 // 0xff000000ff000000...
+ VGBM $0x2222, V30 // 0x0000ff000000ff00...
+ VGBM $0xcccc, V31 // 0xffff0000ffff0000...
+ VONE V16
+ VREPF $0, V0, V1
+index4loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 3-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1
+ VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1
+ VCEQF V1, V2, V5 // compare index 0, 4, ...
+ VCEQF V1, V4, V6 // compare index 1, 5, ...
+ VCEQF V1, V9, V11 // compare index 2, 6, ...
+ VCEQF V1, V10, V12 // compare index 3, 7, ...
+ VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
+ VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+ VSEL V13, V14, V31, V7 // final merge
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index5plus:
+ CMPBGT R4, $15, index17plus
+index2to16:
+ CMPBGT R7, R2, notfound
+ MOVD $1(R7), R8
+ CMPBGT R8, R2, index2to16tail
+index2to16loop:
+ // unrolled 2x
+ VLL R4, (R7), V1
+ VLL R4, 1(R7), V2
+ VCEQGS V0, V1, V3
+ BEQ found
+ MOVD $1(R7), R7
+ VCEQGS V0, V2, V4
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLT R7, R2, index2to16loop
+ CMPBGT R7, R2, notfound
+index2to16tail:
+ VLL R4, (R7), V1
+ VCEQGS V0, V1, V2
+ BEQ found
+ BR notfound
+
+index17plus:
+ CMPBGT R4, $31, index33plus
+ SUB $16, R4, R0
+ VLL R0, 16(R3), V1
+ VONE V7
+index17to32loop:
+ VL (R7), V2
+ VLL R0, 16(R7), V3
+ VCEQG V0, V2, V4
+ VCEQG V1, V3, V5
+ VN V4, V5, V6
+ VCEQGS V6, V7, V8
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index17to32loop
+ BR notfound
+
+index33plus:
+ CMPBGT R4, $47, index49plus
+ SUB $32, R4, R0
+ VL 16(R3), V1
+ VLL R0, 32(R3), V2
+ VONE V11
+index33to48loop:
+ VL (R7), V3
+ VL 16(R7), V4
+ VLL R0, 32(R7), V5
+ VCEQG V0, V3, V6
+ VCEQG V1, V4, V7
+ VCEQG V2, V5, V8
+ VN V6, V7, V9
+ VN V8, V9, V10
+ VCEQGS V10, V11, V12
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index33to48loop
+ BR notfound
+
+index49plus:
+ CMPBGT R4, $63, index65plus
+ SUB $48, R4, R0
+ VL 16(R3), V1
+ VL 32(R3), V2
+ VLL R0, 48(R3), V3
+ VONE V15
+index49to64loop:
+ VL (R7), V4
+ VL 16(R7), V5
+ VL 32(R7), V6
+ VLL R0, 48(R7), V7
+ VCEQG V0, V4, V8
+ VCEQG V1, V5, V9
+ VCEQG V2, V6, V10
+ VCEQG V3, V7, V11
+ VN V8, V9, V12
+ VN V10, V11, V13
+ VN V12, V13, V14
+ VCEQGS V14, V15, V16
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index49to64loop
+notfound:
+ MOVD $-1, (R5)
+ RET
+
+index65plus:
+ // not implemented
+ MOVD $0, (R0)
+ RET
+
+foundV17: // index is in doubleword V17[0]
+ VLGVG $0, V17, R8
+ ADD R8, R7
+found:
+ SUB R1, R7
+ MOVD R7, (R5)
+ RET