bytes: add optimized countByte for arm64

Use SIMD instructions when counting a single byte. Inspired from runtime IndexByte implementation. Benchmark results of bytes, where 1 byte in every 8 is the one we are looking: name old time/op new time/op delta CountSingle/10-8 96.1ns ± 1% 38.8ns ± 0% -59.64% (p=0.000 n=9+7) CountSingle/32-8 172ns ± 2% 36ns ± 1% -79.27% (p=0.000 n=10+10) CountSingle/4K-8 18.2µs ± 1% 0.9µs ± 0% -95.17% (p=0.000 n=9+10) CountSingle/4M-8 18.4ms ± 0% 0.9ms ± 0% -95.00% (p=0.000 n=10+9) CountSingle/64M-8 284ms ± 4% 19ms ± 0% -93.40% (p=0.000 n=10+10) name old speed new speed delta CountSingle/10-8 104MB/s ± 1% 258MB/s ± 0% +147.99% (p=0.000 n=9+10) CountSingle/32-8 185MB/s ± 1% 897MB/s ± 1% +385.33% (p=0.000 n=9+10) CountSingle/4K-8 225MB/s ± 1% 4658MB/s ± 0% +1967.40% (p=0.000 n=9+10) CountSingle/4M-8 228MB/s ± 0% 4555MB/s ± 0% +1901.71% (p=0.000 n=10+9) CountSingle/64M-8 236MB/s ± 4% 3575MB/s ± 0% +1414.69% (p=0.000 n=10+10) Change-Id: Ifccb51b3c8658c49773fe05147c3cf3aead361e5 Reviewed-on: https://go-review.googlesource.com/71111 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
author: Wei Xiao <wei.xiao@arm.com> 2017-06-16 06:45:14 +0000
committer: Brad Fitzpatrick <bradfitz@golang.org> 2017-11-21 19:07:38 +0000
commit: 9a14cd9e7556e68ea98255ae809b5862d574df9e (patch)
tree: e3f36968e830bc9313f0ea83bdf622f54ee6aca0 /src/bytes
parent: 63ef3cde335e5b46fc3c8027b5e2f474a26717e8 (diff)
download: go-9a14cd9e7556e68ea98255ae809b5862d574df9e.tar.xz
3 files changed, 143 insertions, 1 deletions
diff --git a/src/bytes/bytes_arm64.go b/src/bytes/bytes_arm64.go
new file mode 100644
index 0000000000..846eeba486
--- /dev/null
+++ b/src/bytes/bytes_arm64.go
@@ -0,0 +1,68 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytes
+
+func countByte(s []byte, c byte) int // bytes_arm64.s
+
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+	n := len(sep)
+	switch {
+	case n == 0:
+		return 0
+	case n == 1:
+		return IndexByte(s, sep[0])
+	case n == len(s):
+		if Equal(sep, s) {
+			return 0
+		}
+		return -1
+	case n > len(s):
+		return -1
+	}
+	c := sep[0]
+	i := 0
+	fails := 0
+	t := s[:len(s)-n+1]
+	for i < len(t) {
+		if t[i] != c {
+			o := IndexByte(t[i:], c)
+			if o < 0 {
+				break
+			}
+			i += o
+		}
+		if Equal(s[i:i+n], sep) {
+			return i
+		}
+		i++
+		fails++
+		if fails >= 4+i>>4 && i < len(t) {
+			// Give up on IndexByte, it isn't skipping ahead
+			// far enough to be better than Rabin-Karp.
+			// Experiments (using IndexPeriodic) suggest
+			// the cutover is about 16 byte skips.
+			// TODO: if large prefixes of sep are matching
+			// we should cutover at even larger average skips,
+			// because Equal becomes that much more expensive.
+			// This code does not take that effect into account.
+			j := indexRabinKarp(s[i:], sep)
+			if j < 0 {
+				return -1
+			}
+			return i + j
+		}
+	}
+	return -1
+}
+
+// Count counts the number of non-overlapping instances of sep in s.
+// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
+func Count(s, sep []byte) int {
+	if len(sep) == 1 {
+		return countByte(s, sep[0])
+	}
+	return countGeneric(s, sep)
+}
diff --git a/src/bytes/bytes_arm64.s b/src/bytes/bytes_arm64.s
new file mode 100644
index 0000000000..5e229d772b
--- /dev/null
+++ b/src/bytes/bytes_arm64.s
@@ -0,0 +1,74 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// countByte(s []byte, c byte) int
+TEXT bytes·countByte(SB),NOSPLIT,$0-40
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	// R11 = count of byte to search
+	MOVD	$0, R11
+	// short path to handle 0-byte case
+	CBZ	R2, done
+	CMP	$0x20, R2
+	// jump directly to tail if length < 32
+	BLO	tail
+	ANDS	$0x1f, R0, R9
+	BEQ	chunk
+	// Work with not 32-byte aligned head
+	BIC	$0x1f, R0, R3
+	ADD	$0x20, R3
+head_loop:
+	MOVBU.P	1(R0), R5
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	SUB	$1, R2, R2
+	CMP	R0, R3
+	BNE	head_loop
+	// Work with 32-byte aligned chunks
+chunk:
+	BIC	$0x1f, R2, R9
+	// The first chunk can also be the last
+	CBZ	R9, tail
+	// R3 = end of 32-byte chunks
+	ADD	R0, R9, R3
+	MOVD	$1, R5
+	VMOV	R5, V5.B16
+	// R2 = length of tail
+	SUB	R9, R2, R2
+	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
+	VMOV	R1, V0.B16
+	// Clear the low 64-bit element of V7 and V8
+	VEOR	V7.B8, V7.B8, V7.B8
+	VEOR	V8.B8, V8.B8, V8.B8
+	// Count the target byte in 32-byte chunk
+chunk_loop:
+	VLD1.P	(R0), [V1.B16, V2.B16]
+	CMP	R0, R3
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// Clear the higher 7 bits
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	// Count lanes match the requested byte
+	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
+	VUADDLV	V6.B16, V7
+	// Accumulate the count in low 64-bit element of V8 when inside the loop
+	VADD	V7, V8
+	BNE	chunk_loop
+	VMOV	V8.D[0], R6
+	ADD	R6, R11, R11
+	CBZ	R2, done
+tail:
+	// Work with tail shorter than 32 bytes
+	MOVBU.P	1(R0), R5
+	SUB	$1, R2, R2
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	CBNZ	R2, tail
+done:
+	MOVD	R11, ret+32(FP)
+	RET
diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go
index b30e53bf2e..0e7d33f09a 100644
--- a/src/bytes/bytes_generic.go
+++ b/src/bytes/bytes_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!s390x
+// +build !amd64,!s390x,!arm64
 
 package bytes
author	Wei Xiao <wei.xiao@arm.com>	2017-06-16 06:45:14 +0000
committer	Brad Fitzpatrick <bradfitz@golang.org>	2017-11-21 19:07:38 +0000
commit	9a14cd9e7556e68ea98255ae809b5862d574df9e (patch)
tree	e3f36968e830bc9313f0ea83bdf622f54ee6aca0 /src/bytes
parent	63ef3cde335e5b46fc3c8027b5e2f474a26717e8 (diff)
download	go-9a14cd9e7556e68ea98255ae809b5862d574df9e.tar.xz