diff options
| author | Wei Xiao <wei.xiao@arm.com> | 2017-06-16 06:45:14 +0000 |
|---|---|---|
| committer | Brad Fitzpatrick <bradfitz@golang.org> | 2017-11-21 19:07:38 +0000 |
| commit | 9a14cd9e7556e68ea98255ae809b5862d574df9e (patch) | |
| tree | e3f36968e830bc9313f0ea83bdf622f54ee6aca0 /src/bytes | |
| parent | 63ef3cde335e5b46fc3c8027b5e2f474a26717e8 (diff) | |
| download | go-9a14cd9e7556e68ea98255ae809b5862d574df9e.tar.xz | |
bytes: add optimized countByte for arm64
Use SIMD instructions when counting a single byte.
Inspired from runtime IndexByte implementation.
Benchmark results of bytes, where 1 byte in every 8 is the one we are looking:
name old time/op new time/op delta
CountSingle/10-8 96.1ns ± 1% 38.8ns ± 0% -59.64% (p=0.000 n=9+7)
CountSingle/32-8 172ns ± 2% 36ns ± 1% -79.27% (p=0.000 n=10+10)
CountSingle/4K-8 18.2µs ± 1% 0.9µs ± 0% -95.17% (p=0.000 n=9+10)
CountSingle/4M-8 18.4ms ± 0% 0.9ms ± 0% -95.00% (p=0.000 n=10+9)
CountSingle/64M-8 284ms ± 4% 19ms ± 0% -93.40% (p=0.000 n=10+10)
name old speed new speed delta
CountSingle/10-8 104MB/s ± 1% 258MB/s ± 0% +147.99% (p=0.000 n=9+10)
CountSingle/32-8 185MB/s ± 1% 897MB/s ± 1% +385.33% (p=0.000 n=9+10)
CountSingle/4K-8 225MB/s ± 1% 4658MB/s ± 0% +1967.40% (p=0.000 n=9+10)
CountSingle/4M-8 228MB/s ± 0% 4555MB/s ± 0% +1901.71% (p=0.000 n=10+9)
CountSingle/64M-8 236MB/s ± 4% 3575MB/s ± 0% +1414.69% (p=0.000 n=10+10)
Change-Id: Ifccb51b3c8658c49773fe05147c3cf3aead361e5
Reviewed-on: https://go-review.googlesource.com/71111
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/bytes')
| -rw-r--r-- | src/bytes/bytes_arm64.go | 68 | ||||
| -rw-r--r-- | src/bytes/bytes_arm64.s | 74 | ||||
| -rw-r--r-- | src/bytes/bytes_generic.go | 2 |
3 files changed, 143 insertions, 1 deletions
diff --git a/src/bytes/bytes_arm64.go b/src/bytes/bytes_arm64.go new file mode 100644 index 0000000000..846eeba486 --- /dev/null +++ b/src/bytes/bytes_arm64.go @@ -0,0 +1,68 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytes + +func countByte(s []byte, c byte) int // bytes_arm64.s + +// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. +func Index(s, sep []byte) int { + n := len(sep) + switch { + case n == 0: + return 0 + case n == 1: + return IndexByte(s, sep[0]) + case n == len(s): + if Equal(sep, s) { + return 0 + } + return -1 + case n > len(s): + return -1 + } + c := sep[0] + i := 0 + fails := 0 + t := s[:len(s)-n+1] + for i < len(t) { + if t[i] != c { + o := IndexByte(t[i:], c) + if o < 0 { + break + } + i += o + } + if Equal(s[i:i+n], sep) { + return i + } + i++ + fails++ + if fails >= 4+i>>4 && i < len(t) { + // Give up on IndexByte, it isn't skipping ahead + // far enough to be better than Rabin-Karp. + // Experiments (using IndexPeriodic) suggest + // the cutover is about 16 byte skips. + // TODO: if large prefixes of sep are matching + // we should cutover at even larger average skips, + // because Equal becomes that much more expensive. + // This code does not take that effect into account. + j := indexRabinKarp(s[i:], sep) + if j < 0 { + return -1 + } + return i + j + } + } + return -1 +} + +// Count counts the number of non-overlapping instances of sep in s. +// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s. +func Count(s, sep []byte) int { + if len(sep) == 1 { + return countByte(s, sep[0]) + } + return countGeneric(s, sep) +} diff --git a/src/bytes/bytes_arm64.s b/src/bytes/bytes_arm64.s new file mode 100644 index 0000000000..5e229d772b --- /dev/null +++ b/src/bytes/bytes_arm64.s @@ -0,0 +1,74 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// countByte(s []byte, c byte) int +TEXT bytes·countByte(SB),NOSPLIT,$0-40 + MOVD s_base+0(FP), R0 + MOVD s_len+8(FP), R2 + MOVBU c+24(FP), R1 + // R11 = count of byte to search + MOVD $0, R11 + // short path to handle 0-byte case + CBZ R2, done + CMP $0x20, R2 + // jump directly to tail if length < 32 + BLO tail + ANDS $0x1f, R0, R9 + BEQ chunk + // Work with not 32-byte aligned head + BIC $0x1f, R0, R3 + ADD $0x20, R3 +head_loop: + MOVBU.P 1(R0), R5 + CMP R5, R1 + CINC EQ, R11, R11 + SUB $1, R2, R2 + CMP R0, R3 + BNE head_loop + // Work with 32-byte aligned chunks +chunk: + BIC $0x1f, R2, R9 + // The first chunk can also be the last + CBZ R9, tail + // R3 = end of 32-byte chunks + ADD R0, R9, R3 + MOVD $1, R5 + VMOV R5, V5.B16 + // R2 = length of tail + SUB R9, R2, R2 + // Duplicate R1 (byte to search) to 16 1-byte elements of V0 + VMOV R1, V0.B16 + // Clear the low 64-bit element of V7 and V8 + VEOR V7.B8, V7.B8, V7.B8 + VEOR V8.B8, V8.B8, V8.B8 + // Count the target byte in 32-byte chunk +chunk_loop: + VLD1.P (R0), [V1.B16, V2.B16] + CMP R0, R3 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + // Clear the higher 7 bits + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + // Count lanes match the requested byte + VADDP V4.B16, V3.B16, V6.B16 // 32B->16B + VUADDLV V6.B16, V7 + // Accumulate the count in low 64-bit element of V8 when inside the loop + VADD V7, V8 + BNE chunk_loop + VMOV V8.D[0], R6 + ADD R6, R11, R11 + CBZ R2, done +tail: + // Work with tail shorter than 32 bytes + MOVBU.P 1(R0), R5 + SUB $1, R2, R2 + CMP R5, R1 + CINC EQ, R11, R11 + CBNZ R2, tail +done: + MOVD R11, ret+32(FP) + RET diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go index b30e53bf2e..0e7d33f09a 100644 --- a/src/bytes/bytes_generic.go +++ b/src/bytes/bytes_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!s390x +// +build !amd64,!s390x,!arm64 package bytes |
