aboutsummaryrefslogtreecommitdiff
path: root/src/bytes/bytes_arm64.s
diff options
context:
space:
mode:
authorWei Xiao <wei.xiao@arm.com>2017-06-16 06:45:14 +0000
committerBrad Fitzpatrick <bradfitz@golang.org>2017-11-21 19:07:38 +0000
commit9a14cd9e7556e68ea98255ae809b5862d574df9e (patch)
treee3f36968e830bc9313f0ea83bdf622f54ee6aca0 /src/bytes/bytes_arm64.s
parent63ef3cde335e5b46fc3c8027b5e2f474a26717e8 (diff)
downloadgo-9a14cd9e7556e68ea98255ae809b5862d574df9e.tar.xz
bytes: add optimized countByte for arm64
Use SIMD instructions when counting a single byte. Inspired from runtime IndexByte implementation. Benchmark results of bytes, where 1 byte in every 8 is the one we are looking: name old time/op new time/op delta CountSingle/10-8 96.1ns ± 1% 38.8ns ± 0% -59.64% (p=0.000 n=9+7) CountSingle/32-8 172ns ± 2% 36ns ± 1% -79.27% (p=0.000 n=10+10) CountSingle/4K-8 18.2µs ± 1% 0.9µs ± 0% -95.17% (p=0.000 n=9+10) CountSingle/4M-8 18.4ms ± 0% 0.9ms ± 0% -95.00% (p=0.000 n=10+9) CountSingle/64M-8 284ms ± 4% 19ms ± 0% -93.40% (p=0.000 n=10+10) name old speed new speed delta CountSingle/10-8 104MB/s ± 1% 258MB/s ± 0% +147.99% (p=0.000 n=9+10) CountSingle/32-8 185MB/s ± 1% 897MB/s ± 1% +385.33% (p=0.000 n=9+10) CountSingle/4K-8 225MB/s ± 1% 4658MB/s ± 0% +1967.40% (p=0.000 n=9+10) CountSingle/4M-8 228MB/s ± 0% 4555MB/s ± 0% +1901.71% (p=0.000 n=10+9) CountSingle/64M-8 236MB/s ± 4% 3575MB/s ± 0% +1414.69% (p=0.000 n=10+10) Change-Id: Ifccb51b3c8658c49773fe05147c3cf3aead361e5 Reviewed-on: https://go-review.googlesource.com/71111 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/bytes/bytes_arm64.s')
-rw-r--r--src/bytes/bytes_arm64.s74
1 files changed, 74 insertions, 0 deletions
diff --git a/src/bytes/bytes_arm64.s b/src/bytes/bytes_arm64.s
new file mode 100644
index 0000000000..5e229d772b
--- /dev/null
+++ b/src/bytes/bytes_arm64.s
@@ -0,0 +1,74 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// countByte(s []byte, c byte) int
+TEXT bytes·countByte(SB),NOSPLIT,$0-40
+ MOVD s_base+0(FP), R0
+ MOVD s_len+8(FP), R2
+ MOVBU c+24(FP), R1
+ // R11 = count of byte to search
+ MOVD $0, R11
+ // short path to handle 0-byte case
+ CBZ R2, done
+ CMP $0x20, R2
+ // jump directly to tail if length < 32
+ BLO tail
+ ANDS $0x1f, R0, R9
+ BEQ chunk
+ // Work with not 32-byte aligned head
+ BIC $0x1f, R0, R3
+ ADD $0x20, R3
+head_loop:
+ MOVBU.P 1(R0), R5
+ CMP R5, R1
+ CINC EQ, R11, R11
+ SUB $1, R2, R2
+ CMP R0, R3
+ BNE head_loop
+ // Work with 32-byte aligned chunks
+chunk:
+ BIC $0x1f, R2, R9
+ // The first chunk can also be the last
+ CBZ R9, tail
+ // R3 = end of 32-byte chunks
+ ADD R0, R9, R3
+ MOVD $1, R5
+ VMOV R5, V5.B16
+ // R2 = length of tail
+ SUB R9, R2, R2
+ // Duplicate R1 (byte to search) to 16 1-byte elements of V0
+ VMOV R1, V0.B16
+ // Clear the low 64-bit element of V7 and V8
+ VEOR V7.B8, V7.B8, V7.B8
+ VEOR V8.B8, V8.B8, V8.B8
+ // Count the target byte in 32-byte chunk
+chunk_loop:
+ VLD1.P (R0), [V1.B16, V2.B16]
+ CMP R0, R3
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ // Clear the higher 7 bits
+ VAND V5.B16, V3.B16, V3.B16
+ VAND V5.B16, V4.B16, V4.B16
+ // Count lanes match the requested byte
+ VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
+ VUADDLV V6.B16, V7
+ // Accumulate the count in low 64-bit element of V8 when inside the loop
+ VADD V7, V8
+ BNE chunk_loop
+ VMOV V8.D[0], R6
+ ADD R6, R11, R11
+ CBZ R2, done
+tail:
+ // Work with tail shorter than 32 bytes
+ MOVBU.P 1(R0), R5
+ SUB $1, R2, R2
+ CMP R5, R1
+ CINC EQ, R11, R11
+ CBNZ R2, tail
+done:
+ MOVD R11, ret+32(FP)
+ RET