aboutsummaryrefslogtreecommitdiff
path: root/src/internal
diff options
context:
space:
mode:
authorKeith Randall <khr@golang.org>2018-03-03 10:28:58 -0800
committerKeith Randall <khr@golang.org>2018-03-04 17:49:25 +0000
commit45964e4f9c950863adcaeb62fbe49f3fa913f27d (patch)
treebb07b6ae9f12caa6156932ff236ce3c4c42147e0 /src/internal
parent89ae7045f395de8eb4085e3ac8c1ebf59b029965 (diff)
downloadgo-45964e4f9c950863adcaeb62fbe49f3fa913f27d.tar.xz
internal/bytealg: move Count to bytealg
Move bytes.Count and strings.Count to bytealg. Update #19792 Change-Id: I3e4e14b504a0b71758885bb131e5656e342cf8cb Reviewed-on: https://go-review.googlesource.com/98495 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/internal')
-rw-r--r--src/internal/bytealg/count_amd64.s201
-rw-r--r--src/internal/bytealg/count_generic.go27
-rw-r--r--src/internal/bytealg/count_native.go33
-rw-r--r--src/internal/bytealg/equal_native.go9
4 files changed, 267 insertions, 3 deletions
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
new file mode 100644
index 0000000000..19eb1ac642
--- /dev/null
+++ b/src/internal/bytealg/count_amd64.s
@@ -0,0 +1,201 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+ CMPB internal∕cpu·X86+const_x86_HasPOPCNT(SB), $1
+ JEQ 2(PC)
+ JMP ·countGeneric(SB)
+ MOVQ b_base+0(FP), SI
+ MOVQ b_len+8(FP), BX
+ MOVB c+24(FP), AL
+ LEAQ ret+32(FP), R8
+ JMP countbody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+ CMPB internal∕cpu·X86+const_x86_HasPOPCNT(SB), $1
+ JEQ 2(PC)
+ JMP ·countGenericString(SB)
+ MOVQ s_base+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+16(FP), AL
+ LEAQ ret+24(FP), R8
+ JMP countbody<>(SB)
+
+// input:
+// SI: data
+// BX: data len
+// AL: byte sought
+// R8: address to put result
+// This function requires the POPCNT instruction.
+TEXT countbody<>(SB),NOSPLIT,$0
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
+ MOVD AX, X0
+ PUNPCKLBW X0, X0
+ PUNPCKLBW X0, X0
+ PSHUFL $0, X0, X0
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ $0, R12 // Accumulator
+
+ MOVQ SI, DI
+
+ CMPQ BX, $32
+ JA avx2
+sse:
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
+ PMOVMSKB X1, DX
+ // Count number of matching bytes
+ POPCNTL DX, DX
+ // Accumulate into R12
+ ADDQ DX, R12
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JBE sseloop
+
+ // Get the number of bytes to consider in the last 16 bytes
+ ANDQ $15, BX
+ JZ end
+
+ // Create mask to ignore overlap between previous 16 byte block
+ // and the next.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Process the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched so we need to mask part of it.
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+end:
+ MOVQ R12, (R8)
+ RET
+
+// handle for lengths < 16
+small:
+ TESTQ BX, BX
+ JEQ endzero
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ // We must ignore high bytes as they aren't part of our slice.
+ // Create mask.
+ MOVB BX, CX
+ MOVQ $1, R10
+ SALQ CL, R10
+ SUBQ $1, R10
+
+ // Load data
+ MOVOU (SI), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ MOVQ DX, (R8)
+ RET
+endzero:
+ MOVQ $0, (R8)
+ RET
+
+endofpage:
+ // We must ignore low bytes as they aren't part of our slice.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Load data into the high end of X1.
+ MOVOU -16(SI)(BX*1), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ POPCNTL DX, DX
+ MOVQ DX, (R8)
+ RET
+
+avx2:
+ CMPB runtime·support_avx2(SB), $1
+ JNE sse
+ MOVD AX, X0
+ LEAQ -32(SI)(BX*1), R11
+ VPBROADCASTB X0, Y1
+avx2_loop:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ ADDQ $32, DI
+ CMPQ DI, R11
+ JLE avx2_loop
+
+ // If last block is already processed,
+ // skip to the end.
+ CMPQ DI, R11
+ JEQ endavx
+
+ // Load address of the last 32 bytes.
+ // There is an overlap with the previous block.
+ MOVQ R11, DI
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ // Exit AVX mode.
+ VZEROUPPER
+
+ // Create mask to ignore overlap between previous 32 byte block
+ // and the next.
+ ANDQ $31, BX
+ MOVQ $32,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFFFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ MOVQ R12, (R8)
+ RET
+endavx:
+ // Exit AVX mode.
+ VZEROUPPER
+ MOVQ R12, (R8)
+ RET
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
new file mode 100644
index 0000000000..acc5a79827
--- /dev/null
+++ b/src/internal/bytealg/count_generic.go
@@ -0,0 +1,27 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package bytealg
+
+func Count(b []byte, c byte) int {
+ n := 0
+ for _, x := range b {
+ if x == c {
+ n++
+ }
+ }
+ return n
+}
+
+func CountString(s string, c byte) int {
+ n := 0
+ for i := 0; i < len(s); i++ {
+ if s[i] == c {
+ n++
+ }
+ }
+ return n
+}
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
new file mode 100644
index 0000000000..e6d3b066aa
--- /dev/null
+++ b/src/internal/bytealg/count_native.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64
+
+package bytealg
+
+//go:noescape
+func Count(b []byte, c byte) int
+
+//go:noescape
+func CountString(s string, c byte) int
+
+// A backup implementation to use by assembly.
+func countGeneric(b []byte, c byte) int {
+ n := 0
+ for _, x := range b {
+ if x == c {
+ n++
+ }
+ }
+ return n
+}
+func countGenericString(s string, c byte) int {
+ n := 0
+ for i := 0; i < len(s); i++ {
+ if s[i] == c {
+ n++
+ }
+ }
+ return n
+}
diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go
index 3d4c057f10..55d184a58b 100644
--- a/src/internal/bytealg/equal_native.go
+++ b/src/internal/bytealg/equal_native.go
@@ -15,9 +15,12 @@ import (
// TODO: find a better way to do this?
// Offsets into internal/cpu records for use in assembly.
-const x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
-const x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
-const s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+const (
+ x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
+ x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
+ x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+ s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+)
//go:noescape
func Equal(a, b []byte) bool