aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorKeith Randall <khr@golang.org>2016-01-15 18:17:09 -0800
committerKeith Randall <khr@golang.org>2016-02-26 01:09:53 +0000
commit687abca1ea828dd4745d50c351f3b73ccd4d09be (patch)
tree91b98da3bac8c6916f81e2a555551010837ded11 /src
parenta337e30620ca8557943190a988f53487ced68f05 (diff)
downloadgo-687abca1ea828dd4745d50c351f3b73ccd4d09be.tar.xz
runtime: avoid using REP prefix for IndexByte
REP-prefixed instructions have a large startup cost. Avoid them like the plague. benchmark old ns/op new ns/op delta BenchmarkIndexByte10-8 22.4 5.34 -76.16% Fixes #13983 Change-Id: I857e956e240fc9681d053f2584ccf24c1b272bb3 Reviewed-on: https://go-review.googlesource.com/18703 Reviewed-by: Minux Ma <minux@golang.org> Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src')
-rw-r--r--src/bytes/bytes_test.go37
-rw-r--r--src/runtime/asm_amd64.s144
2 files changed, 110 insertions, 71 deletions
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
index 8df62fcc6a..a412dc89b9 100644
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -335,6 +335,41 @@ func TestIndexByteBig(t *testing.T) {
}
}
+// test a small index across all page offsets
+func TestIndexByteSmall(t *testing.T) {
+ b := make([]byte, 5015) // bigger than a page
+ // Make sure we find the correct byte even when straddling a page.
+ for i := 0; i <= len(b)-15; i++ {
+ for j := 0; j < 15; j++ {
+ b[i+j] = byte(100 + j)
+ }
+ for j := 0; j < 15; j++ {
+ p := IndexByte(b[i:i+15], byte(100+j))
+ if p != j {
+ t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 100+j, p)
+ }
+ }
+ for j := 0; j < 15; j++ {
+ b[i+j] = 0
+ }
+ }
+ // Make sure matches outside the slice never trigger.
+ for i := 0; i <= len(b)-15; i++ {
+ for j := 0; j < 15; j++ {
+ b[i+j] = 1
+ }
+ for j := 0; j < 15; j++ {
+ p := IndexByte(b[i:i+15], byte(0))
+ if p != -1 {
+ t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 0, p)
+ }
+ }
+ for j := 0; j < 15; j++ {
+ b[i+j] = 0
+ }
+ }
+}
+
func TestIndexRune(t *testing.T) {
for _, tt := range indexRuneTests {
a := []byte(tt.a)
@@ -348,10 +383,12 @@ func TestIndexRune(t *testing.T) {
var bmbuf []byte
+func BenchmarkIndexByte10(b *testing.B) { bmIndexByte(b, IndexByte, 10) }
func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) }
func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) }
func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) }
func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) }
+func BenchmarkIndexBytePortable10(b *testing.B) { bmIndexByte(b, IndexBytePortable, 10) }
func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) }
func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) }
func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) }
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 98a8e839ed..ac4630c833 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1838,80 +1838,98 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0-32
// AL: byte sought
// R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
- MOVQ SI, DI
-
- CMPQ BX, $16
- JLT small
-
- CMPQ BX, $32
- JA avx2
-no_avx2:
- // round up to first 16-byte boundary
- TESTQ $15, SI
- JZ aligned
- MOVQ SI, CX
- ANDQ $~15, CX
- ADDQ $16, CX
-
- // search the beginning
- SUBQ SI, CX
- REPN; SCASB
- JZ success
-
-// DI is 16-byte aligned; get ready to search using SSE instructions
-aligned:
- // round down to last 16-byte boundary
- MOVQ BX, R11
- ADDQ SI, R11
- ANDQ $~15, R11
-
- // shuffle X0 around so that each byte contains c
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
- JMP condition
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ SI, DI
+ CMPQ BX, $32
+ JA avx2
sse:
- // move the next 16-byte chunk of the buffer into X1
- MOVO (DI), X1
- // compare bytes in X0 to X1
- PCMPEQB X0, X1
- // take the top bit of each byte in X1 and put the result in DX
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
- TESTL DX, DX
- JNZ ssesuccess
- ADDQ $16, DI
-
-condition:
- CMPQ DI, R11
- JLT sse
+ // Find first set bit, if any.
+ BSFL DX, DX
+ JNZ ssesuccess
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JB sseloop
- // search the end
- MOVQ SI, CX
- ADDQ BX, CX
- SUBQ R11, CX
- // if CX == 0, the zero flag will be set and we'll end up
- // returning a false success
- JZ failure
- REPN; SCASB
- JZ success
+ // Search the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched, but that's ok.
+ MOVQ AX, DI
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ BSFL DX, DX
+ JNZ ssesuccess
failure:
MOVQ $-1, (R8)
RET
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+ SUBQ SI, DI // Compute offset of chunk within data.
+ ADDQ DX, DI // Add offset of byte within chunk.
+ MOVQ DI, (R8)
+ RET
+
// handle for lengths < 16
small:
- MOVQ BX, CX
- REPN; SCASB
- JZ success
- MOVQ $-1, (R8)
+ TESTQ BX, BX
+ JEQ failure
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ MOVOU (SI), X1 // Load data
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ CMPL DX, BX
+ JAE failure // Match is past end of data.
+ MOVQ DX, (R8)
+ RET
+
+endofpage:
+ MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ MOVL BX, CX
+ SHLL CX, DX
+ SHRL $16, DX // Shift desired bits down to bottom of register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ MOVQ DX, (R8)
RET
avx2:
CMPB runtime·support_avx2(SB), $1
- JNE no_avx2
+ JNE sse
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
@@ -1941,22 +1959,6 @@ avx2success:
VZEROUPPER
RET
-// we've found the chunk containing the byte
-// now just figure out which specific byte it is
-ssesuccess:
- // get the index of the least significant set bit
- BSFW DX, DX
- SUBQ SI, DI
- ADDQ DI, DX
- MOVQ DX, (R8)
- RET
-
-success:
- SUBQ SI, DI
- SUBL $1, DI
- MOVQ DI, (R8)
- RET
-
TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVQ a_len+8(FP), BX
MOVQ b_len+32(FP), CX