aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_amd64.s
diff options
context:
space:
mode:
authorIlya Tocar <ilya.tocar@intel.com>2016-04-28 17:39:55 +0300
committerIlya Tocar <ilya.tocar@intel.com>2016-09-07 10:43:13 +0000
commit0cff219c1279cb76f042004bffcefba0a169cb67 (patch)
treea929e6aaf14d9307523a72b278d053854676d7bb /src/runtime/asm_amd64.s
parent83c73a85db84a04c8e60e52cfa348fc6b675fbf7 (diff)
downloadgo-0cff219c1279cb76f042004bffcefba0a169cb67.tar.xz
strings: use AVX2 for Index if available
IndexHard4-4 1.50ms ± 2% 0.71ms ± 0% -52.36% (p=0.000 n=20+19)

This also fixes a bug that caused a string of length 16 to use two 8-byte comparisons instead of one 16-byte comparison, and adds a test for cases where partial_match fails.

Change-Id: I1ee8fc4e068bb36c95c45de78f067c822c0d9df0
Reviewed-on: https://go-review.googlesource.com/22551
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Diffstat (limited to 'src/runtime/asm_amd64.s')
-rw-r--r--src/runtime/asm_amd64.s65
1 files changed, 62 insertions, 3 deletions
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index c9d6b90d80..488c34a233 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1695,6 +1695,16 @@ big_loop_avx2_exit:
JMP loop
+TEXT strings·supportAVX2(SB),NOSPLIT,$0-1 // no local frame, 1 byte of results: returns the runtime·support_avx2 byte
+ MOVBLZX runtime·support_avx2(SB), AX // AX = runtime·support_avx2 byte, zero-extended to 32 bits
+ MOVB AX, ret+0(FP) // store the low byte of AX into the one-byte result slot
+ RET
+
+TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1 // same body as strings·supportAVX2: returns the runtime·support_avx2 byte
+ MOVBLZX runtime·support_avx2(SB), AX // AX = runtime·support_avx2 byte, zero-extended to 32 bits
+ MOVB AX, ret+0(FP) // store the low byte of AX into the one-byte result slot
+ RET
+
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
MOVQ s+0(FP), DI
// We want len in DX and AX, because PCMPESTRI implicitly consumes them
@@ -1809,7 +1819,7 @@ loop8:
JB loop8
JMP fail
_9_or_more:
- CMPQ AX, $16
+ CMPQ AX, $15
JA _16_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
@@ -1833,7 +1843,7 @@ partial_success9to15:
JMP fail
_16_or_more:
CMPQ AX, $16
- JA _17_to_31
+ JA _17_or_more
MOVOU (BP), X1
LEAQ -15(DI)(DX*1), DX
loop16:
@@ -1846,7 +1856,9 @@ loop16:
CMPQ DI,DX
JB loop16
JMP fail
-_17_to_31:
+_17_or_more:
+ CMPQ AX, $31
+ JA _32_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVOU -16(BP)(AX*1), X0
@@ -1870,9 +1882,56 @@ partial_success17to31:
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
+ JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+ CMPQ AX, $32
+ JA _33_to_63
+ VMOVDQU (BP), Y1
+ LEAQ -31(DI)(DX*1), DX
+loop32:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop32
+ JMP fail_avx2
+_33_to_63:
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ VMOVDQU -32(BP)(AX*1), Y0
+ VMOVDQU (BP), Y1
+loop33to63:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE partial_success33to63
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+ JMP fail_avx2
+partial_success33to63:
+ VMOVDQU -32(AX)(DI*1), Y3
+ VPCMPEQB Y0, Y3, Y4
+ VPMOVMSKB Y4, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+fail_avx2:
+ VZEROUPPER
fail:
MOVQ $-1, (R11)
RET
+success_avx2:
+ VZEROUPPER
+ JMP success
sse42:
MOVL runtime·cpuid_ecx(SB), CX
ANDL $0x100000, CX