aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
author: Ilya Tocar <ilya.tocar@intel.com> 2015-10-28 23:20:26 +0300
committer: Keith Randall <khr@golang.org> 2015-11-02 18:39:38 +0000
commit: 0e23ca41d99c82d301badf1b762888e2c69e6c57 (patch)
tree: 9339a0fb834ac1df57454c2051c33211a4c1393e /src/runtime
parent: cf73357e37b08fe80600470bb7618e1ea951b42e (diff)
download: go-0e23ca41d99c82d301badf1b762888e2c69e6c57.tar.xz
bytes: speed up Compare() on amd64
Use AVX2 if available. Results (haswell), below:

name old time/op new time/op delta
BytesCompare1-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal)
BytesCompare2-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal)
BytesCompare4-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal)
BytesCompare8-6 9.29ns ± 2% 8.76ns ± 0% -5.72% (p=0.000 n=16+17)
BytesCompare16-6 9.29ns ± 2% 9.20ns ± 0% -1.02% (p=0.000 n=20+16)
BytesCompare32-6 11.4ns ± 1% 11.4ns ± 0% ~ (p=0.191 n=20+20)
BytesCompare64-6 14.4ns ± 0% 13.1ns ± 0% -8.68% (p=0.000 n=20+20)
BytesCompare128-6 20.2ns ± 0% 18.5ns ± 0% -8.27% (p=0.000 n=16+20)
BytesCompare256-6 29.3ns ± 0% 24.5ns ± 0% -16.38% (p=0.000 n=16+16)
BytesCompare512-6 46.8ns ± 0% 37.1ns ± 0% -20.78% (p=0.000 n=18+16)
BytesCompare1024-6 82.9ns ± 0% 62.3ns ± 0% -24.86% (p=0.000 n=20+14)
BytesCompare2048-6 155ns ± 0% 112ns ± 0% -27.74% (p=0.000 n=20+20)
CompareBytesEqual-6 10.1ns ± 1% 10.0ns ± 1% ~ (p=0.527 n=20+20)
CompareBytesToNil-6 10.0ns ± 2% 9.4ns ± 0% -6.57% (p=0.000 n=20+17)
CompareBytesEmpty-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal)
CompareBytesIdentical-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal)
CompareBytesSameLength-6 10.6ns ± 1% 10.6ns ± 1% ~ (p=0.240 n=20+20)
CompareBytesDifferentLength-6 10.6ns ± 0% 10.6ns ± 1% ~ (p=1.000 n=20+20)
CompareBytesBigUnaligned-6 132µs ± 1% 105µs ± 1% -20.61% (p=0.000 n=20+18)
CompareBytesBig-6 125µs ± 1% 105µs ± 1% -16.31% (p=0.000 n=20+20)
CompareBytesBigIdentical-6 8.13ns ± 0% 8.13ns ± 0% ~ (all samples are equal)

name old speed new speed delta
CompareBytesBigUnaligned-6 7.94GB/s ± 1% 10.01GB/s ± 1% +25.96% (p=0.000 n=20+18)
CompareBytesBig-6 8.38GB/s ± 1% 10.01GB/s ± 1% +19.48% (p=0.000 n=20+20)
CompareBytesBigIdentical-6 129TB/s ± 0% 129TB/s ± 0% +0.01% (p=0.003 n=17+19)

Change-Id: I820f31bab4582dd4204b146bb077c0d2f24cd8f5
Reviewed-on: https://go-review.googlesource.com/16434
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Klaus Post <klauspost@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Diffstat (limited to 'src/runtime')
-rw-r--r-- src/runtime/asm_amd64.s 70
-rw-r--r-- src/runtime/runtime2.go 2
2 files changed, 71 insertions, 1 deletions
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 454789c509..33d641e612 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -42,11 +42,37 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
JNE notintel
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
notintel:
+ // Do nothing.
MOVQ $1, AX
CPUID
MOVL CX, runtime·cpuid_ecx(SB)
MOVL DX, runtime·cpuid_edx(SB)
+ // Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
+ // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
+ // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
+ ANDL $0x18000000, CX // check for OSXSAVE and AVX bits
+ CMPL CX, $0x18000000
+ JNE noavx
+ MOVL $0, CX
+ // For XGETBV, OSXSAVE bit is required and sufficient
+ BYTE $0x0F; BYTE $0x01; BYTE $0xD0
+ ANDL $6, AX
+ CMPL AX, $6 // Check for OS support of YMM registers
+ JNE noavx
+ MOVB $1, runtime·support_avx(SB)
+ MOVL $7, AX
+ MOVL $0, CX
+ CPUID
+ ANDL $0x20, BX // check for AVX2 bit
+ CMPL BX, $0x20
+ JNE noavx2
+ MOVB $1, runtime·support_avx2(SB)
+ JMP nocpuinfo
+noavx:
+ MOVB $0, runtime·support_avx(SB)
+noavx2:
+ MOVB $0, runtime·support_avx2(SB)
nocpuinfo:
// if there is an _cgo_init, call it.
@@ -1508,7 +1534,10 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
JB small
CMPQ R8, $63
- JA big_loop
+ JBE loop
+ CMPB runtime·support_avx2(SB), $1
+ JEQ big_loop_avx2
+ JMP big_loop
loop:
CMPQ R8, $16
JBE _0through16
@@ -1657,6 +1686,45 @@ big_loop:
JBE loop
JMP big_loop
+ // Compare 64-bytes per loop iteration.
+ // Loop is unrolled and uses AVX2.
+big_loop_avx2:
+ MOVHDU (SI), X2
+ MOVHDU (DI), X3
+ MOVHDU 32(SI), X4
+ MOVHDU 32(DI), X5
+ VPCMPEQB X2, X3, X0
+ VPMOVMSKB X0, AX
+ XORL $0xffffffff, AX
+ JNE diff32_avx2
+ VPCMPEQB X4, X5, X6
+ VPMOVMSKB X6, AX
+ XORL $0xffffffff, AX
+ JNE diff64_avx2
+
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, R8
+ CMPQ R8, $64
+ JB big_loop_avx2_exit
+ JMP big_loop_avx2
+
+ // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+ VZEROUPPER
+ JMP diff16
+
+ // Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+ VZEROUPPER
+ JMP diff48
+
+ // For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+ VZEROUPPER
+ JMP loop
+
+
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
MOVQ s+0(FP), SI
MOVQ s_len+8(FP), BX
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 6b61cd62fa..f1337e570e 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -627,6 +627,8 @@ var (
cpuid_ecx uint32
cpuid_edx uint32
lfenceBeforeRdtsc bool
+ support_avx bool
+ support_avx2 bool
goarm uint8 // set by cmd/link on arm systems
)