diff options
| author | Ilya Tocar <ilya.tocar@intel.com> | 2015-10-28 23:20:26 +0300 |
|---|---|---|
| committer | Keith Randall <khr@golang.org> | 2015-11-02 18:39:38 +0000 |
| commit | 0e23ca41d99c82d301badf1b762888e2c69e6c57 (patch) | |
| tree | 9339a0fb834ac1df57454c2051c33211a4c1393e /src/runtime | |
| parent | cf73357e37b08fe80600470bb7618e1ea951b42e (diff) | |
| download | go-0e23ca41d99c82d301badf1b762888e2c69e6c57.tar.xz | |
bytes: speed up Compare() on amd64
Use AVX2 if available.
Results (haswell), below:
name old time/op new time/op delta
BytesCompare1-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal)
BytesCompare2-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal)
BytesCompare4-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal)
BytesCompare8-6 9.29ns ± 2% 8.76ns ± 0% -5.72% (p=0.000 n=16+17)
BytesCompare16-6 9.29ns ± 2% 9.20ns ± 0% -1.02% (p=0.000 n=20+16)
BytesCompare32-6 11.4ns ± 1% 11.4ns ± 0% ~ (p=0.191 n=20+20)
BytesCompare64-6 14.4ns ± 0% 13.1ns ± 0% -8.68% (p=0.000 n=20+20)
BytesCompare128-6 20.2ns ± 0% 18.5ns ± 0% -8.27% (p=0.000 n=16+20)
BytesCompare256-6 29.3ns ± 0% 24.5ns ± 0% -16.38% (p=0.000 n=16+16)
BytesCompare512-6 46.8ns ± 0% 37.1ns ± 0% -20.78% (p=0.000 n=18+16)
BytesCompare1024-6 82.9ns ± 0% 62.3ns ± 0% -24.86% (p=0.000 n=20+14)
BytesCompare2048-6 155ns ± 0% 112ns ± 0% -27.74% (p=0.000 n=20+20)
CompareBytesEqual-6 10.1ns ± 1% 10.0ns ± 1% ~ (p=0.527 n=20+20)
CompareBytesToNil-6 10.0ns ± 2% 9.4ns ± 0% -6.57% (p=0.000 n=20+17)
CompareBytesEmpty-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal)
CompareBytesIdentical-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal)
CompareBytesSameLength-6 10.6ns ± 1% 10.6ns ± 1% ~ (p=0.240 n=20+20)
CompareBytesDifferentLength-6 10.6ns ± 0% 10.6ns ± 1% ~ (p=1.000 n=20+20)
CompareBytesBigUnaligned-6 132µs ± 1% 105µs ± 1% -20.61% (p=0.000 n=20+18)
CompareBytesBig-6 125µs ± 1% 105µs ± 1% -16.31% (p=0.000 n=20+20)
CompareBytesBigIdentical-6 8.13ns ± 0% 8.13ns ± 0% ~ (all samples are equal)
name old speed new speed delta
CompareBytesBigUnaligned-6 7.94GB/s ± 1% 10.01GB/s ± 1% +25.96% (p=0.000 n=20+18)
CompareBytesBig-6 8.38GB/s ± 1% 10.01GB/s ± 1% +19.48% (p=0.000 n=20+20)
CompareBytesBigIdentical-6 129TB/s ± 0% 129TB/s ± 0% +0.01% (p=0.003 n=17+19)
Change-Id: I820f31bab4582dd4204b146bb077c0d2f24cd8f5
Reviewed-on: https://go-review.googlesource.com/16434
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Klaus Post <klauspost@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Diffstat (limited to 'src/runtime')
| -rw-r--r-- | src/runtime/asm_amd64.s | 70 | ||||
| -rw-r--r-- | src/runtime/runtime2.go | 2 |
2 files changed, 71 insertions, 1 deletions
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 454789c509..33d641e612 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -42,11 +42,37 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 JNE notintel MOVB $1, runtime·lfenceBeforeRdtsc(SB) notintel: + // Do nothing. MOVQ $1, AX CPUID MOVL CX, runtime·cpuid_ecx(SB) MOVL DX, runtime·cpuid_edx(SB) + // Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1] + // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf + ANDL $0x18000000, CX // check for OSXSAVE and AVX bits + CMPL CX, $0x18000000 + JNE noavx + MOVL $0, CX + // For XGETBV, OSXSAVE bit is required and sufficient + BYTE $0x0F; BYTE $0x01; BYTE $0xD0 + ANDL $6, AX + CMPL AX, $6 // Check for OS support of YMM registers + JNE noavx + MOVB $1, runtime·support_avx(SB) + MOVL $7, AX + MOVL $0, CX + CPUID + ANDL $0x20, BX // check for AVX2 bit + CMPL BX, $0x20 + JNE noavx2 + MOVB $1, runtime·support_avx2(SB) + JMP nocpuinfo +noavx: + MOVB $0, runtime·support_avx(SB) +noavx2: + MOVB $0, runtime·support_avx2(SB) nocpuinfo: // if there is an _cgo_init, call it. @@ -1508,7 +1534,10 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 JB small CMPQ R8, $63 - JA big_loop + JBE loop + CMPB runtime·support_avx2(SB), $1 + JEQ big_loop_avx2 + JMP big_loop loop: CMPQ R8, $16 JBE _0through16 @@ -1657,6 +1686,45 @@ big_loop: JBE loop JMP big_loop + // Compare 64-bytes per loop iteration. + // Loop is unrolled and uses AVX2. 
+big_loop_avx2: + MOVHDU (SI), X2 + MOVHDU (DI), X3 + MOVHDU 32(SI), X4 + MOVHDU 32(DI), X5 + VPCMPEQB X2, X3, X0 + VPMOVMSKB X0, AX + XORL $0xffffffff, AX + JNE diff32_avx2 + VPCMPEQB X4, X5, X6 + VPMOVMSKB X6, AX + XORL $0xffffffff, AX + JNE diff64_avx2 + + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, R8 + CMPQ R8, $64 + JB big_loop_avx2_exit + JMP big_loop_avx2 + + // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk. +diff32_avx2: + VZEROUPPER + JMP diff16 + + // Same as diff32_avx2, but for last 32 bytes. +diff64_avx2: + VZEROUPPER + JMP diff48 + + // For <64 bytes remainder jump to normal loop. +big_loop_avx2_exit: + VZEROUPPER + JMP loop + + TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 MOVQ s+0(FP), SI MOVQ s_len+8(FP), BX diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 6b61cd62fa..f1337e570e 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -627,6 +627,8 @@ var ( cpuid_ecx uint32 cpuid_edx uint32 lfenceBeforeRdtsc bool + support_avx bool + support_avx2 bool goarm uint8 // set by cmd/link on arm systems ) |
