aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_amd64.s
diff options
context:
space:
mode:
authorIlya Tocar <ilya.tocar@intel.com>2015-10-29 18:52:22 +0300
committerKeith Randall <khr@golang.org>2015-11-06 15:16:28 +0000
commit321a40721bbfcb9bcf6113d4e8afd1bc030f1d8f (patch)
treec6ec5c022428718857ac133583bed5a0bc79407d /src/runtime/asm_amd64.s
parentffb20631fc4d78ce3a4e84044ada7f8e55544996 (diff)
downloadgo-321a40721bbfcb9bcf6113d4e8afd1bc030f1d8f.tar.xz
runtime: optimize indexbytebody on amd64
Use avx2 to compare 32 bytes per iteration. Results (haswell): name old time/op new time/op delta IndexByte32-6 15.5ns ± 0% 14.7ns ± 5% -4.87% (p=0.000 n=16+20) IndexByte4K-6 360ns ± 0% 183ns ± 0% -49.17% (p=0.000 n=19+20) IndexByte4M-6 384µs ± 0% 256µs ± 1% -33.41% (p=0.000 n=20+20) IndexByte64M-6 6.20ms ± 0% 4.18ms ± 1% -32.52% (p=0.000 n=19+20) IndexBytePortable32-6 73.4ns ± 5% 75.8ns ± 3% +3.35% (p=0.000 n=20+19) IndexBytePortable4K-6 5.15µs ± 0% 5.15µs ± 0% ~ (all samples are equal) IndexBytePortable4M-6 5.26ms ± 0% 5.25ms ± 0% -0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 84.1ms ± 0% 84.1ms ± 0% -0.08% (p=0.012 n=18+20) Index32-6 352ns ± 0% 352ns ± 0% ~ (all samples are equal) Index4K-6 53.8µs ± 0% 53.8µs ± 0% -0.03% (p=0.000 n=16+18) Index4M-6 55.4ms ± 0% 55.4ms ± 0% ~ (p=0.149 n=20+19) Index64M-6 886ms ± 0% 886ms ± 0% ~ (p=0.108 n=20+20) IndexEasy32-6 80.3ns ± 0% 80.1ns ± 0% -0.21% (p=0.000 n=20+20) IndexEasy4K-6 426ns ± 0% 215ns ± 0% -49.53% (p=0.000 n=20+20) IndexEasy4M-6 388µs ± 0% 262µs ± 1% -32.42% (p=0.000 n=18+20) IndexEasy64M-6 6.20ms ± 0% 4.19ms ± 1% -32.47% (p=0.000 n=18+20) name old speed new speed delta IndexByte32-6 2.06GB/s ± 1% 2.17GB/s ± 5% +5.19% (p=0.000 n=18+20) IndexByte4K-6 11.4GB/s ± 0% 22.3GB/s ± 0% +96.45% (p=0.000 n=17+20) IndexByte4M-6 10.9GB/s ± 0% 16.4GB/s ± 1% +50.17% (p=0.000 n=20+20) IndexByte64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.19% (p=0.000 n=19+20) IndexBytePortable32-6 436MB/s ± 5% 422MB/s ± 3% -3.27% (p=0.000 n=20+19) IndexBytePortable4K-6 795MB/s ± 0% 795MB/s ± 0% ~ (p=0.940 n=17+18) IndexBytePortable4M-6 798MB/s ± 0% 799MB/s ± 0% +0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 798MB/s ± 0% 798MB/s ± 0% +0.08% (p=0.011 n=18+20) Index32-6 90.9MB/s ± 0% 90.9MB/s ± 0% -0.00% (p=0.025 n=20+20) Index4K-6 76.1MB/s ± 0% 76.1MB/s ± 0% +0.03% (p=0.000 n=14+15) Index4M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.076 n=20+19) Index64M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.456 n=20+17) IndexEasy32-6 399MB/s ± 0% 399MB/s ± 0% +0.20% (p=0.000 n=20+19) IndexEasy4K-6 9.60GB/s ± 0% 19.02GB/s ± 0% +98.19% (p=0.000 n=20+20) IndexEasy4M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +47.98% (p=0.000 n=18+20) IndexEasy64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.08% (p=0.000 n=18+20) Change-Id: I46075921dde9f3580a89544c0b3a2d8c9181ebc4 Reviewed-on: https://go-review.googlesource.com/16484 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Klaus Post <klauspost@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/runtime/asm_amd64.s')
-rw-r--r--src/runtime/asm_amd64.s35
1 files changed, 35 insertions, 0 deletions
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 8401accbcd..68b342d4db 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1940,6 +1940,9 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0
CMPQ BX, $16
JLT small
+ CMPQ BX, $32
+ JA avx2
+no_avx2:
// round up to first 16-byte boundary
TESTQ $15, SI
JZ aligned
@@ -2003,6 +2006,38 @@ small:
MOVQ $-1, (R8)
RET
+avx2:
+ CMPB runtime·support_avx2(SB), $1
+ JNE no_avx2
+ MOVD AX, X0
+ LEAQ -32(SI)(BX*1), R11
+ VPBROADCASTB X0, X1
+avx2_loop:
+ MOVHDU (DI), X2
+ VPCMPEQB X1, X2, X3
+ VPTEST X3, X3
+ JNZ avx2success
+ ADDQ $32, DI
+ CMPQ DI, R11
+ JLT avx2_loop
+ MOVQ R11, DI
+ MOVHDU (DI), X2
+ VPCMPEQB X1, X2, X3
+ VPTEST X3, X3
+ JNZ avx2success
+ VZEROUPPER
+ MOVQ $-1, (R8)
+ RET
+
+avx2success:
+ VPMOVMSKB X3, DX
+ BSFL DX, DX
+ SUBQ SI, DI
+ ADDQ DI, DX
+ MOVQ DX, (R8)
+ VZEROUPPER
+ RET
+
// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess: