aboutsummaryrefslogtreecommitdiff
path: root/src/internal/bytealg
diff options
context:
space:
mode:
author vpachkov <slava.pach@gmail.com> 2022-04-01 20:37:30 +0300
committer Keith Randall <khr@golang.org> 2022-08-18 17:17:01 +0000
commit 330cffb86951414da5ef2fde912167f6b4d1d91e (patch)
tree fde1770bdb30eaa5bf612f69af38bc35886d0406 /src/internal/bytealg
parent c82bbc0e8edbbebe47e92729e8f3f1b60d380b5b (diff)
download go-330cffb86951414da5ef2fde912167f6b4d1d91e.tar.xz
runtime: remove dead code and unnecessary checks for amd64
Use amd64 assembly header to remove unnecessary cpu flags checks and dead code that is guaranteed to not be executed when compiling for specific microarchitectures. name old time/op new time/op delta BytesCompare/1-12 3.88ns ± 1% 3.18ns ± 1% -18.15% (p=0.008 n=5+5) BytesCompare/2-12 3.89ns ± 1% 3.21ns ± 2% -17.66% (p=0.008 n=5+5) BytesCompare/4-12 3.89ns ± 0% 3.17ns ± 0% -18.62% (p=0.008 n=5+5) BytesCompare/8-12 3.44ns ± 2% 3.39ns ± 1% -1.36% (p=0.008 n=5+5) BytesCompare/16-12 3.40ns ± 1% 3.14ns ± 0% -7.77% (p=0.008 n=5+5) BytesCompare/32-12 3.90ns ± 1% 3.65ns ± 0% -6.19% (p=0.008 n=5+5) BytesCompare/64-12 4.96ns ± 1% 4.71ns ± 2% -4.98% (p=0.008 n=5+5) BytesCompare/128-12 6.42ns ± 0% 5.99ns ± 4% -6.75% (p=0.008 n=5+5) BytesCompare/256-12 9.36ns ± 0% 7.40ns ± 0% -20.97% (p=0.008 n=5+5) BytesCompare/512-12 15.9ns ± 1% 11.4ns ± 1% -28.36% (p=0.008 n=5+5) BytesCompare/1024-12 27.0ns ± 0% 19.3ns ± 0% -28.36% (p=0.008 n=5+5) BytesCompare/2048-12 50.2ns ± 0% 43.3ns ± 0% -13.71% (p=0.008 n=5+5) [Geo mean] 7.13ns 6.07ns -14.86% name old speed new speed delta Count/10-12 723MB/s ± 0% 704MB/s ± 1% -2.73% (p=0.008 n=5+5) Count/32-12 2.21GB/s ± 0% 2.12GB/s ± 2% -4.21% (p=0.008 n=5+5) Count/4K-12 1.03GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=5+5) Count/4M-12 1.04GB/s ± 0% 1.02GB/s ± 2% ~ (p=0.310 n=5+5) Count/64M-12 1.02GB/s ± 0% 1.01GB/s ± 1% -1.00% (p=0.016 n=5+5) CountEasy/10-12 779MB/s ± 0% 768MB/s ± 1% -1.48% (p=0.008 n=5+5) CountEasy/32-12 2.15GB/s ± 0% 2.09GB/s ± 1% -2.71% (p=0.008 n=5+5) CountEasy/4K-12 45.1GB/s ± 1% 45.2GB/s ± 1% ~ (p=0.421 n=5+5) CountEasy/4M-12 36.4GB/s ± 1% 36.5GB/s ± 1% ~ (p=0.690 n=5+5) CountEasy/64M-12 16.1GB/s ± 2% 16.4GB/s ± 1% ~ (p=0.056 n=5+5) CountSingle/10-12 2.15GB/s ± 2% 2.22GB/s ± 1% +3.37% (p=0.008 n=5+5) CountSingle/32-12 5.86GB/s ± 1% 5.76GB/s ± 1% -1.55% (p=0.008 n=5+5) CountSingle/4K-12 54.6GB/s ± 1% 55.0GB/s ± 1% ~ (p=0.548 n=5+5) CountSingle/4M-12 45.9GB/s ± 4% 46.4GB/s ± 2% ~ (p=0.548 n=5+5) CountSingle/64M-12 17.3GB/s ± 1% 17.2GB/s ± 
2% ~ (p=1.000 n=5+5) [Geo mean] 5.11GB/s 5.08GB/s -0.53% name old speed new speed delta Equal/1-12 200MB/s ± 0% 188MB/s ± 1% -6.11% (p=0.008 n=5+5) Equal/6-12 1.20GB/s ± 0% 1.13GB/s ± 1% -6.38% (p=0.008 n=5+5) Equal/9-12 1.67GB/s ± 3% 1.74GB/s ± 1% +3.83% (p=0.008 n=5+5) Equal/15-12 2.82GB/s ± 1% 2.89GB/s ± 1% +2.63% (p=0.008 n=5+5) Equal/16-12 2.96GB/s ± 1% 3.08GB/s ± 1% +3.95% (p=0.008 n=5+5) Equal/20-12 3.33GB/s ± 1% 3.54GB/s ± 1% +6.36% (p=0.008 n=5+5) Equal/32-12 4.57GB/s ± 0% 5.26GB/s ± 1% +15.09% (p=0.008 n=5+5) Equal/4K-12 62.0GB/s ± 1% 65.9GB/s ± 2% +6.29% (p=0.008 n=5+5) Equal/4M-12 23.6GB/s ± 2% 24.8GB/s ± 4% +5.43% (p=0.008 n=5+5) Equal/64M-12 11.1GB/s ± 2% 11.3GB/s ± 1% +1.69% (p=0.008 n=5+5) [Geo mean] 3.91GB/s 4.03GB/s +3.11% name old speed new speed delta IndexByte/10-12 2.64GB/s ± 0% 2.69GB/s ± 0% +1.67% (p=0.008 n=5+5) IndexByte/32-12 6.79GB/s ± 0% 6.27GB/s ± 0% -7.57% (p=0.008 n=5+5) IndexByte/4K-12 56.2GB/s ± 0% 56.9GB/s ± 0% +1.27% (p=0.008 n=5+5) IndexByte/4M-12 40.1GB/s ± 1% 41.7GB/s ± 1% +4.05% (p=0.008 n=5+5) IndexByte/64M-12 17.5GB/s ± 0% 17.7GB/s ± 1% ~ (p=0.095 n=5+5) IndexBytePortable/10-12 2.06GB/s ± 1% 2.16GB/s ± 1% +5.08% (p=0.008 n=5+5) IndexBytePortable/32-12 1.40GB/s ± 1% 1.54GB/s ± 1% +10.05% (p=0.008 n=5+5) IndexBytePortable/4K-12 3.99GB/s ± 0% 4.08GB/s ± 0% +2.16% (p=0.008 n=5+5) IndexBytePortable/4M-12 4.05GB/s ± 1% 4.08GB/s ± 2% ~ (p=0.095 n=5+5) IndexBytePortable/64M-12 3.80GB/s ± 1% 3.81GB/s ± 0% ~ (p=0.421 n=5+5) IndexRune/10-12 746MB/s ± 1% 752MB/s ± 0% +0.85% (p=0.008 n=5+5) IndexRune/32-12 2.33GB/s ± 0% 2.42GB/s ± 0% +3.66% (p=0.008 n=5+5) IndexRune/4K-12 44.4GB/s ± 0% 44.2GB/s ± 0% ~ (p=0.095 n=5+5) IndexRune/4M-12 36.2GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.841 n=5+5) IndexRune/64M-12 16.2GB/s ± 2% 16.3GB/s ± 2% ~ (p=0.548 n=5+5) IndexRuneASCII/10-12 2.57GB/s ± 0% 2.58GB/s ± 0% +0.63% (p=0.008 n=5+5) IndexRuneASCII/32-12 6.00GB/s ± 0% 6.30GB/s ± 1% +4.98% (p=0.008 n=5+5) IndexRuneASCII/4K-12 56.7GB/s ± 0% 56.8GB/s ± 1% ~ 
(p=0.151 n=5+5) IndexRuneASCII/4M-12 41.6GB/s ± 1% 41.7GB/s ± 2% ~ (p=0.151 n=5+5) IndexRuneASCII/64M-12 17.7GB/s ± 1% 17.6GB/s ± 1% ~ (p=0.222 n=5+5) Index/10-12 1.06GB/s ± 1% 1.06GB/s ± 0% ~ (p=0.310 n=5+5) Index/32-12 3.57GB/s ± 0% 3.56GB/s ± 1% ~ (p=0.056 n=5+5) Index/4K-12 1.02GB/s ± 2% 1.03GB/s ± 0% ~ (p=0.690 n=5+5) Index/4M-12 1.04GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=4+5) Index/64M-12 1.02GB/s ± 0% 1.02GB/s ± 0% ~ (p=0.905 n=5+4) IndexEasy/10-12 1.12GB/s ± 2% 1.15GB/s ± 1% +3.10% (p=0.008 n=5+5) IndexEasy/32-12 3.14GB/s ± 2% 3.13GB/s ± 1% ~ (p=0.310 n=5+5) IndexEasy/4K-12 47.6GB/s ± 1% 47.7GB/s ± 2% ~ (p=0.310 n=5+5) IndexEasy/4M-12 36.4GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.690 n=5+5) IndexEasy/64M-12 16.1GB/s ± 1% 16.4GB/s ± 5% ~ (p=0.151 n=5+5) [Geo mean] 6.39GB/s 6.46GB/s +1.11% Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82 Reviewed-on: https://go-review.googlesource.com/c/go/+/397576 Reviewed-by: Keith Randall <khr@google.com> Run-TryBot: Keith Randall <khr@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src/internal/bytealg')
-rw-r--r-- src/internal/bytealg/compare_amd64.s | 7
-rw-r--r-- src/internal/bytealg/count_amd64.s | 7
-rw-r--r-- src/internal/bytealg/equal_amd64.s | 3
-rw-r--r-- src/internal/bytealg/index_amd64.s | 2
-rw-r--r-- src/internal/bytealg/indexbyte_amd64.s | 2
5 files changed, 21 insertions, 0 deletions
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
index 4ccaca5e87..fdd015f560 100644
--- a/src/internal/bytealg/compare_amd64.s
+++ b/src/internal/bytealg/compare_amd64.s
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
+#include "asm_amd64.h"
#include "textflag.h"
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
@@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0
CMPQ R8, $63
JBE loop
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JEQ big_loop_avx2
JMP big_loop
+#else
+ JMP big_loop_avx2
+#endif
loop:
CMPQ R8, $16
JBE _0through16
@@ -155,6 +160,7 @@ allsame:
RET
// this works for >= 64 bytes of data.
+#ifndef hasAVX2
big_loop:
MOVOU (SI), X0
MOVOU (DI), X1
@@ -190,6 +196,7 @@ big_loop:
CMPQ R8, $64
JBE loop
JMP big_loop
+#endif
// Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2.
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
index fa864c4c76..efb17f84b7 100644
--- a/src/internal/bytealg/count_amd64.s
+++ b/src/internal/bytealg/count_amd64.s
@@ -3,12 +3,15 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
+#include "asm_amd64.h"
#include "textflag.h"
TEXT ·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGeneric(SB)
+#endif
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), BX
MOVB c+24(FP), AL
@@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40
JMP countbody<>(SB)
TEXT ·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGenericString(SB)
+#endif
MOVQ s_base+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
@@ -151,8 +156,10 @@ endofpage:
RET
avx2:
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
+#endif
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
index dd46e2e0fd..d178a33779 100644
--- a/src/internal/bytealg/equal_amd64.s
+++ b/src/internal/bytealg/equal_amd64.s
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
+#include "asm_amd64.h"
#include "textflag.h"
// memequal(a, b unsafe.Pointer, size uintptr) bool
@@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0
JB small
CMPQ BX, $64
JB bigloop
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JE hugeloop_avx2
@@ -76,6 +78,7 @@ hugeloop:
JEQ hugeloop
XORQ AX, AX // return 0
RET
+#endif
// 64 bytes at a time using ymm registers
hugeloop_avx2:
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
index 6193b57239..04314917b8 100644
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@@ -233,8 +233,10 @@ success_avx2:
VZEROUPPER
JMP success
sse42:
+#ifndef hasSSE42
CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
JNE no_sse42
+#endif
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
index f78093c539..1ca70e39e2 100644
--- a/src/internal/bytealg/indexbyte_amd64.s
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -115,8 +115,10 @@ endofpage:
RET
avx2:
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
+#endif
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1