aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorvpachkov <slava.pach@gmail.com>2022-04-01 20:37:30 +0300
committerKeith Randall <khr@golang.org>2022-08-18 17:17:01 +0000
commit330cffb86951414da5ef2fde912167f6b4d1d91e (patch)
treefde1770bdb30eaa5bf612f69af38bc35886d0406 /src
parentc82bbc0e8edbbebe47e92729e8f3f1b60d380b5b (diff)
downloadgo-330cffb86951414da5ef2fde912167f6b4d1d91e.tar.xz
runtime: remove dead code and unnecessary checks for amd64
Use amd64 assembly header to remove unnecessary cpu flags checks and dead code that is guaranteed to not be executed when compiling for specific microarchitectures. name old time/op new time/op delta BytesCompare/1-12 3.88ns ± 1% 3.18ns ± 1% -18.15% (p=0.008 n=5+5) BytesCompare/2-12 3.89ns ± 1% 3.21ns ± 2% -17.66% (p=0.008 n=5+5) BytesCompare/4-12 3.89ns ± 0% 3.17ns ± 0% -18.62% (p=0.008 n=5+5) BytesCompare/8-12 3.44ns ± 2% 3.39ns ± 1% -1.36% (p=0.008 n=5+5) BytesCompare/16-12 3.40ns ± 1% 3.14ns ± 0% -7.77% (p=0.008 n=5+5) BytesCompare/32-12 3.90ns ± 1% 3.65ns ± 0% -6.19% (p=0.008 n=5+5) BytesCompare/64-12 4.96ns ± 1% 4.71ns ± 2% -4.98% (p=0.008 n=5+5) BytesCompare/128-12 6.42ns ± 0% 5.99ns ± 4% -6.75% (p=0.008 n=5+5) BytesCompare/256-12 9.36ns ± 0% 7.40ns ± 0% -20.97% (p=0.008 n=5+5) BytesCompare/512-12 15.9ns ± 1% 11.4ns ± 1% -28.36% (p=0.008 n=5+5) BytesCompare/1024-12 27.0ns ± 0% 19.3ns ± 0% -28.36% (p=0.008 n=5+5) BytesCompare/2048-12 50.2ns ± 0% 43.3ns ± 0% -13.71% (p=0.008 n=5+5) [Geo mean] 7.13ns 6.07ns -14.86% name old speed new speed delta Count/10-12 723MB/s ± 0% 704MB/s ± 1% -2.73% (p=0.008 n=5+5) Count/32-12 2.21GB/s ± 0% 2.12GB/s ± 2% -4.21% (p=0.008 n=5+5) Count/4K-12 1.03GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=5+5) Count/4M-12 1.04GB/s ± 0% 1.02GB/s ± 2% ~ (p=0.310 n=5+5) Count/64M-12 1.02GB/s ± 0% 1.01GB/s ± 1% -1.00% (p=0.016 n=5+5) CountEasy/10-12 779MB/s ± 0% 768MB/s ± 1% -1.48% (p=0.008 n=5+5) CountEasy/32-12 2.15GB/s ± 0% 2.09GB/s ± 1% -2.71% (p=0.008 n=5+5) CountEasy/4K-12 45.1GB/s ± 1% 45.2GB/s ± 1% ~ (p=0.421 n=5+5) CountEasy/4M-12 36.4GB/s ± 1% 36.5GB/s ± 1% ~ (p=0.690 n=5+5) CountEasy/64M-12 16.1GB/s ± 2% 16.4GB/s ± 1% ~ (p=0.056 n=5+5) CountSingle/10-12 2.15GB/s ± 2% 2.22GB/s ± 1% +3.37% (p=0.008 n=5+5) CountSingle/32-12 5.86GB/s ± 1% 5.76GB/s ± 1% -1.55% (p=0.008 n=5+5) CountSingle/4K-12 54.6GB/s ± 1% 55.0GB/s ± 1% ~ (p=0.548 n=5+5) CountSingle/4M-12 45.9GB/s ± 4% 46.4GB/s ± 2% ~ (p=0.548 n=5+5) CountSingle/64M-12 17.3GB/s ± 1% 17.2GB/s ± 2% ~ (p=1.000 n=5+5) [Geo mean] 5.11GB/s 5.08GB/s -0.53% name old speed new speed delta Equal/1-12 200MB/s ± 0% 188MB/s ± 1% -6.11% (p=0.008 n=5+5) Equal/6-12 1.20GB/s ± 0% 1.13GB/s ± 1% -6.38% (p=0.008 n=5+5) Equal/9-12 1.67GB/s ± 3% 1.74GB/s ± 1% +3.83% (p=0.008 n=5+5) Equal/15-12 2.82GB/s ± 1% 2.89GB/s ± 1% +2.63% (p=0.008 n=5+5) Equal/16-12 2.96GB/s ± 1% 3.08GB/s ± 1% +3.95% (p=0.008 n=5+5) Equal/20-12 3.33GB/s ± 1% 3.54GB/s ± 1% +6.36% (p=0.008 n=5+5) Equal/32-12 4.57GB/s ± 0% 5.26GB/s ± 1% +15.09% (p=0.008 n=5+5) Equal/4K-12 62.0GB/s ± 1% 65.9GB/s ± 2% +6.29% (p=0.008 n=5+5) Equal/4M-12 23.6GB/s ± 2% 24.8GB/s ± 4% +5.43% (p=0.008 n=5+5) Equal/64M-12 11.1GB/s ± 2% 11.3GB/s ± 1% +1.69% (p=0.008 n=5+5) [Geo mean] 3.91GB/s 4.03GB/s +3.11% name old speed new speed delta IndexByte/10-12 2.64GB/s ± 0% 2.69GB/s ± 0% +1.67% (p=0.008 n=5+5) IndexByte/32-12 6.79GB/s ± 0% 6.27GB/s ± 0% -7.57% (p=0.008 n=5+5) IndexByte/4K-12 56.2GB/s ± 0% 56.9GB/s ± 0% +1.27% (p=0.008 n=5+5) IndexByte/4M-12 40.1GB/s ± 1% 41.7GB/s ± 1% +4.05% (p=0.008 n=5+5) IndexByte/64M-12 17.5GB/s ± 0% 17.7GB/s ± 1% ~ (p=0.095 n=5+5) IndexBytePortable/10-12 2.06GB/s ± 1% 2.16GB/s ± 1% +5.08% (p=0.008 n=5+5) IndexBytePortable/32-12 1.40GB/s ± 1% 1.54GB/s ± 1% +10.05% (p=0.008 n=5+5) IndexBytePortable/4K-12 3.99GB/s ± 0% 4.08GB/s ± 0% +2.16% (p=0.008 n=5+5) IndexBytePortable/4M-12 4.05GB/s ± 1% 4.08GB/s ± 2% ~ (p=0.095 n=5+5) IndexBytePortable/64M-12 3.80GB/s ± 1% 3.81GB/s ± 0% ~ (p=0.421 n=5+5) IndexRune/10-12 746MB/s ± 1% 752MB/s ± 0% +0.85% (p=0.008 n=5+5) IndexRune/32-12 2.33GB/s ± 0% 2.42GB/s ± 0% +3.66% (p=0.008 n=5+5) IndexRune/4K-12 44.4GB/s ± 0% 44.2GB/s ± 0% ~ (p=0.095 n=5+5) IndexRune/4M-12 36.2GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.841 n=5+5) IndexRune/64M-12 16.2GB/s ± 2% 16.3GB/s ± 2% ~ (p=0.548 n=5+5) IndexRuneASCII/10-12 2.57GB/s ± 0% 2.58GB/s ± 0% +0.63% (p=0.008 n=5+5) IndexRuneASCII/32-12 6.00GB/s ± 0% 6.30GB/s ± 1% +4.98% (p=0.008 n=5+5) IndexRuneASCII/4K-12 56.7GB/s ± 0% 56.8GB/s ± 1% ~ (p=0.151 n=5+5) IndexRuneASCII/4M-12 41.6GB/s ± 1% 41.7GB/s ± 2% ~ (p=0.151 n=5+5) IndexRuneASCII/64M-12 17.7GB/s ± 1% 17.6GB/s ± 1% ~ (p=0.222 n=5+5) Index/10-12 1.06GB/s ± 1% 1.06GB/s ± 0% ~ (p=0.310 n=5+5) Index/32-12 3.57GB/s ± 0% 3.56GB/s ± 1% ~ (p=0.056 n=5+5) Index/4K-12 1.02GB/s ± 2% 1.03GB/s ± 0% ~ (p=0.690 n=5+5) Index/4M-12 1.04GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=4+5) Index/64M-12 1.02GB/s ± 0% 1.02GB/s ± 0% ~ (p=0.905 n=5+4) IndexEasy/10-12 1.12GB/s ± 2% 1.15GB/s ± 1% +3.10% (p=0.008 n=5+5) IndexEasy/32-12 3.14GB/s ± 2% 3.13GB/s ± 1% ~ (p=0.310 n=5+5) IndexEasy/4K-12 47.6GB/s ± 1% 47.7GB/s ± 2% ~ (p=0.310 n=5+5) IndexEasy/4M-12 36.4GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.690 n=5+5) IndexEasy/64M-12 16.1GB/s ± 1% 16.4GB/s ± 5% ~ (p=0.151 n=5+5) [Geo mean] 6.39GB/s 6.46GB/s +1.11% Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82 Reviewed-on: https://go-review.googlesource.com/c/go/+/397576 Reviewed-by: Keith Randall <khr@google.com> Run-TryBot: Keith Randall <khr@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src')
-rw-r--r--src/internal/bytealg/compare_amd64.s7
-rw-r--r--src/internal/bytealg/count_amd64.s7
-rw-r--r--src/internal/bytealg/equal_amd64.s3
-rw-r--r--src/internal/bytealg/index_amd64.s2
-rw-r--r--src/internal/bytealg/indexbyte_amd64.s2
-rw-r--r--src/runtime/asm_amd64.h11
-rw-r--r--src/runtime/mkpreempt.go5
-rw-r--r--src/runtime/preempt_amd64.s3
8 files changed, 40 insertions, 0 deletions
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
index 4ccaca5e87..fdd015f560 100644
--- a/src/internal/bytealg/compare_amd64.s
+++ b/src/internal/bytealg/compare_amd64.s
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
+#include "asm_amd64.h"
#include "textflag.h"
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
@@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0
CMPQ R8, $63
JBE loop
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JEQ big_loop_avx2
JMP big_loop
+#else
+ JMP big_loop_avx2
+#endif
loop:
CMPQ R8, $16
JBE _0through16
@@ -155,6 +160,7 @@ allsame:
RET
// this works for >= 64 bytes of data.
+#ifndef hasAVX2
big_loop:
MOVOU (SI), X0
MOVOU (DI), X1
@@ -190,6 +196,7 @@ big_loop:
CMPQ R8, $64
JBE loop
JMP big_loop
+#endif
// Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2.
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
index fa864c4c76..efb17f84b7 100644
--- a/src/internal/bytealg/count_amd64.s
+++ b/src/internal/bytealg/count_amd64.s
@@ -3,12 +3,15 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
+#include "asm_amd64.h"
#include "textflag.h"
TEXT ·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGeneric(SB)
+#endif
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), BX
MOVB c+24(FP), AL
@@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40
JMP countbody<>(SB)
TEXT ·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGenericString(SB)
+#endif
MOVQ s_base+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
@@ -151,8 +156,10 @@ endofpage:
RET
avx2:
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
+#endif
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
index dd46e2e0fd..d178a33779 100644
--- a/src/internal/bytealg/equal_amd64.s
+++ b/src/internal/bytealg/equal_amd64.s
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
+#include "asm_amd64.h"
#include "textflag.h"
// memequal(a, b unsafe.Pointer, size uintptr) bool
@@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0
JB small
CMPQ BX, $64
JB bigloop
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JE hugeloop_avx2
@@ -76,6 +78,7 @@ hugeloop:
JEQ hugeloop
XORQ AX, AX // return 0
RET
+#endif
// 64 bytes at a time using ymm registers
hugeloop_avx2:
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
index 6193b57239..04314917b8 100644
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@@ -233,8 +233,10 @@ success_avx2:
VZEROUPPER
JMP success
sse42:
+#ifndef hasSSE42
CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
JNE no_sse42
+#endif
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
index f78093c539..1ca70e39e2 100644
--- a/src/internal/bytealg/indexbyte_amd64.s
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -115,8 +115,10 @@ endofpage:
RET
avx2:
+#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
+#endif
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
diff --git a/src/runtime/asm_amd64.h b/src/runtime/asm_amd64.h
index 49e0ee2323..f7a8896db6 100644
--- a/src/runtime/asm_amd64.h
+++ b/src/runtime/asm_amd64.h
@@ -5,10 +5,21 @@
// Define features that are guaranteed to be supported by setting the AMD64 variable.
// If a feature is supported, there's no need to check it at runtime every time.
+#ifdef GOAMD64_v2
+#define hasPOPCNT
+#define hasSSE42
+#endif
+
#ifdef GOAMD64_v3
+#define hasAVX
#define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
#endif
#ifdef GOAMD64_v4
+#define hasAVX
#define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
#endif
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index 28befcbd0d..61d2d0247e 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -126,6 +126,9 @@ func header(arch string) {
fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base)
}
fmt.Fprintf(out, "#include \"go_asm.h\"\n")
+ if arch == "amd64" {
+ fmt.Fprintf(out, "#include \"asm_amd64.h\"\n")
+ }
fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
}
@@ -267,8 +270,10 @@ func genAMD64() {
// Clear the upper bits to get to a clean state. See issue #37174.
// It is safe here as Go code don't use the upper bits of Y registers.
p("#ifdef GOOS_darwin")
+ p("#ifndef hasAVX")
p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0")
p("JE 2(PC)")
+ p("#endif")
p("VZEROUPPER")
p("#endif")
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s
index 31f7c8b66f..94a84fb74c 100644
--- a/src/runtime/preempt_amd64.s
+++ b/src/runtime/preempt_amd64.s
@@ -1,6 +1,7 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
#include "go_asm.h"
+#include "asm_amd64.h"
#include "textflag.h"
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
@@ -27,8 +28,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVQ R14, 96(SP)
MOVQ R15, 104(SP)
#ifdef GOOS_darwin
+ #ifndef hasAVX
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
JE 2(PC)
+ #endif
VZEROUPPER
#endif
MOVUPS X0, 112(SP)