aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
authornimelehin <nimelehin@gmail.com>2022-05-09 23:22:14 +0300
committerKeith Randall <khr@golang.org>2022-05-10 20:52:34 +0000
commite0f99775f24cd19ad96847b0a5b00952aac9d548 (patch)
treef34556190f4b2d3878e94f5573def51e6e86e714 /src/runtime
parentaeaf4b0e5b21ea0819b4e862ed86ce44760516bf (diff)
downloadgo-e0f99775f24cd19ad96847b0a5b00952aac9d548.tar.xz
runtime: store pointer-size words in memclr
GC requires the whole zeroed word to be visible for a memory subsystem. While the implementation of Enhanced REP STOSB tries to use as efficient stores as possible, e.g writing the whole cache line and not byte-after-byte, we should use REP STOSQ to guarantee the requirements of the GC. The performance is not affected. Change-Id: I1b0fd1444a40bfbb661541291ab96eba11bcc762 Reviewed-on: https://go-review.googlesource.com/c/go/+/405274 Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org>
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/memclr_amd64.s10
1 files changed, 7 insertions, 3 deletions
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index b8f283b8fd..19bfa6f20d 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -18,7 +18,7 @@ TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
MOVQ AX, DI // DI = ptr
XORQ AX, AX
- // MOVOU seems always faster than REP STOSQ.
+ // MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
TESTQ BX, BX
@@ -119,9 +119,13 @@ loop_preheader_erms:
JAE loop_preheader_avx2_huge
loop_erms:
+ // STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
+ // for a memory subsystem as the GC requires this.
MOVQ BX, CX
- REP; STOSB
- RET
+ SHRQ $3, CX
+ ANDQ $7, BX
+ REP; STOSQ
+ JMP tail
loop_preheader_avx2_huge:
// Align to 32 byte boundary