aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
author Ruslan Andreev <kels9009@gmail.com> 2021-06-15 14:04:30 +0000
committer Austin Clements <austin@google.com> 2021-09-07 20:27:30 +0000
commit 23f4f0db682fad0c8d61a5b5cdbdbad4cf1cd41f (patch)
tree 2c3d076097f9ff7e1289bb3e0623446694e7383c /src/runtime
parent d92101f452e10680ad4c8af2d5ad40d940b59214 (diff)
download go-23f4f0db682fad0c8d61a5b5cdbdbad4cf1cd41f.tar.xz
cmd/compile: add prefetch intrinsic support
This CL provides new intrinsics to emit prefetch instructions for AMD64 and ARM64 platforms:
Prefetch - prefetches data from memory address to cache;
PrefetchStreamed - prefetches data from memory address, with a hint that this data is being streamed.

This patch also provides prefetch calls pointed out by RSC inside scanobject and greyobject of GC mark logic.

Performance results provided by Michael:
https://perf.golang.org/search?q=upload:20210901.9

Benchmark parameters:
tree2 -heapsize=1000000000 -cpus=8
tree -n=18
parser
peano

Benchmarks AMD64 (Xeon - Cascade Lake):
name        old time/op  new time/op  delta
Tree2-8     36.1ms ± 6%  33.4ms ± 5%   -7.65%  (p=0.000 n=9+9)
Tree-8      326ms ± 1%   324ms ± 1%    -0.44%  (p=0.006 n=9+10)
Parser-8    2.75s ± 1%   2.71s ± 1%    -1.47%  (p=0.008 n=5+5)
Peano-8     63.1ms ± 1%  63.0ms ± 1%      ~    (p=0.730 n=9+9)
[Geo mean]  213ms        207ms         -2.45%

Benchmarks ARM64 (Kunpeng 920):
name        old time/op  new time/op  delta
Tree2-8     50.3ms ± 8%  44.1ms ± 5%  -12.24%  (p=0.000 n=10+9)
Tree-8      494ms ± 1%   493ms ± 1%      ~     (p=0.684 n=10+10)
Parser-8    3.99s ± 1%   3.93s ± 1%    -1.37%  (p=0.016 n=5+5)
Peano-8     84.4ms ± 0%  84.1ms ± 1%      ~    (p=0.068 n=8+10)
[Geo mean]  302ms        291ms         -3.67%

Change-Id: I43e10bc2f9512dc49d7631dd8843a79036fa43d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/328289
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Diffstat (limited to 'src/runtime')
-rw-r--r-- src/runtime/internal/sys/intrinsics_common.go 15
-rw-r--r-- src/runtime/mgcmark.go 25
2 files changed, 29 insertions, 11 deletions
diff --git a/src/runtime/internal/sys/intrinsics_common.go b/src/runtime/internal/sys/intrinsics_common.go
index 818d75ecc5..48d9759ca9 100644
--- a/src/runtime/internal/sys/intrinsics_common.go
+++ b/src/runtime/internal/sys/intrinsics_common.go
@@ -141,3 +141,18 @@ func TrailingZeros8(x uint8) int {
func Len8(x uint8) int {
return int(len8tab[x])
}
+
+// Prefetch prefetches data from memory addr to cache. The Go body is empty: on the platforms below the compiler treats the call as an intrinsic and emits the listed instruction instead.
+//
+// AMD64: Produces a PREFETCHT0 instruction.
+//
+// ARM64: Produces a PRFM instruction with the PLDL1KEEP option.
+func Prefetch(addr uintptr) {}
+
+// PrefetchStreamed prefetches data from memory addr, with a hint that this data is being streamed.
+// That is, it is likely to be accessed very soon, but only once. If possible, this will avoid polluting the cache. The Go body is empty: on the platforms below the compiler treats the call as an intrinsic and emits the listed instruction instead.
+//
+// AMD64: Produces a PREFETCHNTA instruction.
+//
+// ARM64: Produces a PRFM instruction with the PLDL1STRM option.
+func PrefetchStreamed(addr uintptr) {}
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 874d910720..64f1c79c36 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -9,6 +9,7 @@ package runtime
import (
"internal/goarch"
"runtime/internal/atomic"
+ "runtime/internal/sys"
"unsafe"
)
@@ -1104,11 +1105,6 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
gcw.balance()
}
- // This might be a good place to add prefetch code...
- // if(wbuf.nobj > 4) {
- // PREFETCH(wbuf->obj[wbuf.nobj - 3];
- // }
- //
b := gcw.tryGetFast()
if b == 0 {
b = gcw.tryGet()
@@ -1135,6 +1131,7 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
// No heap or root jobs.
break
}
+
scanobject(b, gcw)
// Flush background scan work credit.
@@ -1199,6 +1196,12 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork, stk *stackScanState)
//
//go:nowritebarrier
func scanobject(b uintptr, gcw *gcWork) {
+ // Prefetch object before we scan it.
+ //
+ // This will overlap fetching the beginning of the object with initial
+ // setup before we start scanning the object.
+ sys.Prefetch(b)
+
// Find the bits for b and the size of the object at b.
//
// b is either the beginning of an object, in which case this
@@ -1437,12 +1440,12 @@ func greyobject(obj, base, off uintptr, span *mspan, gcw *gcWork, objIndex uintp
}
}
- // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
- // seems like a nice optimization that can be added back in.
- // There needs to be time between the PREFETCH and the use.
- // Previously we put the obj in an 8 element buffer that is drained at a rate
- // to give the PREFETCH time to do its work.
- // Use of PREFETCHNTA might be more appropriate than PREFETCH
+ // We're adding obj to P's local workbuf, so it's likely
+ // this object will be processed soon by the same P.
+ // Even if the workbuf gets flushed, there will likely still be
+ // some benefit on platforms with inclusive shared caches.
+ sys.Prefetch(obj)
+ // Queue the obj for scanning.
if !gcw.putFast(obj) {
gcw.put(obj)
}