From 15ead857dbc638b9d83a7686acf0dc746fc45918 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Wed, 9 Sep 2020 17:24:23 -0500 Subject: cmd/compiler,cmd/go,sync: add internal {LoadAcq,StoreRel}64 on ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an internal atomic intrinsic for load with acquire semantics (extending LoadAcq to 64b) and add LoadAcquintptr for internal use within the sync package. For other arches, this remaps to the appropriate atomic.Load{,64} intrinsic which should not alter code generation. Similarly, add StoreRel{uintptr,64} for consistency, and inline. Finally, add an exception to allow sync to directly use the runtime/internal/atomic package which avoids more convoluted workarounds (contributed by Lynn Boger). In an extreme example, sync.(*Pool).pin consumes 20% of wall time during fmt tests. This is reduced to 5% on ppc64le/power9. From the fmt benchmarks on ppc64le: name old time/op new time/op delta SprintfPadding 468ns ± 0% 451ns ± 0% -3.63% SprintfEmpty 73.3ns ± 0% 51.9ns ± 0% -29.20% SprintfString 135ns ± 0% 122ns ± 0% -9.63% SprintfTruncateString 232ns ± 0% 214ns ± 0% -7.76% SprintfTruncateBytes 216ns ± 0% 202ns ± 0% -6.48% SprintfSlowParsingPath 162ns ± 0% 142ns ± 0% -12.35% SprintfQuoteString 1.00µs ± 0% 0.99µs ± 0% -1.39% SprintfInt 117ns ± 0% 104ns ± 0% -11.11% SprintfIntInt 190ns ± 0% 175ns ± 0% -7.89% SprintfPrefixedInt 232ns ± 0% 212ns ± 0% -8.62% SprintfFloat 270ns ± 0% 255ns ± 0% -5.56% SprintfComplex 1.01µs ± 0% 0.99µs ± 0% -1.68% SprintfBoolean 127ns ± 0% 111ns ± 0% -12.60% SprintfHexString 220ns ± 0% 198ns ± 0% -10.00% SprintfHexBytes 261ns ± 0% 252ns ± 0% -3.45% SprintfBytes 600ns ± 0% 590ns ± 0% -1.67% SprintfStringer 684ns ± 0% 658ns ± 0% -3.80% SprintfStructure 2.57µs ± 0% 2.57µs ± 0% -0.12% ManyArgs 669ns ± 0% 646ns ± 0% -3.44% FprintInt 140ns ± 0% 136ns ± 0% -2.86% FprintfBytes 184ns ± 0% 181ns ± 0% -1.63% FprintIntNoAlloc 140ns ± 0% 136ns ± 0% -2.86% ScanInts 929µs ± 0% 921µs ± 0% -0.79% ScanRecursiveInt 122ms ± 0% 121ms ± 0% -0.11% ScanRecursiveIntReaderWrapper 122ms ± 0% 122ms ± 0% -0.18% Change-Id: I4d66780261b57b06ef600229e475462e7313f0d6 Reviewed-on: https://go-review.googlesource.com/c/go/+/253748 Run-TryBot: Lynn Boger Reviewed-by: Lynn Boger Reviewed-by: Keith Randall Trust: Lynn Boger TryBot-Result: Go Bot --- src/sync/pool.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'src/sync/pool.go') diff --git a/src/sync/pool.go b/src/sync/pool.go index ca7afdb12f..137413fdc4 100644 --- a/src/sync/pool.go +++ b/src/sync/pool.go @@ -7,6 +7,7 @@ package sync import ( "internal/race" "runtime" + runtimeatomic "runtime/internal/atomic" "sync/atomic" "unsafe" ) @@ -152,8 +153,8 @@ func (p *Pool) Get() interface{} { func (p *Pool) getSlow(pid int) interface{} { // See the comment in pin regarding ordering of the loads. - size := atomic.LoadUintptr(&p.localSize) // load-acquire - locals := p.local // load-consume + size := runtimeatomic.LoadAcquintptr(&p.localSize) // load-acquire + locals := p.local // load-consume // Try to steal one element from other procs. for i := 0; i < int(size); i++ { l := indexLocal(locals, (pid+i+1)%int(size)) @@ -165,7 +166,7 @@ func (p *Pool) getSlow(pid int) interface{} { // Try the victim cache. We do this after attempting to steal // from all primary caches because we want objects in the // victim cache to age out if at all possible. - size = atomic.LoadUintptr(&p.victimSize) + size = runtimeatomic.Loaduintptr(&p.victimSize) if uintptr(pid) >= size { return nil } @@ -198,8 +199,8 @@ func (p *Pool) pin() (*poolLocal, int) { // Since we've disabled preemption, GC cannot happen in between. // Thus here we must observe local at least as large localSize. // We can observe a newer/larger local, it is fine (we must observe its zero-initialized-ness). - s := atomic.LoadUintptr(&p.localSize) // load-acquire - l := p.local // load-consume + s := runtimeatomic.LoadAcquintptr(&p.localSize) // load-acquire + l := p.local // load-consume if uintptr(pid) < s { return indexLocal(l, pid), pid } @@ -225,8 +226,8 @@ func (p *Pool) pinSlow() (*poolLocal, int) { // If GOMAXPROCS changes between GCs, we re-allocate the array and lose the old one. size := runtime.GOMAXPROCS(0) local := make([]poolLocal, size) - atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release - atomic.StoreUintptr(&p.localSize, uintptr(size)) // store-release + atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release + runtimeatomic.StoreReluintptr(&p.localSize, uintptr(size)) // store-release return &local[pid], pid } -- cgit v1.3 From c305e49e96deafe54a8e43010ea76fead6da0a98 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Mon, 26 Oct 2020 14:32:13 -0400 Subject: cmd/go,cmd/compile,sync: remove special import case in cmd/go CL 253748 introduced a special case in cmd/go to allow sync to import runtime/internal/atomic. Besides introducing unnecessary complexity into cmd/go, this breaks other packages (like gopls) that understand how imports work, but don't understand this special case. Fix this by using the more standard linkname-based approach to pull the necessary functions from runtime/internal/atomic into sync. Since these are compiler intrinsics, we also have to tell the compiler that the linknamed symbols are intrinsics to get this optimization in sync. Fixes #42196. Change-Id: I1f91498c255c91583950886a89c3c9adc39a32f0 Reviewed-on: https://go-review.googlesource.com/c/go/+/265124 Trust: Austin Clements Run-TryBot: Austin Clements Reviewed-by: Bryan C. Mills Reviewed-by: Keith Randall Reviewed-by: Paul Murphy TryBot-Result: Go Bot --- src/cmd/compile/internal/gc/ssa.go | 4 ++++ src/cmd/go/internal/load/pkg.go | 5 ----- src/sync/pool.go | 25 +++++++++++++++++-------- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'src/sync/pool.go') diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index f840ef4066..a1b5a03687 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -3569,13 +3569,17 @@ func init() { alias("runtime/internal/atomic", "LoadAcq", "runtime/internal/atomic", "Load", lwatomics...) alias("runtime/internal/atomic", "LoadAcq64", "runtime/internal/atomic", "Load64", lwatomics...) alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq", p4...) + alias("sync", "runtime_LoadAcquintptr", "runtime/internal/atomic", "LoadAcq", p4...) // linknamed alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq64", p8...) + alias("sync", "runtime_LoadAcquintptr", "runtime/internal/atomic", "LoadAcq64", p8...) // linknamed alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store", p4...) alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store64", p8...) alias("runtime/internal/atomic", "StoreRel", "runtime/internal/atomic", "Store", lwatomics...) alias("runtime/internal/atomic", "StoreRel64", "runtime/internal/atomic", "Store64", lwatomics...) alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel", p4...) + alias("sync", "runtime_StoreReluintptr", "runtime/internal/atomic", "StoreRel", p4...) // linknamed alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel64", p8...) + alias("sync", "runtime_StoreReluintptr", "runtime/internal/atomic", "StoreRel64", p8...) // linknamed alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg", p4...) alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg64", p8...) alias("runtime/internal/atomic", "Xadduintptr", "runtime/internal/atomic", "Xadd", p4...) diff --git a/src/cmd/go/internal/load/pkg.go b/src/cmd/go/internal/load/pkg.go index fcd7728c7b..615b5ef769 100644 --- a/src/cmd/go/internal/load/pkg.go +++ b/src/cmd/go/internal/load/pkg.go @@ -1338,11 +1338,6 @@ func disallowInternal(srcDir string, importer *Package, importerPath string, p * return p } - // Allow sync package to access lightweight atomic functions limited to the runtime. - if p.Standard && strings.HasPrefix(importerPath, "sync") && p.ImportPath == "runtime/internal/atomic" { - return p - } - // Internal is present. // Map import path back to directory corresponding to parent of internal. if i > 0 { diff --git a/src/sync/pool.go b/src/sync/pool.go index 137413fdc4..1ae70127ac 100644 --- a/src/sync/pool.go +++ b/src/sync/pool.go @@ -7,7 +7,6 @@ package sync import ( "internal/race" "runtime" - runtimeatomic "runtime/internal/atomic" "sync/atomic" "unsafe" ) @@ -153,8 +152,8 @@ func (p *Pool) Get() interface{} { func (p *Pool) getSlow(pid int) interface{} { // See the comment in pin regarding ordering of the loads. - size := runtimeatomic.LoadAcquintptr(&p.localSize) // load-acquire - locals := p.local // load-consume + size := runtime_LoadAcquintptr(&p.localSize) // load-acquire + locals := p.local // load-consume // Try to steal one element from other procs. for i := 0; i < int(size); i++ { l := indexLocal(locals, (pid+i+1)%int(size)) @@ -166,7 +165,7 @@ func (p *Pool) getSlow(pid int) interface{} { // Try the victim cache. We do this after attempting to steal // from all primary caches because we want objects in the // victim cache to age out if at all possible. - size = runtimeatomic.Loaduintptr(&p.victimSize) + size = atomic.LoadUintptr(&p.victimSize) if uintptr(pid) >= size { return nil } @@ -199,8 +198,8 @@ func (p *Pool) pin() (*poolLocal, int) { // Since we've disabled preemption, GC cannot happen in between. // Thus here we must observe local at least as large localSize. // We can observe a newer/larger local, it is fine (we must observe its zero-initialized-ness). - s := runtimeatomic.LoadAcquintptr(&p.localSize) // load-acquire - l := p.local // load-consume + s := runtime_LoadAcquintptr(&p.localSize) // load-acquire + l := p.local // load-consume if uintptr(pid) < s { return indexLocal(l, pid), pid } @@ -226,8 +225,8 @@ func (p *Pool) pinSlow() (*poolLocal, int) { // If GOMAXPROCS changes between GCs, we re-allocate the array and lose the old one. size := runtime.GOMAXPROCS(0) local := make([]poolLocal, size) - atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release - runtimeatomic.StoreReluintptr(&p.localSize, uintptr(size)) // store-release + atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release + runtime_StoreReluintptr(&p.localSize, uintptr(size)) // store-release return &local[pid], pid } @@ -283,3 +282,13 @@ func indexLocal(l unsafe.Pointer, i int) *poolLocal { func runtime_registerPoolCleanup(cleanup func()) func runtime_procPin() int func runtime_procUnpin() + +// The below are implemented in runtime/internal/atomic and the +// compiler also knows to intrinsify the symbol we linkname into this +// package. + +//go:linkname runtime_LoadAcquintptr runtime/internal/atomic.LoadAcquintptr +func runtime_LoadAcquintptr(ptr *uintptr) uintptr + +//go:linkname runtime_StoreReluintptr runtime/internal/atomic.StoreReluintptr +func runtime_StoreReluintptr(ptr *uintptr, val uintptr) uintptr -- cgit v1.3