diff options
| author | Guoqi Chen <chenguoqi@loongson.cn> | 2024-09-13 18:47:56 +0800 |
|---|---|---|
| committer | abner chenc <chenguoqi@loongson.cn> | 2024-11-07 02:19:55 +0000 |
| commit | ac345fb7e704ede49c0c506bfd9f8d0f4b61cd7c (patch) | |
| tree | 2c39d32a0f7309d4fdcc917cd6601c04f33f12ac /src/internal/runtime/atomic | |
| parent | 9088883cf4a5181cf796c968bbce5a5bc3edc7ab (diff) | |
| download | go-ac345fb7e704ede49c0c506bfd9f8d0f4b61cd7c.tar.xz | |
cmd/compile,internal/runtime/atomic: optimize Store{64,32,8} on loong64
On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1]
are new instructions added by LA664 (Loongson 3A6000) and later microarchitectures.
Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and
the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according
to the CPU feature.
The StoreRelease barrier on Loong64 is "dbar 0x12", but the legacy Store8 path still
issues a second barrier after the store to keep Store/Load ordering consistent [2].
LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses
the R0 register, and there is no performance difference between the implementations
of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}.
goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000-HV @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20)
AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20)
AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20)
AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20)
AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20)
AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20)
AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20)
AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20)
AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20)
geomean 19.61n 15.48n -21.08%
goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20)
AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20)
AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20)
AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20)
AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20)
AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20)
AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20)
AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20)
AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20)
geomean 18.01n 12.81n -28.89%
[1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html
[2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md
Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097
Reviewed-on: https://go-review.googlesource.com/c/go/+/581356
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Diffstat (limited to 'src/internal/runtime/atomic')
| -rw-r--r-- | src/internal/runtime/atomic/atomic_loong64.go | 9 | ||||
| -rw-r--r-- | src/internal/runtime/atomic/atomic_loong64.s | 19 | ||||
| -rw-r--r-- | src/internal/runtime/atomic/bench_test.go | 8 |
3 files changed, 27 insertions, 9 deletions
diff --git a/src/internal/runtime/atomic/atomic_loong64.go b/src/internal/runtime/atomic/atomic_loong64.go index de6d4b4ba6..a362628323 100644 --- a/src/internal/runtime/atomic/atomic_loong64.go +++ b/src/internal/runtime/atomic/atomic_loong64.go @@ -6,7 +6,14 @@ package atomic -import "unsafe" +import ( + "internal/cpu" + "unsafe" +) + +const ( + offsetLoong64HasLAM_BH = unsafe.Offsetof(cpu.Loong64.HasLAM_BH) +) //go:noescape func Xadd(ptr *uint32, delta int32) uint32 diff --git a/src/internal/runtime/atomic/atomic_loong64.s b/src/internal/runtime/atomic/atomic_loong64.s index 9bed8654c8..1fe4e99dec 100644 --- a/src/internal/runtime/atomic/atomic_loong64.s +++ b/src/internal/runtime/atomic/atomic_loong64.s @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include "go_asm.h" #include "textflag.h" // bool cas(uint32 *ptr, uint32 old, uint32 new) @@ -165,25 +166,27 @@ TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16 TEXT ·Store(SB), NOSPLIT, $0-12 MOVV ptr+0(FP), R4 MOVW val+8(FP), R5 - DBAR - MOVW R5, 0(R4) - DBAR + AMSWAPDBW R5, (R4), R0 RET TEXT ·Store8(SB), NOSPLIT, $0-9 MOVV ptr+0(FP), R4 MOVB val+8(FP), R5 - DBAR + MOVBU internal∕cpu·Loong64+const_offsetLoong64HasLAM_BH(SB), R6 + BEQ R6, _legacy_store8_ + AMSWAPDBB R5, (R4), R0 + RET +_legacy_store8_: + // StoreRelease barrier + DBAR $0x12 MOVB R5, 0(R4) - DBAR + DBAR $0x18 RET TEXT ·Store64(SB), NOSPLIT, $0-16 MOVV ptr+0(FP), R4 MOVV val+8(FP), R5 - DBAR - MOVV R5, 0(R4) - DBAR + AMSWAPDBV R5, (R4), R0 RET // void Or8(byte volatile*, byte); diff --git a/src/internal/runtime/atomic/bench_test.go b/src/internal/runtime/atomic/bench_test.go index 6e3f14cbe4..b5837c9759 100644 --- a/src/internal/runtime/atomic/bench_test.go +++ b/src/internal/runtime/atomic/bench_test.go @@ -51,6 +51,14 @@ func BenchmarkAtomicLoad8(b *testing.B) { } } +func BenchmarkAtomicStore8(b *testing.B) { + var x uint8 + sink = &x + for i := 0; i < b.N; i++ { + 
atomic.Store8(&x, 0) + } +} + func BenchmarkAnd8(b *testing.B) { var x [512]uint8 // give byte its own cache line sink = &x |
