aboutsummaryrefslogtreecommitdiff
path: root/src/internal/runtime
diff options
context:
space:
mode:
authorGuoqi Chen <chenguoqi@loongson.cn>2024-09-20 11:06:18 +0800
committerabner chenc <chenguoqi@loongson.cn>2024-11-19 01:15:07 +0000
commit5432cd96fd951bce01bbce9f9744b62871f79b17 (patch)
treec39427705c0f47fd68575225c3509d4e9c9b824b /src/internal/runtime
parentec7824b6bb12481e7ffe50b7f1cbaa1faf465a44 (diff)
downloadgo-5432cd96fd951bce01bbce9f9744b62871f79b17.tar.xz
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64
In Loongson's new microarchitecture LA664 (Loongson-3A6000) and later, the atomic
compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore,
the implementation of the atomic operation compare-and-swap can be selected
according to the CPUCFG flag LAMCAS: the AMCASDB (full barrier) instruction is
used on new microarchitectures, and traditional LL-SC is used on LA464
(Loongson-3A5000) and older microarchitectures. This can significantly improve
the performance of Go programs on new microarchitectures.

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000 @ 2500.00MHz
        | bench.old   | bench.new                          |
        | sec/op      | sec/op       vs base               |
Cas       46.84n ±  0%   22.82n ±  0%  -51.28% (p=0.000 n=20)
Cas-2     47.58n ±  0%   29.57n ±  0%  -37.85% (p=0.000 n=20)
Cas-4     43.27n ± 20%   25.31n ± 13%  -41.50% (p=0.000 n=20)
Cas64     46.85n ±  0%   22.82n ±  0%  -51.29% (p=0.000 n=20)
Cas64-2   47.43n ±  0%   29.53n ±  0%  -37.74% (p=0.002 n=20)
Cas64-4   43.18n ±  0%   25.28n ±  2%  -41.46% (p=0.000 n=20)
geomean   45.82n         25.74n        -43.82%

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000 @ 2500.00MHz
        | bench.old   | bench.new                          |
        | sec/op      | sec/op       vs base               |
Cas       50.05n ±  0%   51.26n ±  0%  +2.42% (p=0.000 n=20)
Cas-2     52.80n ±  0%   53.11n ±  0%  +0.59% (p=0.000 n=20)
Cas-4     55.97n ±  0%   57.31n ±  0%  +2.39% (p=0.000 n=20)
Cas64     50.05n ±  0%   51.26n ±  0%  +2.42% (p=0.000 n=20)
Cas64-2   52.68n ±  0%   53.11n ±  0%  +0.82% (p=0.000 n=20)
Cas64-4   55.96n ±  0%   57.26n ±  0%  +2.33% (p=0.000 n=20)
geomean   52.86n         53.83n        +1.82%

[1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html

Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322
Reviewed-on: https://go-review.googlesource.com/c/go/+/613396
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/internal/runtime')
-rw-r--r--src/internal/runtime/atomic/atomic_loong64.go1
-rw-r--r--src/internal/runtime/atomic/atomic_loong64.s50
2 files changed, 43 insertions, 8 deletions
diff --git a/src/internal/runtime/atomic/atomic_loong64.go b/src/internal/runtime/atomic/atomic_loong64.go
index a362628323..6586ad2f6c 100644
--- a/src/internal/runtime/atomic/atomic_loong64.go
+++ b/src/internal/runtime/atomic/atomic_loong64.go
@@ -12,6 +12,7 @@ import (
)
const (
+ offsetLOONG64HasLAMCAS = unsafe.Offsetof(cpu.Loong64.HasLAMCAS) // referenced from atomic_loong64.s as const_offsetLOONG64HasLAMCAS
offsetLoong64HasLAM_BH = unsafe.Offsetof(cpu.Loong64.HasLAM_BH)
)
diff --git a/src/internal/runtime/atomic/atomic_loong64.s b/src/internal/runtime/atomic/atomic_loong64.s
index 60741a23c2..d67300afc4 100644
--- a/src/internal/runtime/atomic/atomic_loong64.s
+++ b/src/internal/runtime/atomic/atomic_loong64.s
@@ -16,18 +16,32 @@ TEXT ·Cas(SB), NOSPLIT, $0-17
MOVV ptr+0(FP), R4
MOVW old+8(FP), R5
MOVW new+12(FP), R6
- DBAR
+
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLAMCAS(SB), R8 // R8 = cpu.Loong64.HasLAMCAS
+ BEQ R8, cas_llsc // flag clear: take the LL/SC path, entering at its leading barrier
+ MOVV R5, R7 // backup old value
+ AMCASDBW R6, (R4), R5 // R5 is overwritten by the instruction, hence the backup in R7
+ BNE R7, R5, cas_fail0
+ MOVV $1, R4
+ MOVB R4, ret+16(FP)
+ RET
+cas_fail0:
+ MOVB R0, ret+16(FP)
+ RET
+
+cas_llsc: // Implemented using the ll-sc instruction pair
+ DBAR $0x14 // LoadAcquire barrier
cas_again:
MOVV R6, R7
LL (R4), R8
- BNE R5, R8, cas_fail
+ BNE R5, R8, cas_fail1
SC R7, (R4)
BEQ R7, cas_again
MOVV $1, R4
MOVB R4, ret+16(FP)
- DBAR
+ DBAR $0x12 // StoreRelease barrier
RET
-cas_fail:
+cas_fail1:
MOVV $0, R4
JMP -4(PC)
@@ -43,21 +57,41 @@ TEXT ·Cas64(SB), NOSPLIT, $0-25
MOVV ptr+0(FP), R4
MOVV old+8(FP), R5
MOVV new+16(FP), R6
- DBAR
+
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLAMCAS(SB), R8 // R8 = cpu.Loong64.HasLAMCAS
+ BEQ R8, cas64_llsc // flag clear: take the LL/SC path, entering at its leading barrier
+ MOVV R5, R7 // backup old value
+ AMCASDBV R6, (R4), R5 // R5 is overwritten by the instruction, hence the backup in R7
+ BNE R7, R5, cas64_fail0
+ MOVV $1, R4
+ MOVB R4, ret+24(FP)
+ RET
+cas64_fail0:
+ MOVB R0, ret+24(FP)
+ RET
+
+cas64_llsc: // Implemented using the ll-sc instruction pair
+ DBAR $0x14 // LoadAcquire barrier
cas64_again:
MOVV R6, R7
LLV (R4), R8
- BNE R5, R8, cas64_fail
+ BNE R5, R8, cas64_fail1
SCV R7, (R4)
BEQ R7, cas64_again
MOVV $1, R4
MOVB R4, ret+24(FP)
- DBAR
+ DBAR $0x12 // StoreRelease barrier
RET
-cas64_fail:
+cas64_fail1:
MOVV $0, R4
JMP -4(PC)
+TEXT ·Casint32(SB),NOSPLIT,$0-17
+ JMP ·Cas(SB) // tail jump: declared with the same $0-17 frame as Cas
+
+TEXT ·Casint64(SB),NOSPLIT,$0-25
+ JMP ·Cas64(SB) // tail jump: declared with the same $0-25 frame as Cas64
+
TEXT ·Casuintptr(SB), NOSPLIT, $0-25
JMP ·Cas64(SB)