From 5432cd96fd951bce01bbce9f9744b62871f79b17 Mon Sep 17 00:00:00 2001 From: Guoqi Chen Date: Fri, 20 Sep 2024 11:06:18 +0800 Subject: cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Loongson's new microarchitecture LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: the AMCASDB (full barrier) instruction is used on new microarchitectures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microarchitectures. This can significantly improve the performance of Go programs on new microarchitectures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: 
Meidan Li Reviewed-by: David Chase Reviewed-by: Dmitri Shuralyov Reviewed-by: Qiqi Huang LUCI-TryBot-Result: Go LUCI --- src/runtime/cpuflags.go | 1 + src/runtime/proc.go | 1 + 2 files changed, 2 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go index 3f88d20fb3..e81e50f5df 100644 --- a/src/runtime/cpuflags.go +++ b/src/runtime/cpuflags.go @@ -34,6 +34,7 @@ var ( arm64HasATOMICS bool + loong64HasLAMCAS bool loong64HasLAM_BH bool loong64HasLSX bool ) diff --git a/src/runtime/proc.go b/src/runtime/proc.go index cbfac3a923..3f360ef129 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -752,6 +752,7 @@ func cpuinit(env string) { arm64HasATOMICS = cpu.ARM64.HasATOMICS case "loong64": + loong64HasLAMCAS = cpu.Loong64.HasLAMCAS loong64HasLAM_BH = cpu.Loong64.HasLAM_BH loong64HasLSX = cpu.Loong64.HasLSX } -- cgit v1.3