diff options
| author | Guoqi Chen <chenguoqi@loongson.cn> | 2024-09-23 11:38:36 +0800 |
|---|---|---|
| committer | abner chenc <chenguoqi@loongson.cn> | 2024-11-11 00:08:08 +0000 |
| commit | 4b0da6b13feefa14e58b7524435afa5c14e7a554 (patch) | |
| tree | af341f1248632720eaf7c2040f00ca76ae3c501f /src/internal/runtime/atomic | |
| parent | 72a92ab5b72680e6e0f8acffcfd62b2c6fd98085 (diff) | |
| download | go-4b0da6b13feefa14e58b7524435afa5c14e7a554.tar.xz | |
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64
Use loong64's atomic operation instructions AMANDDB{V,W,W} (full barrier) to implement
And{64,32,8}, and AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}.
Intrinsify And{64,32,8} and Or{64,32,8}, and alias all of the And/Or operations
into the sync/atomic package.
goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000-HV @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20)
And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20)
And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20)
And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20)
Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20)
Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20)
Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20)
Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20)
And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20)
And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20)
And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20)
AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20)
Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20)
Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20)
Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20)
OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20)
geomean 28.47n 12.58n -55.80%
goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20)
And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20)
And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20)
And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20)
And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20)
And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20)
And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20)
AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20)
Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20)
Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20)
Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20)
Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20)
Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20)
Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20)
Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20)
OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20)
geomean 30.47n 14.75n -51.61%
Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3
Reviewed-on: https://go-review.googlesource.com/c/go/+/482756
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Diffstat (limited to 'src/internal/runtime/atomic')
| -rw-r--r-- | src/internal/runtime/atomic/atomic_loong64.s | 82 |
1 files changed, 19 insertions, 63 deletions
diff --git a/src/internal/runtime/atomic/atomic_loong64.s b/src/internal/runtime/atomic/atomic_loong64.s
index 6ea162d9da..60741a23c2 100644
--- a/src/internal/runtime/atomic/atomic_loong64.s
+++ b/src/internal/runtime/atomic/atomic_loong64.s
@@ -185,122 +185,78 @@ TEXT ·Store64(SB), NOSPLIT, $0-16
 TEXT ·Or8(SB), NOSPLIT, $0-9
 	MOVV	ptr+0(FP), R4
 	MOVBU	val+8(FP), R5
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	// R6 = ptr & (~3)
 	MOVV	$~3, R6
 	AND	R4, R6
 	// R7 = ((ptr & 3) * 8)
 	AND	$3, R4, R7
 	SLLV	$3, R7
-	// Shift val for aligned ptr. R5 = val << R4
+	// R5 = val << R7
 	SLLV	R7, R5
-
-	DBAR
-	LL	(R6), R7
-	OR	R5, R7
-	SC	R7, (R6)
-	BEQ	R7, -4(PC)
-	DBAR
+	AMORDBW	R5, (R6), R0
 	RET

 // void And8(byte volatile*, byte);
 TEXT ·And8(SB), NOSPLIT, $0-9
 	MOVV	ptr+0(FP), R4
 	MOVBU	val+8(FP), R5
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	// R6 = ptr & (~3)
 	MOVV	$~3, R6
 	AND	R4, R6
 	// R7 = ((ptr & 3) * 8)
 	AND	$3, R4, R7
 	SLLV	$3, R7
-	// Shift val for aligned ptr. R5 = val << R7 | ^(0xFF << R7)
-	MOVV	$0xFF, R8
-	SLLV	R7, R5
-	SLLV	R7, R8
-	NOR	R0, R8
-	OR	R8, R5
-
-	DBAR
-	LL	(R6), R7
-	AND	R5, R7
-	SC	R7, (R6)
-	BEQ	R7, -4(PC)
-	DBAR
+	// R5 = ((val ^ 0xFF) << R7) ^ (-1)
+	XOR	$255, R5
+	SLLV	R7, R5
+	XOR	$-1, R5
+	AMANDDBW	R5, (R6), R0
 	RET

 // func Or(addr *uint32, v uint32)
 TEXT ·Or(SB), NOSPLIT, $0-12
 	MOVV	ptr+0(FP), R4
 	MOVW	val+8(FP), R5
-	DBAR
-	LL	(R4), R6
-	OR	R5, R6
-	SC	R6, (R4)
-	BEQ	R6, -4(PC)
-	DBAR
+	AMORDBW	R5, (R4), R0
 	RET

 // func And(addr *uint32, v uint32)
 TEXT ·And(SB), NOSPLIT, $0-12
 	MOVV	ptr+0(FP), R4
 	MOVW	val+8(FP), R5
-	DBAR
-	LL	(R4), R6
-	AND	R5, R6
-	SC	R6, (R4)
-	BEQ	R6, -4(PC)
-	DBAR
+	AMANDDBW	R5, (R4), R0
 	RET

 // func Or32(addr *uint32, v uint32) old uint32
 TEXT ·Or32(SB), NOSPLIT, $0-20
 	MOVV	ptr+0(FP), R4
 	MOVW	val+8(FP), R5
-	DBAR
-	LL	(R4), R6
-	OR	R5, R6, R7
-	SC	R7, (R4)
-	BEQ	R7, -4(PC)
-	DBAR
-	MOVW	R6, ret+16(FP)
+	AMORDBW	R5, (R4), R6
+	MOVW	R6, ret+16(FP)
 	RET

 // func And32(addr *uint32, v uint32) old uint32
 TEXT ·And32(SB), NOSPLIT, $0-20
 	MOVV	ptr+0(FP), R4
 	MOVW	val+8(FP), R5
-	DBAR
-	LL	(R4), R6
-	AND	R5, R6, R7
-	SC	R7, (R4)
-	BEQ	R7, -4(PC)
-	DBAR
-	MOVW	R6, ret+16(FP)
+	AMANDDBW	R5, (R4), R6
+	MOVW	R6, ret+16(FP)
 	RET

 // func Or64(addr *uint64, v uint64) old uint64
 TEXT ·Or64(SB), NOSPLIT, $0-24
 	MOVV	ptr+0(FP), R4
 	MOVV	val+8(FP), R5
-	DBAR
-	LLV	(R4), R6
-	OR	R5, R6, R7
-	SCV	R7, (R4)
-	BEQ	R7, -4(PC)
-	DBAR
-	MOVV	R6, ret+16(FP)
+	AMORDBV	R5, (R4), R6
+	MOVV	R6, ret+16(FP)
 	RET

 // func And64(addr *uint64, v uint64) old uint64
 TEXT ·And64(SB), NOSPLIT, $0-24
 	MOVV	ptr+0(FP), R4
 	MOVV	val+8(FP), R5
-	DBAR
-	LLV	(R4), R6
-	AND	R5, R6, R7
-	SCV	R7, (R4)
-	BEQ	R7, -4(PC)
-	DBAR
-	MOVV	R6, ret+16(FP)
+	AMANDDBV	R5, (R4), R6
+	MOVV	R6, ret+16(FP)
 	RET

 // func Anduintptr(addr *uintptr, v uintptr) old uintptr
