aboutsummaryrefslogtreecommitdiff
path: root/src/internal/runtime/atomic
diff options
context:
space:
mode:
authorGuoqi Chen <chenguoqi@loongson.cn>2024-09-23 11:38:36 +0800
committerabner chenc <chenguoqi@loongson.cn>2024-11-11 00:08:08 +0000
commit4b0da6b13feefa14e58b7524435afa5c14e7a554 (patch)
treeaf341f1248632720eaf7c2040f00ca76ae3c501f /src/internal/runtime/atomic
parent72a92ab5b72680e6e0f8acffcfd62b2c6fd98085 (diff)
downloadgo-4b0da6b13feefa14e58b7524435afa5c14e7a554.tar.xz
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64
Use loong64's atomic operation instructions AMANDDB{V,W,W} (full barrier) to implement And{64,32,8} and AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, and alias all of the And/Or operations into the sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 
30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
Diffstat (limited to 'src/internal/runtime/atomic')
-rw-r--r--src/internal/runtime/atomic/atomic_loong64.s82
1 files changed, 19 insertions, 63 deletions
diff --git a/src/internal/runtime/atomic/atomic_loong64.s b/src/internal/runtime/atomic/atomic_loong64.s
index 6ea162d9da..60741a23c2 100644
--- a/src/internal/runtime/atomic/atomic_loong64.s
+++ b/src/internal/runtime/atomic/atomic_loong64.s
@@ -185,122 +185,78 @@ TEXT ·Store64(SB), NOSPLIT, $0-16
// Or8 ORs the byte val into the byte at ptr. The frame ($0-9) holds only the
// two arguments; nothing is written back to it.
// The update is done on the enclosing 4-byte-aligned word: val is loaded
// zero-extended (MOVBU) and shifted to the byte's bit offset, so the other
// three bytes of the word are ORed with zero and keep their value.
TEXT ·Or8(SB), NOSPLIT, $0-9
MOVV ptr+0(FP), R4
MOVBU val+8(FP), R5
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
+ // R6 = ptr & (~3)
MOVV $~3, R6
AND R4, R6
// R7 = ((ptr & 3) * 8)
AND $3, R4, R7
SLLV $3, R7
- // Shift val for aligned ptr. R5 = val << R4
+ // R5 = val << R7
SLLV R7, R5
-
- DBAR
- LL (R6), R7
- OR R5, R7
- SC R7, (R6)
- BEQ R7, -4(PC)
- DBAR
// The removed DBAR / LL / OR / SC / BEQ / DBAR sequence is replaced by the
// single AMORDBW below, which the commit message describes as a full-barrier
// atomic instruction. Its last operand is R0 and no result is stored.
+ AMORDBW R5, (R6), R0
RET
// void And8(byte volatile*, byte);
// And8 ANDs the byte val into the byte at ptr. The frame ($0-9) holds only the
// two arguments; nothing is written back to it.
// The update is done on the enclosing 4-byte-aligned word with a mask that
// carries val at the byte's bit offset and 1 bits everywhere else, so the
// AND leaves the neighbouring bytes unchanged.
TEXT ·And8(SB), NOSPLIT, $0-9
MOVV ptr+0(FP), R4
MOVBU val+8(FP), R5
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
+ // R6 = ptr & (~3)
MOVV $~3, R6
AND R4, R6
// R7 = ((ptr & 3) * 8)
AND $3, R4, R7
SLLV $3, R7
- // Shift val for aligned ptr. R5 = val << R7 | ^(0xFF << R7)
- MOVV $0xFF, R8
- SLLV R7, R5
- SLLV R7, R8
- NOR R0, R8
- OR R8, R5
-
- DBAR
- LL (R6), R7
- AND R5, R7
- SC R7, (R6)
- BEQ R7, -4(PC)
- DBAR
// The added sequence builds the same mask value as the removed one, but
// without the extra register R8: inverting val within its byte, shifting,
// then inverting the whole register yields val in place and ones elsewhere.
+ // R5 = ((val ^ 0xFF) << R7) ^ (-1)
+ XOR $255, R5
+ SLLV R7, R5
+ XOR $-1, R5
// Single AMANDDBW (described in the commit message as a full-barrier atomic
// instruction) replaces the removed LL/SC loop. Its last operand is R0 and
// no result is stored.
+ AMANDDBW R5, (R6), R0
RET
// func Or(addr *uint32, v uint32)
// Or ORs the 32-bit val into the word at ptr (called addr and v in the
// signature above). The frame ($0-12) holds only the two arguments.
TEXT ·Or(SB), NOSPLIT, $0-12
MOVV ptr+0(FP), R4
MOVW val+8(FP), R5
- DBAR
- LL (R4), R6
- OR R5, R6
- SC R6, (R4)
- BEQ R6, -4(PC)
- DBAR
// The removed LL/SC loop and its two DBARs are replaced by one AMORDBW,
// which the commit message describes as a full-barrier atomic instruction.
// Its last operand is R0 and no result is stored.
+ AMORDBW R5, (R4), R0
RET
// func And(addr *uint32, v uint32)
// And ANDs the 32-bit val into the word at ptr (called addr and v in the
// signature above). The frame ($0-12) holds only the two arguments.
TEXT ·And(SB), NOSPLIT, $0-12
MOVV ptr+0(FP), R4
MOVW val+8(FP), R5
- DBAR
- LL (R4), R6
- AND R5, R6
- SC R6, (R4)
- BEQ R6, -4(PC)
- DBAR
// The removed LL/SC loop and its two DBARs are replaced by one AMANDDBW,
// which the commit message describes as a full-barrier atomic instruction.
// Its last operand is R0 and no result is stored.
+ AMANDDBW R5, (R4), R0
RET
// func Or32(addr *uint32, v uint32) old uint32
// Or32 ORs the 32-bit val into the word at ptr and stores a 32-bit result at
// ret+16(FP); the signature above names that result "old".
TEXT ·Or32(SB), NOSPLIT, $0-20
MOVV ptr+0(FP), R4
MOVW val+8(FP), R5
- DBAR
- LL (R4), R6
- OR R5, R6, R7
- SC R7, (R4)
- BEQ R7, -4(PC)
- DBAR
- MOVW R6, ret+16(FP)
// In the removed loop R6 held the word as loaded by LL, before the OR, and
// R7 held the value to store. In the added code R6 is the last operand of
// AMORDBW and is stored to ret in the same way.
+ AMORDBW R5, (R4), R6
+ MOVW R6, ret+16(FP)
RET
// func And32(addr *uint32, v uint32) old uint32
// And32 ANDs the 32-bit val into the word at ptr and stores a 32-bit result
// at ret+16(FP); the signature above names that result "old".
TEXT ·And32(SB), NOSPLIT, $0-20
MOVV ptr+0(FP), R4
MOVW val+8(FP), R5
- DBAR
- LL (R4), R6
- AND R5, R6, R7
- SC R7, (R4)
- BEQ R7, -4(PC)
- DBAR
- MOVW R6, ret+16(FP)
// In the removed loop R6 held the word as loaded by LL, before the AND, and
// R7 held the value to store. In the added code R6 is the last operand of
// AMANDDBW and is stored to ret in the same way.
+ AMANDDBW R5, (R4), R6
+ MOVW R6, ret+16(FP)
RET
// func Or64(addr *uint64, v uint64) old uint64
// Or64 ORs the 64-bit val into the doubleword at ptr and stores a 64-bit
// result at ret+16(FP); the signature above names that result "old".
TEXT ·Or64(SB), NOSPLIT, $0-24
MOVV ptr+0(FP), R4
MOVV val+8(FP), R5
- DBAR
- LLV (R4), R6
- OR R5, R6, R7
- SCV R7, (R4)
- BEQ R7, -4(PC)
- DBAR
- MOVV R6, ret+16(FP)
// In the removed loop R6 held the doubleword as loaded by LLV, before the OR,
// and R7 held the value to store. In the added code R6 is the last operand
// of AMORDBV (the V-suffixed, 64-bit form) and is stored to ret the same way.
+ AMORDBV R5, (R4), R6
+ MOVV R6, ret+16(FP)
RET
// func And64(addr *uint64, v uint64) old uint64
// And64 ANDs the 64-bit val into the doubleword at ptr and stores a 64-bit
// result at ret+16(FP); the signature above names that result "old".
TEXT ·And64(SB), NOSPLIT, $0-24
MOVV ptr+0(FP), R4
MOVV val+8(FP), R5
- DBAR
- LLV (R4), R6
- AND R5, R6, R7
- SCV R7, (R4)
- BEQ R7, -4(PC)
- DBAR
- MOVV R6, ret+16(FP)
// In the removed loop R6 held the doubleword as loaded by LLV, before the
// AND, and R7 held the value to store. In the added code R6 is the last
// operand of AMANDDBV (the V-suffixed, 64-bit form) and is stored to ret the
// same way.
+ AMANDDBV R5, (R4), R6
+ MOVV R6, ret+16(FP)
RET
// func Anduintptr(addr *uintptr, v uintptr) old uintptr