From 15ead857dbc638b9d83a7686acf0dc746fc45918 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Wed, 9 Sep 2020 17:24:23 -0500 Subject: cmd/compiler,cmd/go,sync: add internal {LoadAcq,StoreRel}64 on ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an internal atomic intrinsic for load with acquire semantics (extending LoadAcq to 64b) and add LoadAcquintptr for internal use within the sync package. For other arches, this remaps to the appropriate atomic.Load{,64} intrinsic which should not alter code generation. Similarly, add StoreRel{uintptr,64} for consistency, and inline. Finally, add an exception to allow sync to directly use the runtime/internal/atomic package which avoids more convoluted workarounds (contributed by Lynn Boger). In an extreme example, sync.(*Pool).pin consumes 20% of wall time during fmt tests. This is reduced to 5% on ppc64le/power9. From the fmt benchmarks on ppc64le: name old time/op new time/op delta SprintfPadding 468ns ± 0% 451ns ± 0% -3.63% SprintfEmpty 73.3ns ± 0% 51.9ns ± 0% -29.20% SprintfString 135ns ± 0% 122ns ± 0% -9.63% SprintfTruncateString 232ns ± 0% 214ns ± 0% -7.76% SprintfTruncateBytes 216ns ± 0% 202ns ± 0% -6.48% SprintfSlowParsingPath 162ns ± 0% 142ns ± 0% -12.35% SprintfQuoteString 1.00µs ± 0% 0.99µs ± 0% -1.39% SprintfInt 117ns ± 0% 104ns ± 0% -11.11% SprintfIntInt 190ns ± 0% 175ns ± 0% -7.89% SprintfPrefixedInt 232ns ± 0% 212ns ± 0% -8.62% SprintfFloat 270ns ± 0% 255ns ± 0% -5.56% SprintfComplex 1.01µs ± 0% 0.99µs ± 0% -1.68% SprintfBoolean 127ns ± 0% 111ns ± 0% -12.60% SprintfHexString 220ns ± 0% 198ns ± 0% -10.00% SprintfHexBytes 261ns ± 0% 252ns ± 0% -3.45% SprintfBytes 600ns ± 0% 590ns ± 0% -1.67% SprintfStringer 684ns ± 0% 658ns ± 0% -3.80% SprintfStructure 2.57µs ± 0% 2.57µs ± 0% -0.12% ManyArgs 669ns ± 0% 646ns ± 0% -3.44% FprintInt 140ns ± 0% 136ns ± 0% -2.86% FprintfBytes 184ns ± 0% 181ns ± 0% -1.63% FprintIntNoAlloc 140ns ± 0% 136ns ± 0% -2.86% ScanInts 929µs ± 0% 921µs ± 0% -0.79% ScanRecursiveInt 122ms ± 0% 121ms ± 0% -0.11% ScanRecursiveIntReaderWrapper 122ms ± 0% 122ms ± 0% -0.18% Change-Id: I4d66780261b57b06ef600229e475462e7313f0d6 Reviewed-on: https://go-review.googlesource.com/c/go/+/253748 Run-TryBot: Lynn Boger Reviewed-by: Lynn Boger Reviewed-by: Keith Randall Trust: Lynn Boger TryBot-Result: Go Bot --- src/runtime/internal/atomic/asm_amd64.s | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'src/runtime/internal/atomic/asm_amd64.s') diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s index 90c56424c9..80fb31285d 100644 --- a/src/runtime/internal/atomic/asm_amd64.s +++ b/src/runtime/internal/atomic/asm_amd64.s @@ -136,6 +136,12 @@ TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12 TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12 JMP runtime∕internal∕atomic·Store(SB) +TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16 + JMP runtime∕internal∕atomic·Store64(SB) + +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP runtime∕internal∕atomic·Store64(SB) + TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9 MOVQ ptr+0(FP), BX MOVB val+8(FP), AX -- cgit v1.3 From ad61343f886cc5ce677e7bd62385144b2ba7b8f5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 8 Oct 2020 14:38:39 -0400 Subject: runtime/internal/atomic: add 32-bit And/Or These will be used in a following CL to perform larger bit clear and bit set than And8/Or8. Change-Id: I60f7b1099e29b69eb64add77564faee862880a8d Reviewed-on: https://go-review.googlesource.com/c/go/+/260977 Run-TryBot: Michael Pratt TryBot-Result: Go Bot Reviewed-by: Cherry Zhang Trust: Michael Pratt --- src/runtime/internal/atomic/asm_386.s | 16 ++++ src/runtime/internal/atomic/asm_amd64.s | 16 ++++ src/runtime/internal/atomic/asm_mips64x.s | 26 ++++++ src/runtime/internal/atomic/asm_mipsx.s | 26 ++++++ src/runtime/internal/atomic/asm_ppc64x.s | 30 ++++++- src/runtime/internal/atomic/asm_s390x.s | 22 ++++- src/runtime/internal/atomic/atomic_386.go | 6 ++ src/runtime/internal/atomic/atomic_amd64.go | 6 ++ src/runtime/internal/atomic/atomic_arm.go | 20 +++++ src/runtime/internal/atomic/atomic_arm64.go | 6 ++ src/runtime/internal/atomic/atomic_arm64.s | 19 ++++ src/runtime/internal/atomic/atomic_mips64x.go | 6 ++ src/runtime/internal/atomic/atomic_mipsx.go | 6 ++ src/runtime/internal/atomic/atomic_ppc64x.go | 6 ++ src/runtime/internal/atomic/atomic_riscv64.go | 6 ++ src/runtime/internal/atomic/atomic_riscv64.s | 14 +++ src/runtime/internal/atomic/atomic_s390x.go | 6 ++ src/runtime/internal/atomic/atomic_test.go | 119 +++++++++++++++++++++++++- src/runtime/internal/atomic/atomic_wasm.go | 12 +++ src/runtime/internal/atomic/bench_test.go | 40 +++++++++ 20 files changed, 400 insertions(+), 8 deletions(-) (limited to 'src/runtime/internal/atomic/asm_amd64.s') diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s index 7ebf675ac5..d82faef1f0 100644 --- a/src/runtime/internal/atomic/asm_386.s +++ b/src/runtime/internal/atomic/asm_386.s @@ -243,3 +243,19 @@ TEXT ·Store8(SB), NOSPLIT, $0-5 MOVB val+4(FP), AX XCHGB AX, 0(BX) RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), AX + MOVL val+4(FP), BX + LOCK + ORL BX, (AX) + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), AX + MOVL val+4(FP), BX + LOCK + ANDL BX, (AX) + RET diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s index 80fb31285d..2cf7c55870 100644 --- a/src/runtime/internal/atomic/asm_amd64.s +++ b/src/runtime/internal/atomic/asm_amd64.s @@ -169,3 +169,19 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9 LOCK ANDB BX, (AX) RET + +// func Or(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ORL BX, (AX) + RET + +// func And(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ANDL BX, (AX) + RET diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s index 03fb822929..a515683ebb 100644 --- a/src/runtime/internal/atomic/asm_mips64x.s +++ b/src/runtime/internal/atomic/asm_mips64x.s @@ -243,3 +243,29 @@ TEXT ·And8(SB), NOSPLIT, $0-9 BEQ R4, -4(PC) SYNC RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVV ptr+0(FP), R1 + MOVW val+8(FP), R2 + + SYNC + LL (R1), R3 + OR R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVV ptr+0(FP), R1 + MOVW val+8(FP), R2 + + SYNC + LL (R1), R3 + AND R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s index 63bb548825..2b2cfabe08 100644 --- a/src/runtime/internal/atomic/asm_mipsx.s +++ b/src/runtime/internal/atomic/asm_mipsx.s @@ -172,3 +172,29 @@ try_and8: BEQ R4, try_and8 SYNC RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-8 + MOVW ptr+0(FP), R1 + MOVW val+4(FP), R2 + + SYNC + LL (R1), R3 + OR R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-8 + MOVW ptr+0(FP), R1 + MOVW val+4(FP), R2 + + SYNC + LL (R1), R3 + AND R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s index c0237de4d0..bb009ab34d 100644 --- a/src/runtime/internal/atomic/asm_ppc64x.s +++ b/src/runtime/internal/atomic/asm_ppc64x.s @@ -222,8 +222,32 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9 MOVBZ val+8(FP), R4 LWSYNC again: - LBAR (R3),R6 - AND R4,R6 - STBCCC R6,(R3) + LBAR (R3), R6 + AND R4, R6 + STBCCC R6, (R3) + BNE again + RET + +// func Or(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LWSYNC +again: + LWAR (R3), R6 + OR R4, R6 + STWCCC R6, (R3) + BNE again + RET + +// func And(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LWSYNC +again: + LWAR (R3),R6 + AND R4, R6 + STWCCC R6, (R3) BNE again RET diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s index 9a19bc0ece..daf1f3cc9f 100644 --- a/src/runtime/internal/atomic/asm_s390x.s +++ b/src/runtime/internal/atomic/asm_s390x.s @@ -174,8 +174,8 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24 // func Or8(addr *uint8, v uint8) TEXT ·Or8(SB), NOSPLIT, $0-9 - MOVD ptr+0(FP), R3 - MOVBZ val+8(FP), R4 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 // We don't have atomic operations that work on individual bytes so we // need to align addr down to a word boundary and create a mask // containing v to OR with the entire word atomically. @@ -188,8 +188,8 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 // func And8(addr *uint8, v uint8) TEXT ·And8(SB), NOSPLIT, $0-9 - MOVD ptr+0(FP), R3 - MOVBZ val+8(FP), R4 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 // We don't have atomic operations that work on individual bytes so we // need to align addr down to a word boundary and create a mask // containing v to AND with the entire word atomically. @@ -200,3 +200,17 @@ TEXT ·And8(SB), NOSPLIT, $0-9 RLL R5, R4, R4 // R4 = rotl(R4, R5) LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic) RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LAO R4, R6, 0(R3) // R6 = *R3; *R3 |= R4; (atomic) + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic) + RET diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go index 06ce6a5356..1bfcb1143d 100644 --- a/src/runtime/internal/atomic/atomic_386.go +++ b/src/runtime/internal/atomic/atomic_386.go @@ -69,6 +69,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape diff --git a/src/runtime/internal/atomic/atomic_amd64.go b/src/runtime/internal/atomic/atomic_amd64.go index 1b71a16d94..e36eb83a11 100644 --- a/src/runtime/internal/atomic/atomic_amd64.go +++ b/src/runtime/internal/atomic/atomic_amd64.go @@ -77,6 +77,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go index 67d529c1cb..546b3d6120 100644 --- a/src/runtime/internal/atomic/atomic_arm.go +++ b/src/runtime/internal/atomic/atomic_arm.go @@ -182,6 +182,26 @@ func And8(addr *uint8, v uint8) { } } +//go:nosplit +func Or(addr *uint32, v uint32) { + for { + old := *addr + if Cas(addr, old, old|v) { + return + } + } +} + +//go:nosplit +func And(addr *uint32, v uint32) { + for { + old := *addr + if Cas(addr, old, old&v) { + return + } + } +} + //go:nosplit func armcas(ptr *uint32, old, new uint32) bool diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go index c9b4322fe9..d49bee8936 100644 --- a/src/runtime/internal/atomic/atomic_arm64.go +++ b/src/runtime/internal/atomic/atomic_arm64.go @@ -53,6 +53,12 @@ func Or8(ptr *uint8, val uint8) //go:noescape func And8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s index 36c7698b18..0cf3c40223 100644 --- a/src/runtime/internal/atomic/atomic_arm64.s +++ b/src/runtime/internal/atomic/atomic_arm64.s @@ -164,3 +164,22 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 CBNZ R3, -3(PC) RET +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R0 + MOVW val+8(FP), R1 + LDAXRW (R0), R2 + AND R1, R2 + STLXRW R2, (R0), R3 + CBNZ R3, -3(PC) + RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R0 + MOVW val+8(FP), R1 + LDAXRW (R0), R2 + ORR R1, R2 + STLXRW R2, (R0), R3 + CBNZ R3, -3(PC) + RET diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go index fca2242514..b0109d72b0 100644 --- a/src/runtime/internal/atomic/atomic_mips64x.go +++ b/src/runtime/internal/atomic/atomic_mips64x.go @@ -55,6 +55,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go index be1e6a038b..1336b50121 100644 --- a/src/runtime/internal/atomic/atomic_mipsx.go +++ b/src/runtime/internal/atomic/atomic_mipsx.go @@ -141,6 +141,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Store(ptr *uint32, val uint32) diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go index e759bb27a2..e4b109f0ec 100644 --- a/src/runtime/internal/atomic/atomic_ppc64x.go +++ b/src/runtime/internal/atomic/atomic_ppc64x.go @@ -55,6 +55,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go index 617bc1a3eb..8f24d61625 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.go +++ b/src/runtime/internal/atomic/atomic_riscv64.go @@ -51,6 +51,12 @@ func Or8(ptr *uint8, val uint8) //go:noescape func And8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Cas64(ptr *uint64, old, new uint64) bool diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s index db139d690a..74c896cea6 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.s +++ b/src/runtime/internal/atomic/atomic_riscv64.s @@ -242,3 +242,17 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 SLL A2, A1 AMOORW A1, (A0), ZERO RET + +// func And(ptr *uint32, val uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOV ptr+0(FP), A0 + MOVW val+8(FP), A1 + AMOANDW A1, (A0), ZERO + RET + +// func Or(ptr *uint32, val uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOV ptr+0(FP), A0 + MOVW val+8(FP), A1 + AMOORW A1, (A0), ZERO + RET diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go index b649caa39f..a058d60102 100644 --- a/src/runtime/internal/atomic/atomic_s390x.go +++ b/src/runtime/internal/atomic/atomic_s390x.go @@ -91,6 +91,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + //go:noescape func Xadd(ptr *uint32, delta int32) uint32 diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go index a9f95077c0..c9c2eba248 100644 --- a/src/runtime/internal/atomic/atomic_test.go +++ b/src/runtime/internal/atomic/atomic_test.go @@ -150,6 +150,45 @@ func TestAnd8(t *testing.T) { } } +func TestAnd(t *testing.T) { + // Basic sanity check. + x := uint32(0xffffffff) + for i := uint32(0); i < 32; i++ { + atomic.And(&x, ^(1 << i)) + if r := uint32(0xffffffff) << (i + 1); x != r { + t.Fatalf("clearing bit %#x: want %#x, got %#x", uint32(1<