From afba990169f41d9026c923da5235584db32cab67 Mon Sep 17 00:00:00 2001
From: Austin Clements <austin@google.com>
Date: Thu, 15 Oct 2020 16:11:10 -0400
Subject: runtime/internal/atomic: drop package prefixes

This drops package prefixes from the assembly code on 386 and arm. In
addition to just being nicer, this allows the assembler to
automatically pick up the argument stack map from the Go signatures of
these functions. This doesn't matter right now because these functions
never call back out to Go, but prepares us for the next CL.

Change-Id: I90fed7d4dd63ad49274529c62804211b6390e2e9
Reviewed-on: https://go-review.googlesource.com/c/go/+/262777
Trust: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
---
 src/runtime/internal/atomic/asm_386.s | 76 +++++++++++++++++------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

(limited to 'src/runtime/internal/atomic/asm_386.s')

diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
index 9b9dc14a60..357ca95625 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -11,7 +11,7 @@
 //		return 1;
 //	}else
 //		return 0;
-TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13
+TEXT ·Cas(SB), NOSPLIT, $0-13
 	MOVL	ptr+0(FP), BX
 	MOVL	old+4(FP), AX
 	MOVL	new+8(FP), CX
@@ -20,32 +20,32 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13
 	SETEQ	ret+12(FP)
 	RET
 
-TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-13
-	JMP	runtime∕internal∕atomic·Cas(SB)
+TEXT ·Casuintptr(SB), NOSPLIT, $0-13
+	JMP	·Cas(SB)
 
-TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-13
-	JMP	runtime∕internal∕atomic·Cas(SB)
+TEXT ·CasRel(SB), NOSPLIT, $0-13
+	JMP	·Cas(SB)
 
-TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-8
-	JMP	runtime∕internal∕atomic·Load(SB)
+TEXT ·Loaduintptr(SB), NOSPLIT, $0-8
+	JMP	·Load(SB)
 
-TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-8
-	JMP	runtime∕internal∕atomic·Load(SB)
+TEXT ·Loaduint(SB), NOSPLIT, $0-8
+	JMP	·Load(SB)
 
-TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8
-	JMP	runtime∕internal∕atomic·Store(SB)
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-8
+	JMP	·Store(SB)
 
-TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12
-	JMP runtime∕internal∕atomic·Xadd(SB)
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-12
+	JMP	·Xadd(SB)
 
-TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-12
-	JMP runtime∕internal∕atomic·Load64(SB)
+TEXT ·Loadint64(SB), NOSPLIT, $0-12
+	JMP	·Load64(SB)
 
-TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20
-	JMP runtime∕internal∕atomic·Xadd64(SB)
+TEXT ·Xaddint64(SB), NOSPLIT, $0-20
+	JMP	·Xadd64(SB)
 
 
-// bool runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
+// bool ·Cas64(uint64 *val, uint64 old, uint64 new)
 // Atomically:
 //	if(*val == *old){
 //		*val = new;
@@ -53,7 +53,7 @@ TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20
 //	} else {
 //		return 0;
 //	}
-TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21
+TEXT ·Cas64(SB), NOSPLIT, $0-21
 	MOVL	ptr+0(FP), BP
 	TESTL	$7, BP
 	JZ	2(PC)
@@ -74,7 +74,7 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21
 //		return 1;
 //	}else
 //		return 0;
-TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13
+TEXT ·Casp1(SB), NOSPLIT, $0-13
 	MOVL	ptr+0(FP), BX
 	MOVL	old+4(FP), AX
 	MOVL	new+8(FP), CX
@@ -87,7 +87,7 @@ TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13
 // Atomically:
 //	*val += delta;
 //	return *val;
-TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
+TEXT ·Xadd(SB), NOSPLIT, $0-12
 	MOVL	ptr+0(FP), BX
 	MOVL	delta+4(FP), AX
 	MOVL	AX, CX
@@ -97,7 +97,7 @@ TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
 	MOVL	AX, ret+8(FP)
 	RET
 
-TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-20
+TEXT ·Xadd64(SB), NOSPLIT, $0-20
 	// no XADDQ so use CMPXCHG8B loop
 	MOVL	ptr+0(FP), BP
 	TESTL	$7, BP
@@ -133,17 +133,17 @@ addloop:
 	MOVL	CX, ret_hi+16(FP)
 	RET
 
-TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12
+TEXT ·Xchg(SB), NOSPLIT, $0-12
 	MOVL	ptr+0(FP), BX
 	MOVL	new+4(FP), AX
 	XCHGL	AX, 0(BX)
 	MOVL	AX, ret+8(FP)
 	RET
 
-TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
-	JMP	runtime∕internal∕atomic·Xchg(SB)
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-12
+	JMP	·Xchg(SB)
 
-TEXT  runtime∕internal∕atomic·Xchg64(SB),NOSPLIT,$0-20
+TEXT ·Xchg64(SB),NOSPLIT,$0-20
 	// no XCHGQ so use CMPXCHG8B loop
 	MOVL	ptr+0(FP), BP
 	TESTL	$7, BP
@@ -171,23 +171,23 @@ swaploop:
 	MOVL	DX, ret_hi+16(FP)
 	RET
 
-TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-8
 	MOVL	ptr+0(FP), BX
 	MOVL	val+4(FP), AX
 	XCHGL	AX, 0(BX)
 	RET
 
-TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8
+TEXT ·Store(SB), NOSPLIT, $0-8
 	MOVL	ptr+0(FP), BX
 	MOVL	val+4(FP), AX
 	XCHGL	AX, 0(BX)
 	RET
 
-TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-8
-	JMP	runtime∕internal∕atomic·Store(SB)
+TEXT ·StoreRel(SB), NOSPLIT, $0-8
+	JMP	·Store(SB)
 
 // uint64 atomicload64(uint64 volatile* addr);
-TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12
+TEXT ·Load64(SB), NOSPLIT, $0-12
 	MOVL	ptr+0(FP), AX
 	TESTL	$7, AX
 	JZ	2(PC)
@@ -197,8 +197,8 @@ TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12
 	EMMS
 	RET
 
-// void runtime∕internal∕atomic·Store64(uint64 volatile* addr, uint64 v);
-TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12
+// void ·Store64(uint64 volatile* addr, uint64 v);
+TEXT ·Store64(SB), NOSPLIT, $0-12
 	MOVL	ptr+0(FP), AX
 	TESTL	$7, AX
 	JZ	2(PC)
@@ -214,23 +214,23 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12
 	XADDL	AX, (SP)
 	RET
 
-// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5
+// void	·Or8(byte volatile*, byte);
+TEXT ·Or8(SB), NOSPLIT, $0-5
 	MOVL	ptr+0(FP), AX
 	MOVB	val+4(FP), BX
 	LOCK
 	ORB	BX, (AX)
 	RET
 
-// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5
+// void	·And8(byte volatile*, byte);
+TEXT ·And8(SB), NOSPLIT, $0-5
 	MOVL	ptr+0(FP), AX
 	MOVB	val+4(FP), BX
 	LOCK
 	ANDB	BX, (AX)
 	RET
 
-TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-5
+TEXT ·Store8(SB), NOSPLIT, $0-5
 	MOVL	ptr+0(FP), BX
 	MOVB	val+4(FP), AX
 	XCHGB	AX, 0(BX)
-- 
cgit v1.3


From 83317d9e3cb0674f71d1118d8814aefb31ac1239 Mon Sep 17 00:00:00 2001
From: Austin Clements <austin@google.com>
Date: Thu, 15 Oct 2020 15:52:58 -0400
Subject: runtime/internal/atomic: panic nicely on unaligned 64-bit atomics

On 386 and arm, unaligned 64-bit atomics aren't safe, so we check for
this and panic. Currently, we panic by dereferencing nil, which may be
expedient but is pretty user-hostile since it gives no hint of what
the actual problem was.

This CL replaces this with an actual panic. The only subtlety here is
now the atomic assembly implementations are calling back into Go, so
they have to play nicely with stack maps and stack scanning. On 386,
this just requires declaring NO_LOCAL_POINTERS. On arm, this is
somewhat more complicated: first, we have to move the alignment check
into the functions that have Go signatures. Then we have to support
both the tail call from these functions to the underlying
implementation (which requires that they have no frame) and the call
into Go to panic (which requires that they have a frame). We resolve
this by forcing them to have no frame and setting up the frame
manually just before the panic call.

Change-Id: I19f1e860045df64088013db37a18acea47342c69
Reviewed-on: https://go-review.googlesource.com/c/go/+/262778
Trust: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
---
 src/runtime/internal/atomic/asm_386.s       | 17 ++++--
 src/runtime/internal/atomic/asm_arm.s       | 89 +++++++++++++++++++----------
 src/runtime/internal/atomic/atomic_mipsx.go |  2 +-
 src/runtime/internal/atomic/atomic_test.go  |  9 ++-
 src/runtime/internal/atomic/unaligned.go    |  9 +++
 src/sync/atomic/atomic_test.go              |  9 ++-
 6 files changed, 96 insertions(+), 39 deletions(-)
 create mode 100644 src/runtime/internal/atomic/unaligned.go

(limited to 'src/runtime/internal/atomic/asm_386.s')

diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
index 357ca95625..bcefff373f 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
+#include "funcdata.h"
 
 // bool Cas(int32 *val, int32 old, int32 new)
 // Atomically:
@@ -44,7 +45,6 @@ TEXT ·Loadint64(SB), NOSPLIT, $0-12
 TEXT ·Xaddint64(SB), NOSPLIT, $0-20
 	JMP	·Xadd64(SB)
 
-
 // bool ·Cas64(uint64 *val, uint64 old, uint64 new)
 // Atomically:
 //	if(*val == *old){
@@ -54,10 +54,11 @@ TEXT ·Xaddint64(SB), NOSPLIT, $0-20
 //		return 0;
 //	}
 TEXT ·Cas64(SB), NOSPLIT, $0-21
+	NO_LOCAL_POINTERS
 	MOVL	ptr+0(FP), BP
 	TESTL	$7, BP
 	JZ	2(PC)
-	MOVL	0, BP // crash with nil ptr deref
+	CALL	·panicUnaligned(SB)
 	MOVL	old_lo+4(FP), AX
 	MOVL	old_hi+8(FP), DX
 	MOVL	new_lo+12(FP), BX
@@ -98,11 +99,12 @@ TEXT ·Xadd(SB), NOSPLIT, $0-12
 	RET
 
 TEXT ·Xadd64(SB), NOSPLIT, $0-20
+	NO_LOCAL_POINTERS
 	// no XADDQ so use CMPXCHG8B loop
 	MOVL	ptr+0(FP), BP
 	TESTL	$7, BP
 	JZ	2(PC)
-	MOVL	0, AX // crash when unaligned
+	CALL	·panicUnaligned(SB)
 	// DI:SI = delta
 	MOVL	delta_lo+4(FP), SI
 	MOVL	delta_hi+8(FP), DI
@@ -144,11 +146,12 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-12
 	JMP	·Xchg(SB)
 
 TEXT ·Xchg64(SB),NOSPLIT,$0-20
+	NO_LOCAL_POINTERS
 	// no XCHGQ so use CMPXCHG8B loop
 	MOVL	ptr+0(FP), BP
 	TESTL	$7, BP
 	JZ	2(PC)
-	MOVL	0, AX // crash when unaligned
+	CALL	·panicUnaligned(SB)
 	// CX:BX = new
 	MOVL	new_lo+4(FP), BX
 	MOVL	new_hi+8(FP), CX
@@ -188,10 +191,11 @@ TEXT ·StoreRel(SB), NOSPLIT, $0-8
 
 // uint64 atomicload64(uint64 volatile* addr);
 TEXT ·Load64(SB), NOSPLIT, $0-12
+	NO_LOCAL_POINTERS
 	MOVL	ptr+0(FP), AX
 	TESTL	$7, AX
 	JZ	2(PC)
-	MOVL	0, AX // crash with nil ptr deref
+	CALL	·panicUnaligned(SB)
 	MOVQ	(AX), M0
 	MOVQ	M0, ret+4(FP)
 	EMMS
@@ -199,10 +203,11 @@ TEXT ·Load64(SB), NOSPLIT, $0-12
 
 // void ·Store64(uint64 volatile* addr, uint64 v);
 TEXT ·Store64(SB), NOSPLIT, $0-12
+	NO_LOCAL_POINTERS
 	MOVL	ptr+0(FP), AX
 	TESTL	$7, AX
 	JZ	2(PC)
-	MOVL	0, AX // crash with nil ptr deref
+	CALL	·panicUnaligned(SB)
 	// MOVQ and EMMS were introduced on the Pentium MMX.
 	MOVQ	val+4(FP), M0
 	MOVQ	M0, (AX)
diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s
index db1267423d..c3d1d9025d 100644
--- a/src/runtime/internal/atomic/asm_arm.s
+++ b/src/runtime/internal/atomic/asm_arm.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
+#include "funcdata.h"
 
 // bool armcas(int32 *val, int32 old, int32 new)
 // Atomically:
@@ -96,11 +97,7 @@ TEXT ·Xaddint64(SB),NOSPLIT,$0-20
 // atomics with locks.
 
 TEXT armCas64<>(SB),NOSPLIT,$0-21
-	MOVW	addr+0(FP), R1
-	// make unaligned atomic access panic
-	AND.S	$7, R1, R2
-	BEQ 	2(PC)
-	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	// addr is already in R1
 	MOVW	old_lo+4(FP), R2
 	MOVW	old_hi+8(FP), R3
 	MOVW	new_lo+12(FP), R4
@@ -129,11 +126,7 @@ cas64fail:
 	RET
 
 TEXT armXadd64<>(SB),NOSPLIT,$0-20
-	MOVW	addr+0(FP), R1
-	// make unaligned atomic access panic
-	AND.S	$7, R1, R2
-	BEQ 	2(PC)
-	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	// addr is already in R1
 	MOVW	delta_lo+4(FP), R2
 	MOVW	delta_hi+8(FP), R3
 
@@ -155,11 +148,7 @@ add64loop:
 	RET
 
 TEXT armXchg64<>(SB),NOSPLIT,$0-20
-	MOVW	addr+0(FP), R1
-	// make unaligned atomic access panic
-	AND.S	$7, R1, R2
-	BEQ 	2(PC)
-	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	// addr is already in R1
 	MOVW	new_lo+4(FP), R2
 	MOVW	new_hi+8(FP), R3
 
@@ -179,11 +168,7 @@ swap64loop:
 	RET
 
 TEXT armLoad64<>(SB),NOSPLIT,$0-12
-	MOVW	addr+0(FP), R1
-	// make unaligned atomic access panic
-	AND.S	$7, R1, R2
-	BEQ 	2(PC)
-	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	// addr is already in R1
 
 	LDREXD	(R1), R2	// loads R2 and R3
 	DMB	MB_ISH
@@ -193,11 +178,7 @@ TEXT armLoad64<>(SB),NOSPLIT,$0-12
 	RET
 
 TEXT armStore64<>(SB),NOSPLIT,$0-12
-	MOVW	addr+0(FP), R1
-	// make unaligned atomic access panic
-	AND.S	$7, R1, R2
-	BEQ 	2(PC)
-	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	// addr is already in R1
 	MOVW	val_lo+4(FP), R2
 	MOVW	val_hi+8(FP), R3
 
@@ -213,35 +194,83 @@ store64loop:
 	DMB	MB_ISH
 	RET
 
-TEXT ·Cas64(SB),NOSPLIT,$0-21
+// The following functions all panic if their address argument isn't
+// 8-byte aligned. Since we're calling back into Go code to do this,
+// we have to cooperate with stack unwinding. In the normal case, the
+// functions tail-call into the appropriate implementation, which
+// means they must not open a frame. Hence, when they go down the
+// panic path, at that point they push the LR to create a real frame
+// (they don't need to pop it because panic won't return).
+
+TEXT ·Cas64(SB),NOSPLIT,$-4-21
+	NO_LOCAL_POINTERS
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ 	3(PC)
+	MOVW.W	R14, -4(R13) // prepare a real frame
+	BL	·panicUnaligned(SB)
+
 	MOVB	runtime·goarm(SB), R11
 	CMP	$7, R11
 	BLT	2(PC)
 	JMP	armCas64<>(SB)
 	JMP	·goCas64(SB)
 
-TEXT ·Xadd64(SB),NOSPLIT,$0-20
+TEXT ·Xadd64(SB),NOSPLIT,$-4-20
+	NO_LOCAL_POINTERS
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ 	3(PC)
+	MOVW.W	R14, -4(R13) // prepare a real frame
+	BL	·panicUnaligned(SB)
+
 	MOVB	runtime·goarm(SB), R11
 	CMP	$7, R11
 	BLT	2(PC)
 	JMP	armXadd64<>(SB)
 	JMP	·goXadd64(SB)
 
-TEXT ·Xchg64(SB),NOSPLIT,$0-20
+TEXT ·Xchg64(SB),NOSPLIT,$-4-20
+	NO_LOCAL_POINTERS
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ 	3(PC)
+	MOVW.W	R14, -4(R13) // prepare a real frame
+	BL	·panicUnaligned(SB)
+
 	MOVB	runtime·goarm(SB), R11
 	CMP	$7, R11
 	BLT	2(PC)
 	JMP	armXchg64<>(SB)
 	JMP	·goXchg64(SB)
 
-TEXT ·Load64(SB),NOSPLIT,$0-12
+TEXT ·Load64(SB),NOSPLIT,$-4-12
+	NO_LOCAL_POINTERS
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ 	3(PC)
+	MOVW.W	R14, -4(R13) // prepare a real frame
+	BL	·panicUnaligned(SB)
+
 	MOVB	runtime·goarm(SB), R11
 	CMP	$7, R11
 	BLT	2(PC)
 	JMP	armLoad64<>(SB)
 	JMP	·goLoad64(SB)
 
-TEXT ·Store64(SB),NOSPLIT,$0-12
+TEXT ·Store64(SB),NOSPLIT,$-4-12
+	NO_LOCAL_POINTERS
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ 	3(PC)
+	MOVW.W	R14, -4(R13) // prepare a real frame
+	BL	·panicUnaligned(SB)
+
 	MOVB	runtime·goarm(SB), R11
 	CMP	$7, R11
 	BLT	2(PC)
diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go
index 0e2d77ade1..b99bfe7dbf 100644
--- a/src/runtime/internal/atomic/atomic_mipsx.go
+++ b/src/runtime/internal/atomic/atomic_mipsx.go
@@ -34,7 +34,7 @@ func spinUnlock(state *uint32)
 func lockAndCheck(addr *uint64) {
 	// ensure 8-byte alignment
 	if uintptr(unsafe.Pointer(addr))&7 != 0 {
-		addr = nil
+		panicUnaligned()
 	}
 	// force dereference before taking lock
 	_ = *addr
diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go
index b0a8fa0610..a9f95077c0 100644
--- a/src/runtime/internal/atomic/atomic_test.go
+++ b/src/runtime/internal/atomic/atomic_test.go
@@ -73,8 +73,15 @@ func TestXadduintptrOnUint64(t *testing.T) {
 
 func shouldPanic(t *testing.T, name string, f func()) {
 	defer func() {
-		if recover() == nil {
+		// Check that all GC maps are sane.
+		runtime.GC()
+
+		err := recover()
+		want := "unaligned 64-bit atomic operation"
+		if err == nil {
 			t.Errorf("%s did not panic", name)
+		} else if s, _ := err.(string); s != want {
+			t.Errorf("%s: wanted panic %q, got %q", name, want, err)
 		}
 	}()
 	f()
diff --git a/src/runtime/internal/atomic/unaligned.go b/src/runtime/internal/atomic/unaligned.go
new file mode 100644
index 0000000000..a859de4144
--- /dev/null
+++ b/src/runtime/internal/atomic/unaligned.go
@@ -0,0 +1,9 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+func panicUnaligned() {
+	panic("unaligned 64-bit atomic operation")
+}
diff --git a/src/sync/atomic/atomic_test.go b/src/sync/atomic/atomic_test.go
index 83e7c8d763..eadc962f70 100644
--- a/src/sync/atomic/atomic_test.go
+++ b/src/sync/atomic/atomic_test.go
@@ -1397,8 +1397,15 @@ func TestStoreLoadRelAcq64(t *testing.T) {
 
 func shouldPanic(t *testing.T, name string, f func()) {
 	defer func() {
-		if recover() == nil {
+		// Check that all GC maps are sane.
+		runtime.GC()
+
+		err := recover()
+		want := "unaligned 64-bit atomic operation"
+		if err == nil {
 			t.Errorf("%s did not panic", name)
+		} else if s, _ := err.(string); s != want {
+			t.Errorf("%s: wanted panic %q, got %q", name, want, err)
 		}
 	}()
 	f()
-- 
cgit v1.3


From 15ead857dbc638b9d83a7686acf0dc746fc45918 Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy" <murp@ibm.com>
Date: Wed, 9 Sep 2020 17:24:23 -0500
Subject: cmd/compiler,cmd/go,sync: add internal {LoadAcq,StoreRel}64 on ppc64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an internal atomic intrinsic for load with acquire semantics
(extending LoadAcq to 64b) and add LoadAcquintptr for internal
use within the sync package.  For other arches, this remaps to the
appropriate atomic.Load{,64} intrinsic which should not alter code
generation.

Similarly, add StoreRel{uintptr,64} for consistency, and inline.

Finally, add an exception to allow sync to directly use the
runtime/internal/atomic package which avoids more convoluted
workarounds (contributed by Lynn Boger).

In an extreme example, sync.(*Pool).pin consumes 20% of wall time
during fmt tests.  This is reduced to 5% on ppc64le/power9.

From the fmt benchmarks on ppc64le:

name                           old time/op  new time/op  delta
SprintfPadding                  468ns ± 0%   451ns ± 0%   -3.63%
SprintfEmpty                   73.3ns ± 0%  51.9ns ± 0%  -29.20%
SprintfString                   135ns ± 0%   122ns ± 0%   -9.63%
SprintfTruncateString           232ns ± 0%   214ns ± 0%   -7.76%
SprintfTruncateBytes            216ns ± 0%   202ns ± 0%   -6.48%
SprintfSlowParsingPath          162ns ± 0%   142ns ± 0%  -12.35%
SprintfQuoteString             1.00µs ± 0%  0.99µs ± 0%   -1.39%
SprintfInt                      117ns ± 0%   104ns ± 0%  -11.11%
SprintfIntInt                   190ns ± 0%   175ns ± 0%   -7.89%
SprintfPrefixedInt              232ns ± 0%   212ns ± 0%   -8.62%
SprintfFloat                    270ns ± 0%   255ns ± 0%   -5.56%
SprintfComplex                 1.01µs ± 0%  0.99µs ± 0%   -1.68%
SprintfBoolean                  127ns ± 0%   111ns ± 0%  -12.60%
SprintfHexString                220ns ± 0%   198ns ± 0%  -10.00%
SprintfHexBytes                 261ns ± 0%   252ns ± 0%   -3.45%
SprintfBytes                    600ns ± 0%   590ns ± 0%   -1.67%
SprintfStringer                 684ns ± 0%   658ns ± 0%   -3.80%
SprintfStructure               2.57µs ± 0%  2.57µs ± 0%   -0.12%
ManyArgs                        669ns ± 0%   646ns ± 0%   -3.44%
FprintInt                       140ns ± 0%   136ns ± 0%   -2.86%
FprintfBytes                    184ns ± 0%   181ns ± 0%   -1.63%
FprintIntNoAlloc                140ns ± 0%   136ns ± 0%   -2.86%
ScanInts                        929µs ± 0%   921µs ± 0%   -0.79%
ScanRecursiveInt                122ms ± 0%   121ms ± 0%   -0.11%
ScanRecursiveIntReaderWrapper   122ms ± 0%   122ms ± 0%   -0.18%

Change-Id: I4d66780261b57b06ef600229e475462e7313f0d6
Reviewed-on: https://go-review.googlesource.com/c/go/+/253748
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Keith Randall <khr@golang.org>
Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
---
 src/cmd/compile/internal/gc/ssa.go             | 19 ++++++++++++++
 src/cmd/compile/internal/ssa/branchelim.go     |  2 +-
 src/cmd/compile/internal/ssa/gen/PPC64.rules   |  4 +--
 src/cmd/compile/internal/ssa/gen/genericOps.go |  2 ++
 src/cmd/compile/internal/ssa/opGen.go          | 13 ++++++++++
 src/cmd/compile/internal/ssa/rewritePPC64.go   | 34 ++++++++++++++++++++++++++
 src/cmd/go/internal/load/pkg.go                |  5 ++++
 src/runtime/internal/atomic/asm_386.s          |  3 +++
 src/runtime/internal/atomic/asm_amd64.s        |  6 +++++
 src/runtime/internal/atomic/asm_arm.s          |  6 +++++
 src/runtime/internal/atomic/asm_mips64x.s      |  6 +++++
 src/runtime/internal/atomic/asm_mipsx.s        |  3 +++
 src/runtime/internal/atomic/asm_ppc64x.s       | 13 ++++++++++
 src/runtime/internal/atomic/atomic_386.go      |  9 +++++++
 src/runtime/internal/atomic/atomic_amd64.go    | 18 ++++++++++++++
 src/runtime/internal/atomic/atomic_arm.go      |  6 +++++
 src/runtime/internal/atomic/atomic_arm64.go    | 12 +++++++++
 src/runtime/internal/atomic/atomic_arm64.s     | 14 +++++++++++
 src/runtime/internal/atomic/atomic_mips64x.go  | 12 +++++++++
 src/runtime/internal/atomic/atomic_mips64x.s   |  8 ++++++
 src/runtime/internal/atomic/atomic_mipsx.go    |  6 +++++
 src/runtime/internal/atomic/atomic_ppc64x.go   | 12 +++++++++
 src/runtime/internal/atomic/atomic_ppc64x.s    | 20 +++++++++++++++
 src/runtime/internal/atomic/atomic_riscv64.go  | 12 +++++++++
 src/runtime/internal/atomic/atomic_riscv64.s   | 12 +++++++++
 src/runtime/internal/atomic/atomic_s390x.go    | 24 ++++++++++++++++++
 src/runtime/internal/atomic/atomic_wasm.go     | 24 ++++++++++++++++++
 src/sync/pool.go                               | 15 ++++++------
 28 files changed, 310 insertions(+), 10 deletions(-)

(limited to 'src/runtime/internal/atomic/asm_386.s')

diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index d8f627c213..e1455d2c3f 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3389,6 +3389,13 @@ func init() {
 			return s.newValue1(ssa.OpSelect0, types.Types[TUINT32], v)
 		},
 		sys.PPC64, sys.S390X)
+	addF("runtime/internal/atomic", "LoadAcq64",
+		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[TUINT64], types.TypeMem), args[0], s.mem())
+			s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[TUINT64], v)
+		},
+		sys.PPC64)
 	addF("runtime/internal/atomic", "Loadp",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
@@ -3427,6 +3434,12 @@ func init() {
 			return nil
 		},
 		sys.PPC64, sys.S390X)
+	addF("runtime/internal/atomic", "StoreRel64",
+		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			s.vars[&memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.PPC64)
 
 	addF("runtime/internal/atomic", "Xchg",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
@@ -3542,9 +3555,15 @@ func init() {
 	alias("runtime/internal/atomic", "Loaduintptr", "runtime/internal/atomic", "Load", p4...)
 	alias("runtime/internal/atomic", "Loaduintptr", "runtime/internal/atomic", "Load64", p8...)
 	alias("runtime/internal/atomic", "LoadAcq", "runtime/internal/atomic", "Load", lwatomics...)
+	alias("runtime/internal/atomic", "LoadAcq64", "runtime/internal/atomic", "Load64", lwatomics...)
+	alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq", p4...)
+	alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq64", p8...)
 	alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store", p4...)
 	alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store64", p8...)
 	alias("runtime/internal/atomic", "StoreRel", "runtime/internal/atomic", "Store", lwatomics...)
+	alias("runtime/internal/atomic", "StoreRel64", "runtime/internal/atomic", "Store64", lwatomics...)
+	alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel", p4...)
+	alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel64", p8...)
 	alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg", p4...)
 	alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg64", p8...)
 	alias("runtime/internal/atomic", "Xadduintptr", "runtime/internal/atomic", "Xadd", p4...)
diff --git a/src/cmd/compile/internal/ssa/branchelim.go b/src/cmd/compile/internal/ssa/branchelim.go
index 4f9fd8e22e..1d34f8160b 100644
--- a/src/cmd/compile/internal/ssa/branchelim.go
+++ b/src/cmd/compile/internal/ssa/branchelim.go
@@ -35,7 +35,7 @@ func branchelim(f *Func) {
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
 			switch v.Op {
-			case OpLoad, OpAtomicLoad8, OpAtomicLoad32, OpAtomicLoad64, OpAtomicLoadPtr, OpAtomicLoadAcq32:
+			case OpLoad, OpAtomicLoad8, OpAtomicLoad32, OpAtomicLoad64, OpAtomicLoadPtr, OpAtomicLoadAcq32, OpAtomicLoadAcq64:
 				loadAddr.add(v.Args[0].ID)
 			case OpMove:
 				loadAddr.add(v.Args[1].ID)
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules
index a05cfee654..11b1a318fe 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -967,10 +967,10 @@
 
 // atomic intrinsics
 (AtomicLoad(8|32|64|Ptr)  ptr mem) => (LoweredAtomicLoad(8|32|64|Ptr) [1] ptr mem)
-(AtomicLoadAcq32        ptr mem) => (LoweredAtomicLoad32 [0] ptr mem)
+(AtomicLoadAcq(32|64)     ptr mem) => (LoweredAtomicLoad(32|64) [0] ptr mem)
 
 (AtomicStore(8|32|64)    ptr val mem) => (LoweredAtomicStore(8|32|64) [1] ptr val mem)
-(AtomicStoreRel32        ptr val mem) => (LoweredAtomicStore32 [0] ptr val mem)
+(AtomicStoreRel(32|64)   ptr val mem) => (LoweredAtomicStore(32|64) [0] ptr val mem)
 //(AtomicStorePtrNoWB ptr val mem) => (STLR  ptr val mem)
 
 (AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go
index 85839303c5..12ba9f1fc9 100644
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -550,11 +550,13 @@ var genericOps = []opData{
 	{name: "AtomicLoad64", argLength: 2, typ: "(UInt64,Mem)"},                                  // Load from arg0.  arg1=memory.  Returns loaded value and new memory.
 	{name: "AtomicLoadPtr", argLength: 2, typ: "(BytePtr,Mem)"},                                // Load from arg0.  arg1=memory.  Returns loaded value and new memory.
 	{name: "AtomicLoadAcq32", argLength: 2, typ: "(UInt32,Mem)"},                               // Load from arg0.  arg1=memory.  Lock acquisition, returns loaded value and new memory.
+	{name: "AtomicLoadAcq64", argLength: 2, typ: "(UInt64,Mem)"},                               // Load from arg0.  arg1=memory.  Lock acquisition, returns loaded value and new memory.
 	{name: "AtomicStore8", argLength: 3, typ: "Mem", hasSideEffects: true},                     // Store arg1 to *arg0.  arg2=memory.  Returns memory.
 	{name: "AtomicStore32", argLength: 3, typ: "Mem", hasSideEffects: true},                    // Store arg1 to *arg0.  arg2=memory.  Returns memory.
 	{name: "AtomicStore64", argLength: 3, typ: "Mem", hasSideEffects: true},                    // Store arg1 to *arg0.  arg2=memory.  Returns memory.
 	{name: "AtomicStorePtrNoWB", argLength: 3, typ: "Mem", hasSideEffects: true},               // Store arg1 to *arg0.  arg2=memory.  Returns memory.
 	{name: "AtomicStoreRel32", argLength: 3, typ: "Mem", hasSideEffects: true},                 // Store arg1 to *arg0.  arg2=memory.  Lock release, returns memory.
+	{name: "AtomicStoreRel64", argLength: 3, typ: "Mem", hasSideEffects: true},                 // Store arg1 to *arg0.  arg2=memory.  Lock release, returns memory.
 	{name: "AtomicExchange32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true},        // Store arg1 to *arg0.  arg2=memory.  Returns old contents of *arg0 and new memory.
 	{name: "AtomicExchange64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true},        // Store arg1 to *arg0.  arg2=memory.  Returns old contents of *arg0 and new memory.
 	{name: "AtomicAdd32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true},             // Do *arg0 += arg1.  arg2=memory.  Returns sum and new memory.
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 051550fb17..eae30c79ba 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2839,11 +2839,13 @@ const (
 	OpAtomicLoad64
 	OpAtomicLoadPtr
 	OpAtomicLoadAcq32
+	OpAtomicLoadAcq64
 	OpAtomicStore8
 	OpAtomicStore32
 	OpAtomicStore64
 	OpAtomicStorePtrNoWB
 	OpAtomicStoreRel32
+	OpAtomicStoreRel64
 	OpAtomicExchange32
 	OpAtomicExchange64
 	OpAtomicAdd32
@@ -35429,6 +35431,11 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "AtomicLoadAcq64",
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:           "AtomicStore8",
 		argLen:         3,
@@ -35459,6 +35466,12 @@ var opcodeTable = [...]opInfo{
 		hasSideEffects: true,
 		generic:        true,
 	},
+	{
+		name:           "AtomicStoreRel64",
+		argLen:         3,
+		hasSideEffects: true,
+		generic:        true,
+	},
 	{
 		name:           "AtomicExchange32",
 		argLen:         3,
diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go
index 1b8a5a78ca..a820bc0c4e 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
@@ -82,6 +82,8 @@ func rewriteValuePPC64(v *Value) bool {
 		return rewriteValuePPC64_OpAtomicLoad8(v)
 	case OpAtomicLoadAcq32:
 		return rewriteValuePPC64_OpAtomicLoadAcq32(v)
+	case OpAtomicLoadAcq64:
+		return rewriteValuePPC64_OpAtomicLoadAcq64(v)
 	case OpAtomicLoadPtr:
 		return rewriteValuePPC64_OpAtomicLoadPtr(v)
 	case OpAtomicOr8:
@@ -95,6 +97,8 @@ func rewriteValuePPC64(v *Value) bool {
 		return rewriteValuePPC64_OpAtomicStore8(v)
 	case OpAtomicStoreRel32:
 		return rewriteValuePPC64_OpAtomicStoreRel32(v)
+	case OpAtomicStoreRel64:
+		return rewriteValuePPC64_OpAtomicStoreRel64(v)
 	case OpAvg64u:
 		return rewriteValuePPC64_OpAvg64u(v)
 	case OpBitLen32:
@@ -930,6 +934,20 @@ func rewriteValuePPC64_OpAtomicLoadAcq32(v *Value) bool {
 		return true
 	}
 }
+func rewriteValuePPC64_OpAtomicLoadAcq64(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (AtomicLoadAcq64 ptr mem)
+	// result: (LoweredAtomicLoad64 [0] ptr mem)
+	for {
+		ptr := v_0
+		mem := v_1
+		v.reset(OpPPC64LoweredAtomicLoad64)
+		v.AuxInt = int64ToAuxInt(0)
+		v.AddArg2(ptr, mem)
+		return true
+	}
+}
 func rewriteValuePPC64_OpAtomicLoadPtr(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
@@ -1008,6 +1026,22 @@ func rewriteValuePPC64_OpAtomicStoreRel32(v *Value) bool {
 		return true
 	}
 }
+func rewriteValuePPC64_OpAtomicStoreRel64(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (AtomicStoreRel64 ptr val mem)
+	// result: (LoweredAtomicStore64 [0] ptr val mem)
+	for {
+		ptr := v_0
+		val := v_1
+		mem := v_2
+		v.reset(OpPPC64LoweredAtomicStore64)
+		v.AuxInt = int64ToAuxInt(0)
+		v.AddArg3(ptr, val, mem)
+		return true
+	}
+}
 func rewriteValuePPC64_OpAvg64u(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
diff --git a/src/cmd/go/internal/load/pkg.go b/src/cmd/go/internal/load/pkg.go
index 2bdc08ba36..c9665265e9 100644
--- a/src/cmd/go/internal/load/pkg.go
+++ b/src/cmd/go/internal/load/pkg.go
@@ -1337,6 +1337,11 @@ func disallowInternal(srcDir string, importer *Package, importerPath string, p *
 		return p
 	}
 
+	// Allow sync package to access lightweight atomic functions limited to the runtime.
+	if p.Standard && strings.HasPrefix(importerPath, "sync") && p.ImportPath == "runtime/internal/atomic" {
+		return p
+	}
+
 	// Internal is present.
 	// Map import path back to directory corresponding to parent of internal.
 	if i > 0 {
diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
index bcefff373f..7ebf675ac5 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -189,6 +189,9 @@ TEXT ·Store(SB), NOSPLIT, $0-8
 TEXT ·StoreRel(SB), NOSPLIT, $0-8
 	JMP	·Store(SB)
 
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-8
+	JMP	runtime∕internal∕atomic·Store(SB)
+
 // uint64 atomicload64(uint64 volatile* addr);
 TEXT ·Load64(SB), NOSPLIT, $0-12
 	NO_LOCAL_POINTERS
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
index 90c56424c9..80fb31285d 100644
--- a/src/runtime/internal/atomic/asm_amd64.s
+++ b/src/runtime/internal/atomic/asm_amd64.s
@@ -136,6 +136,12 @@ TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
 TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
 	JMP	runtime∕internal∕atomic·Store(SB)
 
+TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Store64(SB)
+
 TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
 	MOVQ	ptr+0(FP), BX
 	MOVB	val+8(FP), AX
diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s
index c3d1d9025d..274925ed60 100644
--- a/src/runtime/internal/atomic/asm_arm.s
+++ b/src/runtime/internal/atomic/asm_arm.s
@@ -57,6 +57,9 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-8
 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8
 	B	·Load(SB)
 
+TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-8
+	B 	·Load(SB)
+
 TEXT ·Casuintptr(SB),NOSPLIT,$0-13
 	B	·Cas(SB)
 
@@ -81,6 +84,9 @@ TEXT ·StorepNoWB(SB),NOSPLIT,$0-8
 TEXT ·StoreRel(SB),NOSPLIT,$0-8
 	B	·Store(SB)
 
+TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8
+	B	·Store(SB)
+
 TEXT ·Xadduintptr(SB),NOSPLIT,$0-12
 	B	·Xadd(SB)
 
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
index 3290fb726a..03fb822929 100644
--- a/src/runtime/internal/atomic/asm_mips64x.s
+++ b/src/runtime/internal/atomic/asm_mips64x.s
@@ -158,6 +158,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
 TEXT ·StoreRel(SB), NOSPLIT, $0-12
 	JMP	·Store(SB)
 
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
 TEXT ·Store(SB), NOSPLIT, $0-12
 	MOVV	ptr+0(FP), R1
 	MOVW	val+8(FP), R2
diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s
index 62811a6599..63bb548825 100644
--- a/src/runtime/internal/atomic/asm_mipsx.s
+++ b/src/runtime/internal/atomic/asm_mipsx.s
@@ -122,6 +122,9 @@ TEXT ·StorepNoWB(SB),NOSPLIT,$0-8
 TEXT ·StoreRel(SB),NOSPLIT,$0-8
 	JMP	·Store(SB)
 
+TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8
+	JMP	·Store(SB)
+
 // void	Or8(byte volatile*, byte);
 TEXT ·Or8(SB),NOSPLIT,$0-5
 	MOVW	ptr+0(FP), R1
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
index 06dc931bf4..c0237de4d0 100644
--- a/src/runtime/internal/atomic/asm_ppc64x.s
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -83,12 +83,18 @@ TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
 TEXT runtime∕internal∕atomic·Loaduintptr(SB),  NOSPLIT|NOFRAME, $0-16
 	BR	runtime∕internal∕atomic·Load64(SB)
 
+TEXT runtime∕internal∕atomic·LoadAcquintptr(SB),  NOSPLIT|NOFRAME, $0-16
+	BR	runtime∕internal∕atomic·LoadAcq64(SB)
+
 TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
 	BR	runtime∕internal∕atomic·Load64(SB)
 
 TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
 	BR	runtime∕internal∕atomic·Store64(SB)
 
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·StoreRel64(SB)
+
 TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
 	BR	runtime∕internal∕atomic·Xadd64(SB)
 
@@ -191,6 +197,13 @@ TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
 	MOVW	R4, 0(R3)
 	RET
 
+TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
+	MOVD	ptr+0(FP), R3
+	MOVD	val+8(FP), R4
+	LWSYNC
+	MOVD	R4, 0(R3)
+	RET
+
 // void runtime∕internal∕atomic·Or8(byte volatile*, byte);
 TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
 	MOVD	ptr+0(FP), R3
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
index 8d002ebfe3..06ce6a5356 100644
--- a/src/runtime/internal/atomic/atomic_386.go
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -30,6 +30,12 @@ func LoadAcq(ptr *uint32) uint32 {
 	return *ptr
 }
 
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+	return *ptr
+}
+
 //go:noescape
 func Xadd64(ptr *uint64, delta int64) uint64
 
@@ -83,5 +89,8 @@ func Store64(ptr *uint64, val uint64)
 //go:noescape
 func StoreRel(ptr *uint32, val uint32)
 
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
 // NO go:noescape annotation; see atomic_pointer.go.
 func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_amd64.go b/src/runtime/internal/atomic/atomic_amd64.go
index 14b8101720..1b71a16d94 100644
--- a/src/runtime/internal/atomic/atomic_amd64.go
+++ b/src/runtime/internal/atomic/atomic_amd64.go
@@ -35,6 +35,18 @@ func LoadAcq(ptr *uint32) uint32 {
 	return *ptr
 }
 
+//go:nosplit
+//go:noinline
+func LoadAcq64(ptr *uint64) uint64 {
+	return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+	return *ptr
+}
+
 //go:noescape
 func Xadd(ptr *uint32, delta int32) uint32
 
@@ -85,6 +97,12 @@ func Store64(ptr *uint64, val uint64)
 //go:noescape
 func StoreRel(ptr *uint32, val uint32)
 
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
 // StorepNoWB performs *ptr = val atomically and without a write
 // barrier.
 //
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
index 95713afcc1..67d529c1cb 100644
--- a/src/runtime/internal/atomic/atomic_arm.go
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -81,6 +81,9 @@ func Store(addr *uint32, v uint32)
 //go:noescape
 func StoreRel(addr *uint32, v uint32)
 
+//go:noescape
+func StoreReluintptr(addr *uintptr, v uintptr)
+
 //go:nosplit
 func goCas64(addr *uint64, old, new uint64) bool {
 	if uintptr(unsafe.Pointer(addr))&7 != 0 {
@@ -194,6 +197,9 @@ func Load8(addr *uint8) uint8
 //go:noescape
 func LoadAcq(addr *uint32) uint32
 
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
 //go:noescape
 func Cas64(addr *uint64, old, new uint64) bool
 
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
index 26ca94d54c..c9b4322fe9 100644
--- a/src/runtime/internal/atomic/atomic_arm64.go
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -41,6 +41,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
 //go:noescape
 func LoadAcq(addr *uint32) uint32
 
+//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
 //go:noescape
 func Or8(ptr *uint8, val uint8)
 
@@ -67,3 +73,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
 
 //go:noescape
 func StoreRel(ptr *uint32, val uint32)
+
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
index a2eb7568d2..36c7698b18 100644
--- a/src/runtime/internal/atomic/atomic_arm64.s
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -36,12 +36,26 @@ TEXT ·Loadp(SB),NOSPLIT,$0-16
 TEXT ·LoadAcq(SB),NOSPLIT,$0-12
 	B	·Load(SB)
 
+// uint64 runtime∕internal∕atomic·LoadAcquintptr(uint64 volatile* addr)
+TEXT ·LoadAcq64(SB),NOSPLIT,$0-16
+	B	·Load64(SB)
+
+// uintptr runtime∕internal∕atomic·LoadAcq64(uintptr volatile* addr)
+TEXT ·LoadAcquintptr(SB),NOSPLIT,$0-16
+	B	·Load64(SB)
+
 TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
 	B	runtime∕internal∕atomic·Store64(SB)
 
 TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
 	B	runtime∕internal∕atomic·Store(SB)
 
+TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Store64(SB)
+
 TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
 	MOVD	ptr+0(FP), R0
 	MOVW	val+8(FP), R1
diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go
index 1d9977850b..fca2242514 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.go
+++ b/src/runtime/internal/atomic/atomic_mips64x.go
@@ -41,6 +41,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
 //go:noescape
 func LoadAcq(ptr *uint32) uint32
 
+//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
 //go:noescape
 func And8(ptr *uint8, val uint8)
 
@@ -69,3 +75,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
 
 //go:noescape
 func StoreRel(ptr *uint32, val uint32)
+
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
diff --git a/src/runtime/internal/atomic/atomic_mips64x.s b/src/runtime/internal/atomic/atomic_mips64x.s
index 1ed90937c9..125c0c221c 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.s
+++ b/src/runtime/internal/atomic/atomic_mips64x.s
@@ -47,3 +47,11 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
 // uint32 runtime∕internal∕atomic·LoadAcq(uint32 volatile* ptr)
 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
 	JMP	atomic·Load(SB)
+
+// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr)
+TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16
+	JMP	atomic·Load64(SB)
+
+// uintptr runtime∕internal∕atomic·LoadAcquintptr(uintptr volatile* ptr)
+TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16
+	JMP	atomic·Load64(SB)
diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go
index b99bfe7dbf..be1e6a038b 100644
--- a/src/runtime/internal/atomic/atomic_mipsx.go
+++ b/src/runtime/internal/atomic/atomic_mipsx.go
@@ -132,6 +132,9 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
 //go:noescape
 func LoadAcq(ptr *uint32) uint32
 
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
 //go:noescape
 func And8(ptr *uint8, val uint8)
 
@@ -150,5 +153,8 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
 //go:noescape
 func StoreRel(ptr *uint32, val uint32)
 
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
 //go:noescape
 func CasRel(addr *uint32, old, new uint32) bool
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
index a48ecf5ee8..e759bb27a2 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.go
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -41,6 +41,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
 //go:noescape
 func LoadAcq(ptr *uint32) uint32
 
+//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
 //go:noescape
 func And8(ptr *uint8, val uint8)
 
@@ -67,5 +73,11 @@ func Store64(ptr *uint64, val uint64)
 //go:noescape
 func StoreRel(ptr *uint32, val uint32)
 
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
 // NO go:noescape annotation; see atomic_pointer.go.
 func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.s b/src/runtime/internal/atomic/atomic_ppc64x.s
index c2f696fb34..b79cdbca34 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.s
+++ b/src/runtime/internal/atomic/atomic_ppc64x.s
@@ -6,6 +6,15 @@
 
 #include "textflag.h"
 
+
+// For more details about how various memory models are
+// enforced on POWER, the following paper provides more
+// details about how they enforce C/C++ like models. This
+// gives context about why the strange looking code
+// sequences below work.
+//
+// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+
 // uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr)
 TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12
 	MOVD	ptr+0(FP), R3
@@ -56,5 +65,16 @@ TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$-8-12
 	MOVWZ  0(R3), R3
 	CMPW   R3, R3, CR7
 	BC     4, 30, 1(PC) // bne- cr7, 0x4
+	ISYNC
 	MOVW   R3, ret+8(FP)
 	RET
+
+// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr)
+TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$-8-16
+	MOVD   ptr+0(FP), R3
+	MOVD   0(R3), R3
+	CMP    R3, R3, CR7
+	BC     4, 30, 1(PC) // bne- cr7, 0x4
+	ISYNC
+	MOVD   R3, ret+8(FP)
+	RET
diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go
index d52512369e..617bc1a3eb 100644
--- a/src/runtime/internal/atomic/atomic_riscv64.go
+++ b/src/runtime/internal/atomic/atomic_riscv64.go
@@ -39,6 +39,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
 //go:noescape
 func LoadAcq(ptr *uint32) uint32
 
+//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
 //go:noescape
 func Or8(ptr *uint8, val uint8)
 
@@ -65,3 +71,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
 
 //go:noescape
 func StoreRel(ptr *uint32, val uint32)
+
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s
index d005325ca3..db139d690a 100644
--- a/src/runtime/internal/atomic/atomic_riscv64.s
+++ b/src/runtime/internal/atomic/atomic_riscv64.s
@@ -150,6 +150,12 @@ TEXT ·Xaddint64(SB),NOSPLIT,$0-24
 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
 	JMP	·Load(SB)
 
+TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16
+	JMP	·Load64(SB)
+
+TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16
+	JMP	·Load64(SB)
+
 // func Loadp(ptr unsafe.Pointer) unsafe.Pointer
 TEXT ·Loadp(SB),NOSPLIT,$0-16
 	JMP	·Load64(SB)
@@ -161,6 +167,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
 TEXT ·StoreRel(SB), NOSPLIT, $0-12
 	JMP	·Store(SB)
 
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
 // func Xchg(ptr *uint32, new uint32) uint32
 TEXT ·Xchg(SB), NOSPLIT, $0-20
 	MOV	ptr+0(FP), A0
diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go
index 4d73b39baf..b649caa39f 100644
--- a/src/runtime/internal/atomic/atomic_s390x.go
+++ b/src/runtime/internal/atomic/atomic_s390x.go
@@ -41,6 +41,18 @@ func LoadAcq(ptr *uint32) uint32 {
 	return *ptr
 }
 
+//go:nosplit
+//go:noinline
+func LoadAcq64(ptr *uint64) uint64 {
+	return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+	return *ptr
+}
+
 //go:noescape
 func Store(ptr *uint32, val uint32)
 
@@ -59,6 +71,18 @@ func StoreRel(ptr *uint32, val uint32) {
 	*ptr = val
 }
 
+//go:nosplit
+//go:noinline
+func StoreRel64(ptr *uint64, val uint64) {
+	*ptr = val
+}
+
+//go:nosplit
+//go:noinline
+func StoreReluintptr(ptr *uintptr, val uintptr) {
+	*ptr = val
+}
+
 //go:noescape
 func And8(ptr *uint8, val uint8)
 
diff --git a/src/runtime/internal/atomic/atomic_wasm.go b/src/runtime/internal/atomic/atomic_wasm.go
index 2c0c3a8174..60a4942884 100644
--- a/src/runtime/internal/atomic/atomic_wasm.go
+++ b/src/runtime/internal/atomic/atomic_wasm.go
@@ -45,6 +45,18 @@ func LoadAcq(ptr *uint32) uint32 {
 	return *ptr
 }
 
+//go:nosplit
+//go:noinline
+func LoadAcq64(ptr *uint64) uint64 {
+	return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+	return *ptr
+}
+
 //go:nosplit
 //go:noinline
 func Load8(ptr *uint8) uint8 {
@@ -141,6 +153,18 @@ func StoreRel(ptr *uint32, val uint32) {
 	*ptr = val
 }
 
+//go:nosplit
+//go:noinline
+func StoreRel64(ptr *uint64, val uint64) {
+	*ptr = val
+}
+
+//go:nosplit
+//go:noinline
+func StoreReluintptr(ptr *uintptr, val uintptr) {
+	*ptr = val
+}
+
 //go:nosplit
 //go:noinline
 func Store8(ptr *uint8, val uint8) {
diff --git a/src/sync/pool.go b/src/sync/pool.go
index ca7afdb12f..137413fdc4 100644
--- a/src/sync/pool.go
+++ b/src/sync/pool.go
@@ -7,6 +7,7 @@ package sync
 import (
 	"internal/race"
 	"runtime"
+	runtimeatomic "runtime/internal/atomic"
 	"sync/atomic"
 	"unsafe"
 )
@@ -152,8 +153,8 @@ func (p *Pool) Get() interface{} {
 
 func (p *Pool) getSlow(pid int) interface{} {
 	// See the comment in pin regarding ordering of the loads.
-	size := atomic.LoadUintptr(&p.localSize) // load-acquire
-	locals := p.local                        // load-consume
+	size := runtimeatomic.LoadAcquintptr(&p.localSize) // load-acquire
+	locals := p.local                                  // load-consume
 	// Try to steal one element from other procs.
 	for i := 0; i < int(size); i++ {
 		l := indexLocal(locals, (pid+i+1)%int(size))
@@ -165,7 +166,7 @@ func (p *Pool) getSlow(pid int) interface{} {
 	// Try the victim cache. We do this after attempting to steal
 	// from all primary caches because we want objects in the
 	// victim cache to age out if at all possible.
-	size = atomic.LoadUintptr(&p.victimSize)
+	size = runtimeatomic.Loaduintptr(&p.victimSize)
 	if uintptr(pid) >= size {
 		return nil
 	}
@@ -198,8 +199,8 @@ func (p *Pool) pin() (*poolLocal, int) {
 	// Since we've disabled preemption, GC cannot happen in between.
 	// Thus here we must observe local at least as large localSize.
 	// We can observe a newer/larger local, it is fine (we must observe its zero-initialized-ness).
-	s := atomic.LoadUintptr(&p.localSize) // load-acquire
-	l := p.local                          // load-consume
+	s := runtimeatomic.LoadAcquintptr(&p.localSize) // load-acquire
+	l := p.local                                    // load-consume
 	if uintptr(pid) < s {
 		return indexLocal(l, pid), pid
 	}
@@ -225,8 +226,8 @@ func (p *Pool) pinSlow() (*poolLocal, int) {
 	// If GOMAXPROCS changes between GCs, we re-allocate the array and lose the old one.
 	size := runtime.GOMAXPROCS(0)
 	local := make([]poolLocal, size)
-	atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release
-	atomic.StoreUintptr(&p.localSize, uintptr(size))         // store-release
+	atomic.StorePointer(&p.local, unsafe.Pointer(&local[0]))   // store-release
+	runtimeatomic.StoreReluintptr(&p.localSize, uintptr(size)) // store-release
 	return &local[pid], pid
 }
 
-- 
cgit v1.3


From ad61343f886cc5ce677e7bd62385144b2ba7b8f5 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 8 Oct 2020 14:38:39 -0400
Subject: runtime/internal/atomic: add 32-bit And/Or

These will be used in a following CL to perform larger bit clear and bit
set than And8/Or8.

Change-Id: I60f7b1099e29b69eb64add77564faee862880a8d
Reviewed-on: https://go-review.googlesource.com/c/go/+/260977
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Trust: Michael Pratt <mpratt@google.com>
---
 src/runtime/internal/atomic/asm_386.s         |  16 ++++
 src/runtime/internal/atomic/asm_amd64.s       |  16 ++++
 src/runtime/internal/atomic/asm_mips64x.s     |  26 ++++++
 src/runtime/internal/atomic/asm_mipsx.s       |  26 ++++++
 src/runtime/internal/atomic/asm_ppc64x.s      |  30 ++++++-
 src/runtime/internal/atomic/asm_s390x.s       |  22 ++++-
 src/runtime/internal/atomic/atomic_386.go     |   6 ++
 src/runtime/internal/atomic/atomic_amd64.go   |   6 ++
 src/runtime/internal/atomic/atomic_arm.go     |  20 +++++
 src/runtime/internal/atomic/atomic_arm64.go   |   6 ++
 src/runtime/internal/atomic/atomic_arm64.s    |  19 ++++
 src/runtime/internal/atomic/atomic_mips64x.go |   6 ++
 src/runtime/internal/atomic/atomic_mipsx.go   |   6 ++
 src/runtime/internal/atomic/atomic_ppc64x.go  |   6 ++
 src/runtime/internal/atomic/atomic_riscv64.go |   6 ++
 src/runtime/internal/atomic/atomic_riscv64.s  |  14 +++
 src/runtime/internal/atomic/atomic_s390x.go   |   6 ++
 src/runtime/internal/atomic/atomic_test.go    | 119 +++++++++++++++++++++++++-
 src/runtime/internal/atomic/atomic_wasm.go    |  12 +++
 src/runtime/internal/atomic/bench_test.go     |  40 +++++++++
 20 files changed, 400 insertions(+), 8 deletions(-)

(limited to 'src/runtime/internal/atomic/asm_386.s')

diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
index 7ebf675ac5..d82faef1f0 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -243,3 +243,19 @@ TEXT ·Store8(SB), NOSPLIT, $0-5
 	MOVB	val+4(FP), AX
 	XCHGB	AX, 0(BX)
 	RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), AX
+	MOVL	val+4(FP), BX
+	LOCK
+	ORL	BX, (AX)
+	RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), AX
+	MOVL	val+4(FP), BX
+	LOCK
+	ANDL	BX, (AX)
+	RET
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
index 80fb31285d..2cf7c55870 100644
--- a/src/runtime/internal/atomic/asm_amd64.s
+++ b/src/runtime/internal/atomic/asm_amd64.s
@@ -169,3 +169,19 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
 	LOCK
 	ANDB	BX, (AX)
 	RET
+
+// func Or(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12
+	MOVQ	ptr+0(FP), AX
+	MOVL	val+8(FP), BX
+	LOCK
+	ORL	BX, (AX)
+	RET
+
+// func And(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12
+	MOVQ	ptr+0(FP), AX
+	MOVL	val+8(FP), BX
+	LOCK
+	ANDL	BX, (AX)
+	RET
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
index 03fb822929..a515683ebb 100644
--- a/src/runtime/internal/atomic/asm_mips64x.s
+++ b/src/runtime/internal/atomic/asm_mips64x.s
@@ -243,3 +243,29 @@ TEXT ·And8(SB), NOSPLIT, $0-9
 	BEQ	R4, -4(PC)
 	SYNC
 	RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+	MOVV	ptr+0(FP), R1
+	MOVW	val+8(FP), R2
+
+	SYNC
+	LL	(R1), R3
+	OR	R2, R3
+	SC	R3, (R1)
+	BEQ	R3, -4(PC)
+	SYNC
+	RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+	MOVV	ptr+0(FP), R1
+	MOVW	val+8(FP), R2
+
+	SYNC
+	LL	(R1), R3
+	AND	R2, R3
+	SC	R3, (R1)
+	BEQ	R3, -4(PC)
+	SYNC
+	RET
diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s
index 63bb548825..2b2cfabe08 100644
--- a/src/runtime/internal/atomic/asm_mipsx.s
+++ b/src/runtime/internal/atomic/asm_mipsx.s
@@ -172,3 +172,29 @@ try_and8:
 	BEQ	R4, try_and8
 	SYNC
 	RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-8
+	MOVW	ptr+0(FP), R1
+	MOVW	val+4(FP), R2
+
+	SYNC
+	LL	(R1), R3
+	OR	R2, R3
+	SC	R3, (R1)
+	BEQ	R3, -4(PC)
+	SYNC
+	RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-8
+	MOVW	ptr+0(FP), R1
+	MOVW	val+4(FP), R2
+
+	SYNC
+	LL	(R1), R3
+	AND	R2, R3
+	SC	R3, (R1)
+	BEQ	R3, -4(PC)
+	SYNC
+	RET
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
index c0237de4d0..bb009ab34d 100644
--- a/src/runtime/internal/atomic/asm_ppc64x.s
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -222,8 +222,32 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
 	MOVBZ	val+8(FP), R4
 	LWSYNC
 again:
-	LBAR	(R3),R6
-	AND	R4,R6
-	STBCCC	R6,(R3)
+	LBAR	(R3), R6
+	AND	R4, R6
+	STBCCC	R6, (R3)
+	BNE	again
+	RET
+
+// func Or(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R3
+	MOVW	val+8(FP), R4
+	LWSYNC
+again:
+	LWAR	(R3), R6
+	OR	R4, R6
+	STWCCC	R6, (R3)
+	BNE	again
+	RET
+
+// func And(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R3
+	MOVW	val+8(FP), R4
+	LWSYNC
+again:
+	LWAR	(R3),R6
+	AND	R4, R6
+	STWCCC	R6, (R3)
 	BNE	again
 	RET
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s
index 9a19bc0ece..daf1f3cc9f 100644
--- a/src/runtime/internal/atomic/asm_s390x.s
+++ b/src/runtime/internal/atomic/asm_s390x.s
@@ -174,8 +174,8 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
 
 // func Or8(addr *uint8, v uint8)
 TEXT ·Or8(SB), NOSPLIT, $0-9
-	MOVD    ptr+0(FP), R3
-	MOVBZ   val+8(FP), R4
+	MOVD	ptr+0(FP), R3
+	MOVBZ	val+8(FP), R4
 	// We don't have atomic operations that work on individual bytes so we
 	// need to align addr down to a word boundary and create a mask
 	// containing v to OR with the entire word atomically.
@@ -188,8 +188,8 @@ TEXT ·Or8(SB), NOSPLIT, $0-9
 
 // func And8(addr *uint8, v uint8)
 TEXT ·And8(SB), NOSPLIT, $0-9
-	MOVD    ptr+0(FP), R3
-	MOVBZ   val+8(FP), R4
+	MOVD	ptr+0(FP), R3
+	MOVBZ	val+8(FP), R4
 	// We don't have atomic operations that work on individual bytes so we
 	// need to align addr down to a word boundary and create a mask
 	// containing v to AND with the entire word atomically.
@@ -200,3 +200,17 @@ TEXT ·And8(SB), NOSPLIT, $0-9
 	RLL	R5, R4, R4           // R4 = rotl(R4, R5)
 	LAN	R4, R6, 0(R3)        // R6 = *R3; *R3 &= R4; (atomic)
 	RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R3
+	MOVW	val+8(FP), R4
+	LAO	R4, R6, 0(R3)        // R6 = *R3; *R3 |= R4; (atomic)
+	RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R3
+	MOVW	val+8(FP), R4
+	LAN	R4, R6, 0(R3)        // R6 = *R3; *R3 &= R4; (atomic)
+	RET
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
index 06ce6a5356..1bfcb1143d 100644
--- a/src/runtime/internal/atomic/atomic_386.go
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -69,6 +69,12 @@ func And8(ptr *uint8, val uint8)
 //go:noescape
 func Or8(ptr *uint8, val uint8)
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 // NOTE: Do not add atomicxor8 (XOR is not idempotent).
 
 //go:noescape
diff --git a/src/runtime/internal/atomic/atomic_amd64.go b/src/runtime/internal/atomic/atomic_amd64.go
index 1b71a16d94..e36eb83a11 100644
--- a/src/runtime/internal/atomic/atomic_amd64.go
+++ b/src/runtime/internal/atomic/atomic_amd64.go
@@ -77,6 +77,12 @@ func And8(ptr *uint8, val uint8)
 //go:noescape
 func Or8(ptr *uint8, val uint8)
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 // NOTE: Do not add atomicxor8 (XOR is not idempotent).
 
 //go:noescape
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
index 67d529c1cb..546b3d6120 100644
--- a/src/runtime/internal/atomic/atomic_arm.go
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -182,6 +182,26 @@ func And8(addr *uint8, v uint8) {
 	}
 }
 
+//go:nosplit
+func Or(addr *uint32, v uint32) {
+	for {
+		old := *addr
+		if Cas(addr, old, old|v) {
+			return
+		}
+	}
+}
+
+//go:nosplit
+func And(addr *uint32, v uint32) {
+	for {
+		old := *addr
+		if Cas(addr, old, old&v) {
+			return
+		}
+	}
+}
+
 //go:nosplit
 func armcas(ptr *uint32, old, new uint32) bool
 
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
index c9b4322fe9..d49bee8936 100644
--- a/src/runtime/internal/atomic/atomic_arm64.go
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -53,6 +53,12 @@ func Or8(ptr *uint8, val uint8)
 //go:noescape
 func And8(ptr *uint8, val uint8)
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 //go:noescape
 func Cas64(ptr *uint64, old, new uint64) bool
 
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
index 36c7698b18..0cf3c40223 100644
--- a/src/runtime/internal/atomic/atomic_arm64.s
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -164,3 +164,22 @@ TEXT ·Or8(SB), NOSPLIT, $0-9
 	CBNZ	R3, -3(PC)
 	RET
 
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R0
+	MOVW	val+8(FP), R1
+	LDAXRW	(R0), R2
+	AND	R1, R2
+	STLXRW	R2, (R0), R3
+	CBNZ	R3, -3(PC)
+	RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R0
+	MOVW	val+8(FP), R1
+	LDAXRW	(R0), R2
+	ORR	R1, R2
+	STLXRW	R2, (R0), R3
+	CBNZ	R3, -3(PC)
+	RET
diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go
index fca2242514..b0109d72b0 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.go
+++ b/src/runtime/internal/atomic/atomic_mips64x.go
@@ -55,6 +55,12 @@ func Or8(ptr *uint8, val uint8)
 
 // NOTE: Do not add atomicxor8 (XOR is not idempotent).
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 //go:noescape
 func Cas64(ptr *uint64, old, new uint64) bool
 
diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go
index be1e6a038b..1336b50121 100644
--- a/src/runtime/internal/atomic/atomic_mipsx.go
+++ b/src/runtime/internal/atomic/atomic_mipsx.go
@@ -141,6 +141,12 @@ func And8(ptr *uint8, val uint8)
 //go:noescape
 func Or8(ptr *uint8, val uint8)
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 //go:noescape
 func Store(ptr *uint32, val uint32)
 
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
index e759bb27a2..e4b109f0ec 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.go
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -55,6 +55,12 @@ func Or8(ptr *uint8, val uint8)
 
 // NOTE: Do not add atomicxor8 (XOR is not idempotent).
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 //go:noescape
 func Cas64(ptr *uint64, old, new uint64) bool
 
diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go
index 617bc1a3eb..8f24d61625 100644
--- a/src/runtime/internal/atomic/atomic_riscv64.go
+++ b/src/runtime/internal/atomic/atomic_riscv64.go
@@ -51,6 +51,12 @@ func Or8(ptr *uint8, val uint8)
 //go:noescape
 func And8(ptr *uint8, val uint8)
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 //go:noescape
 func Cas64(ptr *uint64, old, new uint64) bool
 
diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s
index db139d690a..74c896cea6 100644
--- a/src/runtime/internal/atomic/atomic_riscv64.s
+++ b/src/runtime/internal/atomic/atomic_riscv64.s
@@ -242,3 +242,17 @@ TEXT ·Or8(SB), NOSPLIT, $0-9
 	SLL	A2, A1
 	AMOORW	A1, (A0), ZERO
 	RET
+
+// func And(ptr *uint32, val uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+	MOV	ptr+0(FP), A0
+	MOVW	val+8(FP), A1
+	AMOANDW	A1, (A0), ZERO
+	RET
+
+// func Or(ptr *uint32, val uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+	MOV	ptr+0(FP), A0
+	MOVW	val+8(FP), A1
+	AMOORW	A1, (A0), ZERO
+	RET
diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go
index b649caa39f..a058d60102 100644
--- a/src/runtime/internal/atomic/atomic_s390x.go
+++ b/src/runtime/internal/atomic/atomic_s390x.go
@@ -91,6 +91,12 @@ func Or8(ptr *uint8, val uint8)
 
 // NOTE: Do not add atomicxor8 (XOR is not idempotent).
 
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
 //go:noescape
 func Xadd(ptr *uint32, delta int32) uint32
 
diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go
index a9f95077c0..c9c2eba248 100644
--- a/src/runtime/internal/atomic/atomic_test.go
+++ b/src/runtime/internal/atomic/atomic_test.go
@@ -150,6 +150,45 @@ func TestAnd8(t *testing.T) {
 	}
 }
 
+func TestAnd(t *testing.T) {
+	// Basic sanity check.
+	x := uint32(0xffffffff)
+	for i := uint32(0); i < 32; i++ {
+		atomic.And(&x, ^(1 << i))
+		if r := uint32(0xffffffff) << (i + 1); x != r {
+			t.Fatalf("clearing bit %#x: want %#x, got %#x", uint32(1<<i), r, x)
+		}
+	}
+
+	// Set every bit in array to 1.
+	a := make([]uint32, 1<<12)
+	for i := range a {
+		a[i] = 0xffffffff
+	}
+
+	// Clear array bit-by-bit in different goroutines.
+	done := make(chan bool)
+	for i := 0; i < 32; i++ {
+		m := ^uint32(1 << i)
+		go func() {
+			for i := range a {
+				atomic.And(&a[i], m)
+			}
+			done <- true
+		}()
+	}
+	for i := 0; i < 32; i++ {
+		<-done
+	}
+
+	// Check that the array has been totally cleared.
+	for i, v := range a {
+		if v != 0 {
+			t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v)
+		}
+	}
+}
+
 func TestOr8(t *testing.T) {
 	// Basic sanity check.
 	x := uint8(0)
@@ -186,7 +225,43 @@ func TestOr8(t *testing.T) {
 	}
 }
 
-func TestBitwiseContended(t *testing.T) {
+func TestOr(t *testing.T) {
+	// Basic sanity check.
+	x := uint32(0)
+	for i := uint32(0); i < 32; i++ {
+		atomic.Or(&x, 1<<i)
+		if r := (uint32(1) << (i + 1)) - 1; x != r {
+			t.Fatalf("setting bit %#x: want %#x, got %#x", uint32(1)<<i, r, x)
+		}
+	}
+
+	// Start with every bit in array set to 0.
+	a := make([]uint32, 1<<12)
+
+	// Set every bit in array bit-by-bit in different goroutines.
+	done := make(chan bool)
+	for i := 0; i < 32; i++ {
+		m := uint32(1 << i)
+		go func() {
+			for i := range a {
+				atomic.Or(&a[i], m)
+			}
+			done <- true
+		}()
+	}
+	for i := 0; i < 32; i++ {
+		<-done
+	}
+
+	// Check that the array has been totally set.
+	for i, v := range a {
+		if v != 0xffffffff {
+			t.Fatalf("a[%v] not fully set: want %#x, got %#x", i, uint32(0xffffffff), v)
+		}
+	}
+}
+
+func TestBitwiseContended8(t *testing.T) {
 	// Start with every bit in array set to 0.
 	a := make([]uint8, 16)
 
@@ -228,6 +303,48 @@ func TestBitwiseContended(t *testing.T) {
 	}
 }
 
+func TestBitwiseContended(t *testing.T) {
+	// Start with every bit in array set to 0.
+	a := make([]uint32, 16)
+
+	// Iterations to try.
+	N := 1 << 16
+	if testing.Short() {
+		N = 1 << 10
+	}
+
+	// Set and then clear every bit in the array bit-by-bit in different goroutines.
+	done := make(chan bool)
+	for i := 0; i < 32; i++ {
+		m := uint32(1 << i)
+		go func() {
+			for n := 0; n < N; n++ {
+				for i := range a {
+					atomic.Or(&a[i], m)
+					if atomic.Load(&a[i])&m != m {
+						t.Errorf("a[%v] bit %#x not set", i, m)
+					}
+					atomic.And(&a[i], ^m)
+					if atomic.Load(&a[i])&m != 0 {
+						t.Errorf("a[%v] bit %#x not clear", i, m)
+					}
+				}
+			}
+			done <- true
+		}()
+	}
+	for i := 0; i < 32; i++ {
+		<-done
+	}
+
+	// Check that the array has been totally cleared.
+	for i, v := range a {
+		if v != 0 {
+			t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v)
+		}
+	}
+}
+
 func TestStorepNoWB(t *testing.T) {
 	var p [2]*int
 	for i := range p {
diff --git a/src/runtime/internal/atomic/atomic_wasm.go b/src/runtime/internal/atomic/atomic_wasm.go
index 60a4942884..b05d98ed51 100644
--- a/src/runtime/internal/atomic/atomic_wasm.go
+++ b/src/runtime/internal/atomic/atomic_wasm.go
@@ -131,6 +131,18 @@ func Or8(ptr *uint8, val uint8) {
 
 // NOTE: Do not add atomicxor8 (XOR is not idempotent).
 
+//go:nosplit
+//go:noinline
+func And(ptr *uint32, val uint32) {
+	*ptr = *ptr & val
+}
+
+//go:nosplit
+//go:noinline
+func Or(ptr *uint32, val uint32) {
+	*ptr = *ptr | val
+}
+
 //go:nosplit
 //go:noinline
 func Cas64(ptr *uint64, old, new uint64) bool {
diff --git a/src/runtime/internal/atomic/bench_test.go b/src/runtime/internal/atomic/bench_test.go
index de71b0f2c7..434aa6d434 100644
--- a/src/runtime/internal/atomic/bench_test.go
+++ b/src/runtime/internal/atomic/bench_test.go
@@ -51,6 +51,14 @@ func BenchmarkAnd8(b *testing.B) {
 	}
 }
 
+func BenchmarkAnd(b *testing.B) {
+	var x [128]uint32 // give x its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.And(&x[63], uint32(i))
+	}
+}
+
 func BenchmarkAnd8Parallel(b *testing.B) {
 	var x [512]uint8 // give byte its own cache line
 	sink = &x
@@ -63,6 +71,18 @@ func BenchmarkAnd8Parallel(b *testing.B) {
 	})
 }
 
+func BenchmarkAndParallel(b *testing.B) {
+	var x [128]uint32 // give x its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint32(0)
+		for pb.Next() {
+			atomic.And(&x[63], i)
+			i++
+		}
+	})
+}
+
 func BenchmarkOr8(b *testing.B) {
 	var x [512]uint8 // give byte its own cache line
 	sink = &x
@@ -71,6 +91,14 @@ func BenchmarkOr8(b *testing.B) {
 	}
 }
 
+func BenchmarkOr(b *testing.B) {
+	var x [128]uint32 // give x its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.Or(&x[63], uint32(i))
+	}
+}
+
 func BenchmarkOr8Parallel(b *testing.B) {
 	var x [512]uint8 // give byte its own cache line
 	sink = &x
@@ -83,6 +111,18 @@ func BenchmarkOr8Parallel(b *testing.B) {
 	})
 }
 
+func BenchmarkOrParallel(b *testing.B) {
+	var x [128]uint32 // give x its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint32(0)
+		for pb.Next() {
+			atomic.Or(&x[63], i)
+			i++
+		}
+	})
+}
+
 func BenchmarkXadd(b *testing.B) {
 	var x uint32
 	ptr := &x
-- 
cgit v1.3