From 67faca7d9c54b367aee5fdeef2d5dd609fcf99d0 Mon Sep 17 00:00:00 2001
From: Michael Matloob <matloob@golang.org>
Date: Mon, 2 Nov 2015 14:09:24 -0500
Subject: runtime: break atomics out into package runtime/internal/atomic

This change breaks out most of the atomics functions in the runtime
into package runtime/internal/atomic. It adds some basic support
in the toolchain for runtime packages, and also modifies linux/arm
atomics to remove the dependency on the runtime's mutex. The mutexes
have been replaced with spinlocks.

All trybots are happy!
In addition to the trybots, I've tested on the darwin/arm64 builder,
on the darwin/arm builder, and on a ppc64le machine.

Change-Id: I6698c8e3cf3834f55ce5824059f44d00dc8e3c2f
Reviewed-on: https://go-review.googlesource.com/14204
Run-TryBot: Michael Matloob <matloob@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
---
 src/runtime/asm_386.s                         | 150 -----------------
 src/runtime/asm_amd64.s                       | 138 ----------------
 src/runtime/asm_amd64p32.s                    | 139 ----------------
 src/runtime/asm_arm.s                         |  57 -------
 src/runtime/asm_arm64.s                       |  46 +-----
 src/runtime/asm_ppc64x.s                      | 208 ------------------------
 src/runtime/atomic_386.go                     |  79 ---------
 src/runtime/atomic_amd64x.go                  |  69 --------
 src/runtime/atomic_arm.go                     | 165 -------------------
 src/runtime/atomic_arm64.go                   |  79 ---------
 src/runtime/atomic_arm64.s                    | 108 -------------
 src/runtime/atomic_pointer.go                 |  11 +-
 src/runtime/atomic_ppc64x.go                  |  57 -------
 src/runtime/atomic_ppc64x.s                   |  33 ----
 src/runtime/atomic_test.go                    |  66 --------
 src/runtime/chan.go                           |  11 +-
 src/runtime/cpuprof.go                        |  13 +-
 src/runtime/debug.go                          |   7 +-
 src/runtime/export_test.go                    |   7 +-
 src/runtime/hashmap.go                        |  19 +--
 src/runtime/hashmap_fast.go                   |  33 ++--
 src/runtime/iface.go                          |   9 +-
 src/runtime/internal/atomic/arch1_386.go      |   9 ++
 src/runtime/internal/atomic/arch1_amd64.go    |   9 ++
 src/runtime/internal/atomic/arch1_amd64p32.go |  14 ++
 src/runtime/internal/atomic/arch1_arm.go      |   9 ++
 src/runtime/internal/atomic/arch1_arm64.go    |   9 ++
 src/runtime/internal/atomic/arch1_ppc64.go    |   9 ++
 src/runtime/internal/atomic/arch1_ppc64le.go  |   9 ++
 src/runtime/internal/atomic/asm.s             |   8 +
 src/runtime/internal/atomic/asm_386.s         | 166 +++++++++++++++++++
 src/runtime/internal/atomic/asm_amd64.s       | 150 +++++++++++++++++
 src/runtime/internal/atomic/asm_amd64p32.s    | 150 +++++++++++++++++
 src/runtime/internal/atomic/asm_arm.s         |  71 ++++++++
 src/runtime/internal/atomic/asm_arm64.s       |  58 +++++++
 src/runtime/internal/atomic/asm_ppc64x.s      | 225 ++++++++++++++++++++++++++
 src/runtime/internal/atomic/atomic_386.go     |  80 +++++++++
 src/runtime/internal/atomic/atomic_amd64x.go  |  69 ++++++++
 src/runtime/internal/atomic/atomic_arm.go     | 180 +++++++++++++++++++++
 src/runtime/internal/atomic/atomic_arm64.go   |  80 +++++++++
 src/runtime/internal/atomic/atomic_arm64.s    | 113 +++++++++++++
 src/runtime/internal/atomic/atomic_ppc64x.go  |  56 +++++++
 src/runtime/internal/atomic/atomic_ppc64x.s   |  40 +++++
 src/runtime/internal/atomic/atomic_test.go    |  67 ++++++++
 src/runtime/internal/atomic/stubs.go          |  35 ++++
 src/runtime/internal/atomic/sys_darwin_arm.s  |  11 ++
 src/runtime/internal/atomic/sys_freebsd_arm.s |  19 +++
 src/runtime/internal/atomic/sys_linux_arm.s   |  42 +++++
 src/runtime/internal/atomic/sys_nacl_arm.s    |  16 ++
 src/runtime/internal/atomic/sys_netbsd_arm.s  |  21 +++
 src/runtime/internal/atomic/sys_openbsd_arm.s |  11 ++
 src/runtime/internal/atomic/textflag.h        |  30 ++++
 src/runtime/lfstack.go                        |  15 +-
 src/runtime/lock_futex.go                     |  27 ++--
 src/runtime/lock_sema.go                      |  33 ++--
 src/runtime/mbitmap.go                        |  23 +--
 src/runtime/mcentral.go                       |  10 +-
 src/runtime/mfinal.go                         |   7 +-
 src/runtime/mgc.go                            |  53 +++---
 src/runtime/mgcmark.go                        |  37 +++--
 src/runtime/mgcsweep.go                       |  23 +--
 src/runtime/mgcwork.go                        |  15 +-
 src/runtime/mheap.go                          |  11 +-
 src/runtime/mprof.go                          |   7 +-
 src/runtime/mstats.go                         |  13 +-
 src/runtime/netpoll.go                        |  17 +-
 src/runtime/os1_netbsd.go                     |  11 +-
 src/runtime/os1_openbsd.go                    |  11 +-
 src/runtime/os1_plan9.go                      |   7 +-
 src/runtime/os1_windows.go                    |  11 +-
 src/runtime/panic.go                          |   9 +-
 src/runtime/parfor.go                         |  30 ++--
 src/runtime/proc.go                           | 171 ++++++++++----------
 src/runtime/runtime.go                        |   9 +-
 src/runtime/runtime1.go                       |  31 ++--
 src/runtime/runtime2.go                       |   7 +-
 src/runtime/sema.go                           |  21 +--
 src/runtime/sigqueue.go                       |  21 +--
 src/runtime/stack.go                          |   7 +-
 src/runtime/string.go                         |   5 +-
 src/runtime/stubs.go                          |  36 -----
 src/runtime/sys_darwin_arm.s                  |   6 -
 src/runtime/sys_freebsd_arm.s                 |  14 --
 src/runtime/sys_linux_arm.s                   |  29 ----
 src/runtime/sys_nacl_arm.s                    |  15 --
 src/runtime/sys_netbsd_arm.s                  |  14 --
 src/runtime/sys_openbsd_arm.s                 |   7 -
 src/runtime/trace.go                          |  13 +-
 88 files changed, 2199 insertions(+), 1847 deletions(-)
 delete mode 100644 src/runtime/atomic_386.go
 delete mode 100644 src/runtime/atomic_amd64x.go
 delete mode 100644 src/runtime/atomic_arm.go
 delete mode 100644 src/runtime/atomic_arm64.go
 delete mode 100644 src/runtime/atomic_ppc64x.go
 delete mode 100644 src/runtime/atomic_test.go
 create mode 100644 src/runtime/internal/atomic/arch1_386.go
 create mode 100644 src/runtime/internal/atomic/arch1_amd64.go
 create mode 100644 src/runtime/internal/atomic/arch1_amd64p32.go
 create mode 100644 src/runtime/internal/atomic/arch1_arm.go
 create mode 100644 src/runtime/internal/atomic/arch1_arm64.go
 create mode 100644 src/runtime/internal/atomic/arch1_ppc64.go
 create mode 100644 src/runtime/internal/atomic/arch1_ppc64le.go
 create mode 100644 src/runtime/internal/atomic/asm.s
 create mode 100644 src/runtime/internal/atomic/asm_386.s
 create mode 100644 src/runtime/internal/atomic/asm_amd64.s
 create mode 100644 src/runtime/internal/atomic/asm_amd64p32.s
 create mode 100644 src/runtime/internal/atomic/asm_arm.s
 create mode 100644 src/runtime/internal/atomic/asm_arm64.s
 create mode 100644 src/runtime/internal/atomic/asm_ppc64x.s
 create mode 100644 src/runtime/internal/atomic/atomic_386.go
 create mode 100644 src/runtime/internal/atomic/atomic_amd64x.go
 create mode 100644 src/runtime/internal/atomic/atomic_arm.go
 create mode 100644 src/runtime/internal/atomic/atomic_arm64.go
 create mode 100644 src/runtime/internal/atomic/atomic_arm64.s
 create mode 100644 src/runtime/internal/atomic/atomic_ppc64x.go
 create mode 100644 src/runtime/internal/atomic/atomic_ppc64x.s
 create mode 100644 src/runtime/internal/atomic/atomic_test.go
 create mode 100644 src/runtime/internal/atomic/stubs.go
 create mode 100644 src/runtime/internal/atomic/sys_darwin_arm.s
 create mode 100644 src/runtime/internal/atomic/sys_freebsd_arm.s
 create mode 100644 src/runtime/internal/atomic/sys_linux_arm.s
 create mode 100644 src/runtime/internal/atomic/sys_nacl_arm.s
 create mode 100644 src/runtime/internal/atomic/sys_netbsd_arm.s
 create mode 100644 src/runtime/internal/atomic/sys_openbsd_arm.s
 create mode 100644 src/runtime/internal/atomic/textflag.h

(limited to 'src/runtime')

diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index d423df7924..effa661acd 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -469,93 +469,6 @@ CALLFN(·call268435456, 268435456)
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-13
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+12(FP)
-	RET
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-13
-	JMP	runtime·cas(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-8
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-8
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-8
-	JMP	runtime·atomicstore(SB)
-
-// bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-21
-	MOVL	ptr+0(FP), BP
-	MOVL	old_lo+4(FP), AX
-	MOVL	old_hi+8(FP), DX
-	MOVL	new_lo+12(FP), BX
-	MOVL	new_hi+16(FP), CX
-	LOCK
-	CMPXCHG8B	0(BP)
-	SETEQ	ret+20(FP)
-	RET
-
-// bool casp(void **p, void *old, void *new)
-// Atomically:
-//	if(*p == old){
-//		*p = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-13
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+12(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	delta+4(FP), AX
-	MOVL	AX, CX
-	LOCK
-	XADDL	AX, 0(BX)
-	ADDL	CX, AX
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	new+4(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·xchg(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVL	cycles+0(FP), AX
 again:
@@ -564,69 +477,6 @@ again:
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-// uint64 atomicload64(uint64 volatile* addr);
-TEXT runtime·atomicload64(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), AX
-	TESTL	$7, AX
-	JZ	2(PC)
-	MOVL	0, AX // crash with nil ptr deref
-	LEAL	ret_lo+4(FP), BX
-	// MOVQ (%EAX), %MM0
-	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
-	// MOVQ %MM0, 0(%EBX)
-	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
-	// EMMS
-	BYTE $0x0F; BYTE $0x77
-	RET
-
-// void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), AX
-	TESTL	$7, AX
-	JZ	2(PC)
-	MOVL	0, AX // crash with nil ptr deref
-	// MOVQ and EMMS were introduced on the Pentium MMX.
-	// MOVQ 0x8(%ESP), %MM0
-	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
-	// MOVQ %MM0, (%EAX)
-	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
-	// EMMS
-	BYTE $0x0F; BYTE $0x77
-	// This is essentially a no-op, but it provides required memory fencing.
-	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
-	MOVL	$0, AX
-	LOCK
-	XADDL	AX, (SP)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), AX
-	MOVB	val+4(FP), BX
-	LOCK
-	ORB	BX, (AX)
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), AX
-	MOVB	val+4(FP), BX
-	LOCK
-	ANDB	BX, (AX)
-	RET
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
 	// Stores are already ordered on x86, so this is just a
 	// compile barrier.
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 68b342d4db..2f8940a678 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -495,111 +495,6 @@ CALLFN(·call268435456, 268435456)
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVQ	ptr+0(FP), BX
-	MOVL	old+8(FP), AX
-	MOVL	new+12(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-// bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVQ	ptr+0(FP), BX
-	MOVQ	old+8(FP), AX
-	MOVQ	new+16(FP), CX
-	LOCK
-	CMPXCHGQ	CX, 0(BX)
-	SETEQ	ret+24(FP)
-	RET
-	
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
-	JMP	runtime·cas64(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
-	JMP	runtime·atomicload64(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
-	JMP	runtime·atomicload64(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
-	JMP	runtime·atomicstore64(SB)
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-25
-	MOVQ	ptr+0(FP), BX
-	MOVQ	old+8(FP), AX
-	MOVQ	new+16(FP), CX
-	LOCK
-	CMPXCHGQ	CX, 0(BX)
-	SETEQ	ret+24(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-20
-	MOVQ	ptr+0(FP), BX
-	MOVL	delta+8(FP), AX
-	MOVL	AX, CX
-	LOCK
-	XADDL	AX, 0(BX)
-	ADDL	CX, AX
-	MOVL	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-	MOVQ	ptr+0(FP), BX
-	MOVQ	delta+8(FP), AX
-	MOVQ	AX, CX
-	LOCK
-	XADDQ	AX, 0(BX)
-	ADDQ	CX, AX
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
-	JMP	runtime·xadd64(SB)
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-20
-	MOVQ	ptr+0(FP), BX
-	MOVL	new+8(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-	MOVQ	ptr+0(FP), BX
-	MOVQ	new+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
-	JMP	runtime·xchg64(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVL	cycles+0(FP), AX
 again:
@@ -608,39 +503,6 @@ again:
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
-	MOVQ	ptr+0(FP), BX
-	MOVQ	val+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
-	MOVQ	ptr+0(FP), BX
-	MOVL	val+8(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVQ	ptr+0(FP), BX
-	MOVQ	val+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
-	MOVQ	ptr+0(FP), AX
-	MOVB	val+8(FP), BX
-	LOCK
-	ORB	BX, (AX)
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
-	MOVQ	ptr+0(FP), AX
-	MOVB	val+8(FP), BX
-	LOCK
-	ANDB	BX, (AX)
-	RET
 
 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
 	// Stores are already ordered on x86, so this is just a
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index 78033e1b82..aeb9d131b2 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -415,111 +415,6 @@ CALLFN(·call268435456, 268435456)
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-17
-	JMP	runtime·cas(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-12
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·atomicstore(SB)
-
-// bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVL	ptr+0(FP), BX
-	MOVQ	old+8(FP), AX
-	MOVQ	new+16(FP), CX
-	LOCK
-	CMPXCHGQ	CX, 0(BX)
-	SETEQ	ret+24(FP)
-	RET
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-17
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	delta+4(FP), AX
-	MOVL	AX, CX
-	LOCK
-	XADDL	AX, 0(BX)
-	ADDL	CX, AX
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-	MOVL	ptr+0(FP), BX
-	MOVQ	delta+8(FP), AX
-	MOVQ	AX, CX
-	LOCK
-	XADDQ	AX, 0(BX)
-	ADDQ	CX, AX
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xadduintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·xadd(SB)
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	new+4(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-	MOVL	ptr+0(FP), BX
-	MOVQ	new+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·xchg(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVL	cycles+0(FP), AX
 again:
@@ -528,40 +423,6 @@ again:
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVL	ptr+0(FP), BX
-	MOVQ	val+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), BX
-	MOVB	val+4(FP), AX
-	LOCK
-	ORB	AX, 0(BX)
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), BX
-	MOVB	val+4(FP), AX
-	LOCK
-	ANDB	AX, 0(BX)
-	RET
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
 	// Stores are already ordered on x86, so this is just a
 	// compile barrier.
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 62f2a276e7..48fc321df3 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -695,63 +695,6 @@ TEXT runtime·abort(SB),NOSPLIT,$-4-0
 	MOVW	$0, R0
 	MOVW	(R0), R1
 
-// bool armcas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-//
-// To implement runtime·cas in sys_$GOOS_arm.s
-// using the native instructions, use:
-//
-//	TEXT runtime·cas(SB),NOSPLIT,$0
-//		B	runtime·armcas(SB)
-//
-TEXT runtime·armcas(SB),NOSPLIT,$0-13
-	MOVW	valptr+0(FP), R1
-	MOVW	old+4(FP), R2
-	MOVW	new+8(FP), R3
-casl:
-	LDREX	(R1), R0
-	CMP	R0, R2
-	BNE	casfail
-
-	MOVB	runtime·goarm(SB), R11
-	CMP	$7, R11
-	BLT	2(PC)
-	WORD	$0xf57ff05a	// dmb ishst
-
-	STREX	R3, (R1), R0
-	CMP	$0, R0
-	BNE	casl
-	MOVW	$1, R0
-
-	MOVB	runtime·goarm(SB), R11
-	CMP	$7, R11
-	BLT	2(PC)
-	WORD	$0xf57ff05b	// dmb ish
-
-	MOVB	R0, ret+12(FP)
-	RET
-casfail:
-	MOVW	$0, R0
-	MOVB	R0, ret+12(FP)
-	RET
-
-TEXT runtime·casuintptr(SB),NOSPLIT,$0-13
-	B	runtime·cas(SB)
-
-TEXT runtime·atomicloaduintptr(SB),NOSPLIT,$0-8
-	B	runtime·atomicload(SB)
-
-TEXT runtime·atomicloaduint(SB),NOSPLIT,$0-8
-	B	runtime·atomicload(SB)
-
-TEXT runtime·atomicstoreuintptr(SB),NOSPLIT,$0-8
-	B	runtime·atomicstore(SB)
-
 // armPublicationBarrier is a native store/store barrier for ARMv7+.
 // On earlier ARM revisions, armPublicationBarrier is a no-op.
 // This will not work on SMP ARMv6 machines, if any are in use.
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 80309868f4..33755b35c0 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -453,40 +453,6 @@ CALLFN(·call268435456, 268435464 )
 CALLFN(·call536870912, 536870920 )
 CALLFN(·call1073741824, 1073741832 )
 
-// bool cas(uint32 *ptr, uint32 old, uint32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVD	ptr+0(FP), R0
-	MOVW	old+8(FP), R1
-	MOVW	new+12(FP), R2
-again:
-	LDAXRW	(R0), R3
-	CMPW	R1, R3
-	BNE	ok
-	STLXRW	R2, (R0), R3
-	CBNZ	R3, again
-ok:
-	CSET	EQ, R0
-	MOVB	R0, ret+16(FP)
-	RET
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
-	B	runtime·cas64(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $-8-16
-	B	runtime·atomicload64(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $-8-16
-	B	runtime·atomicload64(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
-	B	runtime·atomicstore64(SB)
-
 // AES hashing not implemented for ARM64, issue #10109.
 TEXT runtime·aeshash(SB),NOSPLIT,$-8-0
 	MOVW	$0, R0
@@ -500,17 +466,7 @@ TEXT runtime·aeshash64(SB),NOSPLIT,$-8-0
 TEXT runtime·aeshashstr(SB),NOSPLIT,$-8-0
 	MOVW	$0, R0
 	MOVW	(R0), R1
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-25
-	B runtime·cas64(SB)
-
+	
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVWU	cycles+0(FP), R0
 again:
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 1946cc5c47..6f1191be02 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -453,217 +453,9 @@ CALLFN(·call268435456, 268435456)
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(uint32 *ptr, uint32 old, uint32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVD	ptr+0(FP), R3
-	MOVWZ	old+8(FP), R4
-	MOVWZ	new+12(FP), R5
-cas_again:
-	SYNC
-	LWAR	(R3), R6
-	CMPW	R6, R4
-	BNE	cas_fail
-	STWCCC	R5, (R3)
-	BNE	cas_again
-	MOVD	$1, R3
-	SYNC
-	ISYNC
-	MOVB	R3, ret+16(FP)
-	RET
-cas_fail:
-	MOVD	$0, R3
-	BR	-5(PC)
-
-// bool	runtime·cas64(uint64 *ptr, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVD	ptr+0(FP), R3
-	MOVD	old+8(FP), R4
-	MOVD	new+16(FP), R5
-cas64_again:
-	SYNC
-	LDAR	(R3), R6
-	CMP	R6, R4
-	BNE	cas64_fail
-	STDCCC	R5, (R3)
-	BNE	cas64_again
-	MOVD	$1, R3
-	SYNC
-	ISYNC
-	MOVB	R3, ret+24(FP)
-	RET
-cas64_fail:
-	MOVD	$0, R3
-	BR	-5(PC)
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
-	BR	runtime·cas64(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT|NOFRAME, $0-16
-	BR	runtime·atomicload64(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT|NOFRAME, $0-16
-	BR	runtime·atomicload64(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
-	BR	runtime·atomicstore64(SB)
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-25
-	BR runtime·cas64(SB)
-
-// uint32 xadd(uint32 volatile *ptr, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-20
-	MOVD	ptr+0(FP), R4
-	MOVW	delta+8(FP), R5
-	SYNC
-	LWAR	(R4), R3
-	ADD	R5, R3
-	STWCCC	R3, (R4)
-	BNE	-4(PC)
-	SYNC
-	ISYNC
-	MOVW	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-	MOVD	ptr+0(FP), R4
-	MOVD	delta+8(FP), R5
-	SYNC
-	LDAR	(R4), R3
-	ADD	R5, R3
-	STDCCC	R3, (R4)
-	BNE	-4(PC)
-	SYNC
-	ISYNC
-	MOVD	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-20
-	MOVD	ptr+0(FP), R4
-	MOVW	new+8(FP), R5
-	SYNC
-	LWAR	(R4), R3
-	STWCCC	R5, (R4)
-	BNE	-3(PC)
-	SYNC
-	ISYNC
-	MOVW	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-	MOVD	ptr+0(FP), R4
-	MOVD	new+8(FP), R5
-	SYNC
-	LDAR	(R4), R3
-	STDCCC	R5, (R4)
-	BNE	-3(PC)
-	SYNC
-	ISYNC
-	MOVD	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
-	BR	runtime·xchg64(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
-	BR	runtime·atomicstore64(SB)
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
-	MOVD	ptr+0(FP), R3
-	MOVW	val+8(FP), R4
-	SYNC
-	MOVW	R4, 0(R3)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVD	ptr+0(FP), R3
-	MOVD	val+8(FP), R4
-	SYNC
-	MOVD	R4, 0(R3)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
-	MOVD	ptr+0(FP), R3
-	MOVBZ	val+8(FP), R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	// R5 = (R3 << 0) & ~3
-	RLDCR	$0, R3, $~3, R5
-	// Compute val shift.
-#ifdef GOARCH_ppc64
-	// Big endian.  ptr = ptr ^ 3
-	XOR	$3, R3
-#endif
-	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
-	RLDC	$3, R3, $(3*8), R6
-	// Shift val for aligned ptr.  R4 = val << R6
-	SLD	R6, R4, R4
-
-again:
-	SYNC
-	LWAR	(R5), R6
-	OR	R4, R6
-	STWCCC	R6, (R5)
-	BNE	again
-	SYNC
-	ISYNC
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
-	MOVD	ptr+0(FP), R3
-	MOVBZ	val+8(FP), R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	// R5 = (R3 << 0) & ~3
-	RLDCR	$0, R3, $~3, R5
-	// Compute val shift.
-#ifdef GOARCH_ppc64
-	// Big endian.  ptr = ptr ^ 3
-	XOR	$3, R3
-#endif
-	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
-	RLDC	$3, R3, $(3*8), R6
-	// Shift val for aligned ptr.  R4 = val << R6 | ^(0xFF << R6)
-	MOVD	$0xFF, R7
-	SLD	R6, R4
-	SLD	R6, R7
-	XOR $-1, R7
-	OR	R7, R4
-again:
-	SYNC
-	LWAR	(R5), R6
-	AND	R4, R6
-	STWCCC	R6, (R5)
-	BNE	again
-	SYNC
-	ISYNC
-	RET
-
 // void jmpdefer(fv, sp);
 // called from deferreturn.
 // 1. grab stored LR for caller
diff --git a/src/runtime/atomic_386.go b/src/runtime/atomic_386.go
deleted file mode 100644
index 204a0eb0bc..0000000000
--- a/src/runtime/atomic_386.go
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// The calls to nop are to keep these functions from being inlined.
-// If they are inlined we have no guarantee that later rewrites of the
-// code by optimizers will preserve the relative order of memory accesses.
-
-//go:nosplit
-func atomicload(ptr *uint32) uint32 {
-	nop()
-	return *ptr
-}
-
-//go:nosplit
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer {
-	nop()
-	return *(*unsafe.Pointer)(ptr)
-}
-
-//go:nosplit
-func xadd64(ptr *uint64, delta int64) uint64 {
-	for {
-		old := *ptr
-		if cas64(ptr, old, old+uint64(delta)) {
-			return old + uint64(delta)
-		}
-	}
-}
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:nosplit
-func xchg64(ptr *uint64, new uint64) uint64 {
-	for {
-		old := *ptr
-		if cas64(ptr, old, new) {
-			return old
-		}
-	}
-}
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicload64(ptr *uint64) uint64
-
-//go:noescape
-func atomicand8(ptr *uint8, val uint8)
-
-//go:noescape
-func atomicor8(ptr *uint8, val uint8)
-
-// NOTE: Do not add atomicxor8 (XOR is not idempotent).
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_amd64x.go b/src/runtime/atomic_amd64x.go
deleted file mode 100644
index 256b30bb5f..0000000000
--- a/src/runtime/atomic_amd64x.go
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64 amd64p32
-
-package runtime
-
-import "unsafe"
-
-// The calls to nop are to keep these functions from being inlined.
-// If they are inlined we have no guarantee that later rewrites of the
-// code by optimizers will preserve the relative order of memory accesses.
-
-//go:nosplit
-func atomicload(ptr *uint32) uint32 {
-	nop()
-	return *ptr
-}
-
-//go:nosplit
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer {
-	nop()
-	return *(*unsafe.Pointer)(ptr)
-}
-
-//go:nosplit
-func atomicload64(ptr *uint64) uint64 {
-	nop()
-	return *ptr
-}
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xadd64(ptr *uint64, delta int64) uint64
-
-//go:noescape
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-//go:noescape
-func xchg64(ptr *uint64, new uint64) uint64
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicand8(ptr *uint8, val uint8)
-
-//go:noescape
-func atomicor8(ptr *uint8, val uint8)
-
-// NOTE: Do not add atomicxor8 (XOR is not idempotent).
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_arm.go b/src/runtime/atomic_arm.go
deleted file mode 100644
index 0c53b102c1..0000000000
--- a/src/runtime/atomic_arm.go
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-var locktab [57]struct {
-	l   mutex
-	pad [_CacheLineSize - unsafe.Sizeof(mutex{})]byte
-}
-
-func addrLock(addr *uint64) *mutex {
-	return &locktab[(uintptr(unsafe.Pointer(addr))>>3)%uintptr(len(locktab))].l
-}
-
-// Atomic add and return new value.
-//go:nosplit
-func xadd(val *uint32, delta int32) uint32 {
-	for {
-		oval := *val
-		nval := oval + uint32(delta)
-		if cas(val, oval, nval) {
-			return nval
-		}
-	}
-}
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:nosplit
-func xchg(addr *uint32, v uint32) uint32 {
-	for {
-		old := *addr
-		if cas(addr, old, v) {
-			return old
-		}
-	}
-}
-
-//go:nosplit
-func xchguintptr(addr *uintptr, v uintptr) uintptr {
-	return uintptr(xchg((*uint32)(unsafe.Pointer(addr)), uint32(v)))
-}
-
-//go:nosplit
-func atomicload(addr *uint32) uint32 {
-	return xadd(addr, 0)
-}
-
-//go:nosplit
-func atomicloadp(addr unsafe.Pointer) unsafe.Pointer {
-	return unsafe.Pointer(uintptr(xadd((*uint32)(addr), 0)))
-}
-
-//go:nosplit
-func atomicstorep1(addr unsafe.Pointer, v unsafe.Pointer) {
-	for {
-		old := *(*unsafe.Pointer)(addr)
-		if casp1((*unsafe.Pointer)(addr), old, v) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func atomicstore(addr *uint32, v uint32) {
-	for {
-		old := *addr
-		if cas(addr, old, v) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func cas64(addr *uint64, old, new uint64) bool {
-	var ok bool
-	systemstack(func() {
-		lock(addrLock(addr))
-		if *addr == old {
-			*addr = new
-			ok = true
-		}
-		unlock(addrLock(addr))
-	})
-	return ok
-}
-
-//go:nosplit
-func xadd64(addr *uint64, delta int64) uint64 {
-	var r uint64
-	systemstack(func() {
-		lock(addrLock(addr))
-		r = *addr + uint64(delta)
-		*addr = r
-		unlock(addrLock(addr))
-	})
-	return r
-}
-
-//go:nosplit
-func xchg64(addr *uint64, v uint64) uint64 {
-	var r uint64
-	systemstack(func() {
-		lock(addrLock(addr))
-		r = *addr
-		*addr = v
-		unlock(addrLock(addr))
-	})
-	return r
-}
-
-//go:nosplit
-func atomicload64(addr *uint64) uint64 {
-	var r uint64
-	systemstack(func() {
-		lock(addrLock(addr))
-		r = *addr
-		unlock(addrLock(addr))
-	})
-	return r
-}
-
-//go:nosplit
-func atomicstore64(addr *uint64, v uint64) {
-	systemstack(func() {
-		lock(addrLock(addr))
-		*addr = v
-		unlock(addrLock(addr))
-	})
-}
-
-//go:nosplit
-func atomicor8(addr *uint8, v uint8) {
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8) // little endian
-	for {
-		old := *addr32
-		if cas(addr32, old, old|word) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func atomicand8(addr *uint8, v uint8) {
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8)    // little endian
-	mask := uint32(0xFF) << ((uaddr & 3) * 8) // little endian
-	word |= ^mask
-	for {
-		old := *addr32
-		if cas(addr32, old, old&word) {
-			return
-		}
-	}
-}
diff --git a/src/runtime/atomic_arm64.go b/src/runtime/atomic_arm64.go
deleted file mode 100644
index b3af4002c4..0000000000
--- a/src/runtime/atomic_arm64.go
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xadd64(ptr *uint64, delta int64) uint64
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd64
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-//go:noescape
-func xchg64(ptr *uint64, new uint64) uint64
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicload(ptr *uint32) uint32
-
-//go:noescape
-func atomicload64(ptr *uint64) uint64
-
-//go:noescape
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer
-
-//go:nosplit
-func atomicor8(addr *uint8, v uint8) {
-	// TODO(dfc) implement this in asm.
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8) // little endian
-	for {
-		old := *addr32
-		if cas(addr32, old, old|word) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func atomicand8(addr *uint8, v uint8) {
-	// TODO(dfc) implement this in asm.
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8)    // little endian
-	mask := uint32(0xFF) << ((uaddr & 3) * 8) // little endian
-	word |= ^mask
-	for {
-		old := *addr32
-		if cas(addr32, old, old&word) {
-			return
-		}
-	}
-}
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_arm64.s b/src/runtime/atomic_arm64.s
index d3ab2a121c..4704aa6ffc 100644
--- a/src/runtime/atomic_arm64.s
+++ b/src/runtime/atomic_arm64.s
@@ -4,114 +4,6 @@
 
 #include "textflag.h"
 
-// uint32 runtime·atomicload(uint32 volatile* addr)
-TEXT ·atomicload(SB),NOSPLIT,$-8-12
-	MOVD	ptr+0(FP), R0
-	LDARW	(R0), R0
-	MOVW	R0, ret+8(FP)
-	RET
-
-// uint64 runtime·atomicload64(uint64 volatile* addr)
-TEXT ·atomicload64(SB),NOSPLIT,$-8-16
-	MOVD	ptr+0(FP), R0
-	LDAR	(R0), R0
-	MOVD	R0, ret+8(FP)
-	RET
-
-// void *runtime·atomicloadp(void *volatile *addr)
-TEXT ·atomicloadp(SB),NOSPLIT,$-8-16
-	MOVD	ptr+0(FP), R0
-	LDAR	(R0), R0
-	MOVD	R0, ret+8(FP)
-	RET
-
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
-	B	runtime·atomicstore64(SB)
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
-	MOVD	ptr+0(FP), R0
-	MOVW	val+8(FP), R1
-	STLRW	R1, (R0)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVD	ptr+0(FP), R0
-	MOVD	val+8(FP), R1
-	STLR	R1, (R0)
-	RET
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-20
-again:
-	MOVD	ptr+0(FP), R0
-	MOVW	new+8(FP), R1
-	LDAXRW	(R0), R2
-	STLXRW	R1, (R0), R3
-	CBNZ	R3, again
-	MOVW	R2, ret+16(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-again:
-	MOVD	ptr+0(FP), R0
-	MOVD	new+8(FP), R1
-	LDAXR	(R0), R2
-	STLXR	R1, (R0), R3
-	CBNZ	R3, again
-	MOVD	R2, ret+16(FP)
-	RET
-
-// bool runtime·cas64(uint64 *ptr, uint64 old, uint64 new)
-// Atomically:
-//      if(*val == *old){
-//              *val = new;
-//              return 1;
-//      } else {
-//              return 0;
-//      }
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVD	ptr+0(FP), R0
-	MOVD	old+8(FP), R1
-	MOVD	new+16(FP), R2
-again:
-	LDAXR	(R0), R3
-	CMP	R1, R3
-	BNE	ok
-	STLXR	R2, (R0), R3
-	CBNZ	R3, again
-ok:
-	CSET	EQ, R0
-	MOVB	R0, ret+24(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *ptr, int32 delta)
-// Atomically:
-//      *val += delta;
-//      return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-20
-again:
-	MOVD	ptr+0(FP), R0
-	MOVW	delta+8(FP), R1
-	LDAXRW	(R0), R2
-	ADDW	R2, R1, R2
-	STLXRW	R2, (R0), R3
-	CBNZ	R3, again
-	MOVW	R2, ret+16(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-again:
-	MOVD	ptr+0(FP), R0
-	MOVD	delta+8(FP), R1
-	LDAXR	(R0), R2
-	ADD	R2, R1, R2
-	STLXR	R2, (R0), R3
-	CBNZ	R3, again
-	MOVD	R2, ret+16(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
-	B	runtime·xchg64(SB)
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$-8-0
 	DMB	$0xe	// DMB ST
 	RET
diff --git a/src/runtime/atomic_pointer.go b/src/runtime/atomic_pointer.go
index ec2ea8a338..bd21b49945 100644
--- a/src/runtime/atomic_pointer.go
+++ b/src/runtime/atomic_pointer.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // These functions cannot have go:noescape annotations,
 // because while ptr does not escape, new does.
@@ -18,13 +21,13 @@ import "unsafe"
 
 //go:nosplit
 func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) {
-	atomicstorep1(noescape(ptr), new)
+	atomic.Storep1(noescape(ptr), new)
 	writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
 }
 
 //go:nosplit
 func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
-	if !casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), noescape(old), new) {
+	if !atomic.Casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), noescape(old), new) {
 		return false
 	}
 	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
@@ -42,7 +45,7 @@ func sync_atomic_StoreUintptr(ptr *uintptr, new uintptr)
 //go:nosplit
 func sync_atomic_StorePointer(ptr *unsafe.Pointer, new unsafe.Pointer) {
 	sync_atomic_StoreUintptr((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
-	atomicstorep1(noescape(unsafe.Pointer(ptr)), new)
+	atomic.Storep1(noescape(unsafe.Pointer(ptr)), new)
 	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
 }
 
diff --git a/src/runtime/atomic_ppc64x.go b/src/runtime/atomic_ppc64x.go
deleted file mode 100644
index 9273ddf03e..0000000000
--- a/src/runtime/atomic_ppc64x.go
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ppc64 ppc64le
-
-package runtime
-
-import "unsafe"
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xadd64(ptr *uint64, delta int64) uint64
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd64
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-//go:noescape
-func xchg64(ptr *uint64, new uint64) uint64
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicload(ptr *uint32) uint32
-
-//go:noescape
-func atomicload64(ptr *uint64) uint64
-
-//go:noescape
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer
-
-//go:noescape
-func atomicand8(ptr *uint8, val uint8)
-
-//go:noescape
-func atomicor8(ptr *uint8, val uint8)
-
-// NOTE: Do not add atomicxor8 (XOR is not idempotent).
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_ppc64x.s b/src/runtime/atomic_ppc64x.s
index 47769e6fd8..7cdb7466eb 100644
--- a/src/runtime/atomic_ppc64x.s
+++ b/src/runtime/atomic_ppc64x.s
@@ -6,39 +6,6 @@
 
 #include "textflag.h"
 
-// uint32 runtime·atomicload(uint32 volatile* addr)
-TEXT ·atomicload(SB),NOSPLIT|NOFRAME,$0-12
-	MOVD	addr+0(FP), R3
-	SYNC
-	MOVWZ	0(R3), R3
-	CMPW	R3, R3, CR7
-	BC	4, 30, 1(PC) // bne- cr7,0x4
-	ISYNC
-	MOVW	R3, ret+8(FP)
-	RET
-
-// uint64 runtime·atomicload64(uint64 volatile* addr)
-TEXT ·atomicload64(SB),NOSPLIT|NOFRAME,$0-16
-	MOVD	addr+0(FP), R3
-	SYNC
-	MOVD	0(R3), R3
-	CMP	R3, R3, CR7
-	BC	4, 30, 1(PC) // bne- cr7,0x4
-	ISYNC
-	MOVD	R3, ret+8(FP)
-	RET
-
-// void *runtime·atomicloadp(void *volatile *addr)
-TEXT ·atomicloadp(SB),NOSPLIT|NOFRAME,$0-16
-	MOVD	addr+0(FP), R3
-	SYNC
-	MOVD	0(R3), R3
-	CMP	R3, R3, CR7
-	BC	4, 30, 1(PC) // bne- cr7,0x4
-	ISYNC
-	MOVD	R3, ret+8(FP)
-	RET
-
 TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
 	// LWSYNC is the "export" barrier recommended by Power ISA
 	// v2.07 book II, appendix B.2.2.2.
diff --git a/src/runtime/atomic_test.go b/src/runtime/atomic_test.go
deleted file mode 100644
index 26991037f6..0000000000
--- a/src/runtime/atomic_test.go
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime_test
-
-import (
-	"runtime"
-	"testing"
-	"unsafe"
-)
-
-func runParallel(N, iter int, f func()) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(int(N)))
-	done := make(chan bool)
-	for i := 0; i < N; i++ {
-		go func() {
-			for j := 0; j < iter; j++ {
-				f()
-			}
-			done <- true
-		}()
-	}
-	for i := 0; i < N; i++ {
-		<-done
-	}
-}
-
-func TestXadduintptr(t *testing.T) {
-	const N = 20
-	const iter = 100000
-	inc := uintptr(100)
-	total := uintptr(0)
-	runParallel(N, iter, func() {
-		runtime.Xadduintptr(&total, inc)
-	})
-	if want := uintptr(N * iter * inc); want != total {
-		t.Fatalf("xadduintpr error, want %d, got %d", want, total)
-	}
-	total = 0
-	runParallel(N, iter, func() {
-		runtime.Xadduintptr(&total, inc)
-		runtime.Xadduintptr(&total, uintptr(-int64(inc)))
-	})
-	if total != 0 {
-		t.Fatalf("xadduintpr total error, want %d, got %d", 0, total)
-	}
-}
-
-// Tests that xadduintptr correctly updates 64-bit values.  The place where
-// we actually do so is mstats.go, functions mSysStat{Inc,Dec}.
-func TestXadduintptrOnUint64(t *testing.T) {
-	if runtime.BigEndian != 0 {
-		// On big endian architectures, we never use xadduintptr to update
-		// 64-bit values and hence we skip the test.  (Note that functions
-		// mSysStat{Inc,Dec} in mstats.go have explicit checks for
-		// big-endianness.)
-		return
-	}
-	const inc = 100
-	val := uint64(0)
-	runtime.Xadduintptr((*uintptr)(unsafe.Pointer(&val)), inc)
-	if inc != val {
-		t.Fatalf("xadduintptr should increase lower-order bits, want %d, got %d", inc, val)
-	}
-}
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index 839e235a9e..5be18beb23 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -11,7 +11,10 @@ package runtime
 // For buffered channels, also:
 //  c.qcount > 0 implies that c.recvq is empty.
 //  c.qcount < c.dataqsiz implies that c.sendq is empty.
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	maxAlign  = 8
@@ -393,8 +396,8 @@ func chanrecv(t *chantype, c *hchan, ep unsafe.Pointer, block bool) (selected, r
 	// The order of operations is important here: reversing the operations can lead to
 	// incorrect behavior when racing with a close.
 	if !block && (c.dataqsiz == 0 && c.sendq.first == nil ||
-		c.dataqsiz > 0 && atomicloaduint(&c.qcount) == 0) &&
-		atomicload(&c.closed) == 0 {
+		c.dataqsiz > 0 && atomic.Loaduint(&c.qcount) == 0) &&
+		atomic.Load(&c.closed) == 0 {
 		return
 	}
 
@@ -669,7 +672,7 @@ func (q *waitq) dequeue() *sudog {
 		// if sgp participates in a select and is already signaled, ignore it
 		if sgp.selectdone != nil {
 			// claim the right to signal
-			if *sgp.selectdone != 0 || !cas(sgp.selectdone, 0, 1) {
+			if *sgp.selectdone != 0 || !atomic.Cas(sgp.selectdone, 0, 1) {
 				continue
 			}
 		}
diff --git a/src/runtime/cpuprof.go b/src/runtime/cpuprof.go
index 0790852d97..87d5f99b44 100644
--- a/src/runtime/cpuprof.go
+++ b/src/runtime/cpuprof.go
@@ -50,7 +50,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	numBuckets      = 1 << 10
@@ -173,7 +176,7 @@ func SetCPUProfileRate(hz int) {
 			if n&0x80000000 != 0 {
 				print("runtime: setcpuprofile(off) twice\n")
 			}
-			if cas(&cpuprof.handoff, n, n|0x80000000) {
+			if atomic.Cas(&cpuprof.handoff, n, n|0x80000000) {
 				if n == 0 {
 					// we did the transition from 0 -> nonzero so we wake getprofile
 					notewakeup(&cpuprof.wait)
@@ -276,7 +279,7 @@ func (p *cpuProfile) evict(e *cpuprofEntry) bool {
 // so it cannot allocate memory or block.  It can try to swap logs with
 // the writing goroutine, as explained in the comment at the top of this file.
 func (p *cpuProfile) flushlog() bool {
-	if !cas(&p.handoff, 0, uint32(p.nlog)) {
+	if !atomic.Cas(&p.handoff, 0, uint32(p.nlog)) {
 		return false
 	}
 	notewakeup(&p.wait)
@@ -318,7 +321,7 @@ func (p *cpuProfile) getprofile() []byte {
 				p.flushing = true
 				goto Flush
 			}
-			if cas(&p.handoff, n, 0) {
+			if atomic.Cas(&p.handoff, n, 0) {
 				break
 			}
 		}
@@ -389,7 +392,7 @@ Flush:
 
 	// Finally done.  Clean up and return nil.
 	p.flushing = false
-	if !cas(&p.handoff, p.handoff, 0) {
+	if !atomic.Cas(&p.handoff, p.handoff, 0) {
 		print("runtime: profile flush racing with something\n")
 	}
 	return nil
diff --git a/src/runtime/debug.go b/src/runtime/debug.go
index b7e7971104..ac61173b7f 100644
--- a/src/runtime/debug.go
+++ b/src/runtime/debug.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // GOMAXPROCS sets the maximum number of CPUs that can be executing
 // simultaneously and returns the previous setting.  If n < 1, it does not
@@ -39,7 +42,7 @@ func NumCPU() int {
 // NumCgoCall returns the number of cgo calls made by the current process.
 func NumCgoCall() int64 {
 	var n int64
-	for mp := (*m)(atomicloadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+	for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
 		n += int64(mp.ncgocall)
 	}
 	return n
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index d94fcb3bae..ad2bf1c628 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 var Fadd64 = fadd64
 var Fsub64 = fsub64
@@ -22,7 +25,7 @@ var Sqrt = sqrt
 var Entersyscall = entersyscall
 var Exitsyscall = exitsyscall
 var LockedOSThread = lockedOSThread
-var Xadduintptr = xadduintptr
+var Xadduintptr = atomic.Xadduintptr
 
 var FuncPC = funcPC
 
diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go
index d59ad297f5..667367891c 100644
--- a/src/runtime/hashmap.go
+++ b/src/runtime/hashmap.go
@@ -54,6 +54,7 @@ package runtime
 // before the table grows.  Typical tables will be somewhat less loaded.
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -280,7 +281,7 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
 		msanread(key, t.key.size)
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr))
+		return atomic.Loadp(unsafe.Pointer(&zeroptr))
 	}
 	alg := t.key.alg
 	hash := alg.hash(key, uintptr(h.hash0))
@@ -315,7 +316,7 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr))
+			return atomic.Loadp(unsafe.Pointer(&zeroptr))
 		}
 	}
 }
@@ -331,7 +332,7 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool)
 		msanread(key, t.key.size)
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr)), false
+		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 	}
 	alg := t.key.alg
 	hash := alg.hash(key, uintptr(h.hash0))
@@ -366,7 +367,7 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool)
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr)), false
+			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 		}
 	}
 }
@@ -627,7 +628,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) {
 	// Remember we have an iterator.
 	// Can run concurrently with another hash_iter_init().
 	if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator {
-		atomicor8(&h.flags, iterator|oldIterator)
+		atomic.Or8(&h.flags, iterator|oldIterator)
 	}
 
 	mapiternext(it)
@@ -1024,14 +1025,14 @@ var zerosize uintptr = initialZeroSize
 // serve as the zero value for t.
 func mapzero(t *_type) {
 	// Is the type small enough for existing buffer?
-	cursize := uintptr(atomicloadp(unsafe.Pointer(&zerosize)))
+	cursize := uintptr(atomic.Loadp(unsafe.Pointer(&zerosize)))
 	if t.size <= cursize {
 		return
 	}
 
 	// Allocate a new buffer.
 	lock(&zerolock)
-	cursize = uintptr(atomicloadp(unsafe.Pointer(&zerosize)))
+	cursize = uintptr(atomic.Loadp(unsafe.Pointer(&zerosize)))
 	if cursize < t.size {
 		for cursize < t.size {
 			cursize *= 2
@@ -1040,8 +1041,8 @@ func mapzero(t *_type) {
 				throw("map element too large")
 			}
 		}
-		atomicstorep1(unsafe.Pointer(&zeroptr), persistentalloc(cursize, 64, &memstats.other_sys))
-		atomicstorep1(unsafe.Pointer(&zerosize), unsafe.Pointer(zerosize))
+		atomic.Storep1(unsafe.Pointer(&zeroptr), persistentalloc(cursize, 64, &memstats.other_sys))
+		atomic.Storep1(unsafe.Pointer(&zerosize), unsafe.Pointer(cursize)) // publish the grown size; storing zerosize back into itself never records the growth
 	}
 	unlock(&zerolock)
 }
diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go
index de9b267fde..9f310f8bf3 100644
--- a/src/runtime/hashmap_fast.go
+++ b/src/runtime/hashmap_fast.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -14,7 +15,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast32))
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr))
+		return atomic.Loadp(unsafe.Pointer(&zeroptr))
 	}
 	var b *bmap
 	if h.B == 0 {
@@ -45,7 +46,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr))
+			return atomic.Loadp(unsafe.Pointer(&zeroptr))
 		}
 	}
 }
@@ -56,7 +57,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast32))
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr)), false
+		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 	}
 	var b *bmap
 	if h.B == 0 {
@@ -87,7 +88,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr)), false
+			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 		}
 	}
 }
@@ -98,7 +99,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast64))
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr))
+		return atomic.Loadp(unsafe.Pointer(&zeroptr))
 	}
 	var b *bmap
 	if h.B == 0 {
@@ -129,7 +130,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr))
+			return atomic.Loadp(unsafe.Pointer(&zeroptr))
 		}
 	}
 }
@@ -140,7 +141,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast64))
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr)), false
+		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 	}
 	var b *bmap
 	if h.B == 0 {
@@ -171,7 +172,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr)), false
+			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 		}
 	}
 }
@@ -182,7 +183,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_faststr))
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr))
+		return atomic.Loadp(unsafe.Pointer(&zeroptr))
 	}
 	key := stringStructOf(&ky)
 	if h.B == 0 {
@@ -203,7 +204,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
 					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize))
 				}
 			}
-			return atomicloadp(unsafe.Pointer(&zeroptr))
+			return atomic.Loadp(unsafe.Pointer(&zeroptr))
 		}
 		// long key, try not to do more comparisons than necessary
 		keymaybe := uintptr(bucketCnt)
@@ -241,7 +242,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
 				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(t.valuesize))
 			}
 		}
-		return atomicloadp(unsafe.Pointer(&zeroptr))
+		return atomic.Loadp(unsafe.Pointer(&zeroptr))
 	}
 dohash:
 	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
@@ -273,7 +274,7 @@ dohash:
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr))
+			return atomic.Loadp(unsafe.Pointer(&zeroptr))
 		}
 	}
 }
@@ -284,7 +285,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_faststr))
 	}
 	if h == nil || h.count == 0 {
-		return atomicloadp(unsafe.Pointer(&zeroptr)), false
+		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 	}
 	key := stringStructOf(&ky)
 	if h.B == 0 {
@@ -305,7 +306,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
 					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)), true
 				}
 			}
-			return atomicloadp(unsafe.Pointer(&zeroptr)), false
+			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 		}
 		// long key, try not to do more comparisons than necessary
 		keymaybe := uintptr(bucketCnt)
@@ -341,7 +342,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
 				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(t.valuesize)), true
 			}
 		}
-		return atomicloadp(unsafe.Pointer(&zeroptr)), false
+		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 	}
 dohash:
 	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
@@ -373,7 +374,7 @@ dohash:
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomicloadp(unsafe.Pointer(&zeroptr)), false
+			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
 		}
 	}
 }
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index 58c422a528..d4e8b8e69f 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	hashSize = 1009
@@ -43,7 +46,7 @@ func getitab(inter *interfacetype, typ *_type, canfail bool) *itab {
 		if locked != 0 {
 			lock(&ifaceLock)
 		}
-		for m = (*itab)(atomicloadp(unsafe.Pointer(&hash[h]))); m != nil; m = m.link {
+		for m = (*itab)(atomic.Loadp(unsafe.Pointer(&hash[h]))); m != nil; m = m.link {
 			if m.inter == inter && m._type == typ {
 				if m.bad != 0 {
 					m = nil
@@ -151,7 +154,7 @@ func convT2I(t *_type, inter *interfacetype, cache **itab, elem unsafe.Pointer,
 	if msanenabled {
 		msanread(elem, t.size)
 	}
-	tab := (*itab)(atomicloadp(unsafe.Pointer(cache)))
+	tab := (*itab)(atomic.Loadp(unsafe.Pointer(cache)))
 	if tab == nil {
 		tab = getitab(inter, t, false)
 		atomicstorep(unsafe.Pointer(cache), unsafe.Pointer(tab))
diff --git a/src/runtime/internal/atomic/arch1_386.go b/src/runtime/internal/atomic/arch1_386.go
new file mode 100644
index 0000000000..76a9e2ecc4
--- /dev/null
+++ b/src/runtime/internal/atomic/arch1_386.go
@@ -0,0 +1,9 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+const (
+	_CacheLineSize = 64
+)
diff --git a/src/runtime/internal/atomic/arch1_amd64.go b/src/runtime/internal/atomic/arch1_amd64.go
new file mode 100644
index 0000000000..76a9e2ecc4
--- /dev/null
+++ b/src/runtime/internal/atomic/arch1_amd64.go
@@ -0,0 +1,9 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+const (
+	_CacheLineSize = 64
+)
diff --git a/src/runtime/internal/atomic/arch1_amd64p32.go b/src/runtime/internal/atomic/arch1_amd64p32.go
new file mode 100644
index 0000000000..7bb6e1aaf9
--- /dev/null
+++ b/src/runtime/internal/atomic/arch1_amd64p32.go
@@ -0,0 +1,14 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+const (
+	thechar        = '6'
+	_BigEndian     = 0
+	_CacheLineSize = 64
+	_PCQuantum     = 1
+	_Int64Align    = 8
+	hugePageSize   = 1 << 21
+)
diff --git a/src/runtime/internal/atomic/arch1_arm.go b/src/runtime/internal/atomic/arch1_arm.go
new file mode 100644
index 0000000000..d9b997c5ca
--- /dev/null
+++ b/src/runtime/internal/atomic/arch1_arm.go
@@ -0,0 +1,9 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+const (
+	_CacheLineSize = 32
+)
diff --git a/src/runtime/internal/atomic/arch1_arm64.go b/src/runtime/internal/atomic/arch1_arm64.go
new file mode 100644
index 0000000000..d9b997c5ca
--- /dev/null
+++ b/src/runtime/internal/atomic/arch1_arm64.go
@@ -0,0 +1,9 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+const (
+	_CacheLineSize = 32
+)
diff --git a/src/runtime/internal/atomic/arch1_ppc64.go b/src/runtime/internal/atomic/arch1_ppc64.go
new file mode 100644
index 0000000000..815a5840f7
--- /dev/null
+++ b/src/runtime/internal/atomic/arch1_ppc64.go
@@ -0,0 +1,9 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+const (
+	_CacheLineSize = 64
+)
diff --git a/src/runtime/internal/atomic/arch1_ppc64le.go b/src/runtime/internal/atomic/arch1_ppc64le.go
new file mode 100644
index 0000000000..815a5840f7
--- /dev/null
+++ b/src/runtime/internal/atomic/arch1_ppc64le.go
@@ -0,0 +1,9 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+const (
+	_CacheLineSize = 64
+)
diff --git a/src/runtime/internal/atomic/asm.s b/src/runtime/internal/atomic/asm.s
new file mode 100644
index 0000000000..b5d0211803
--- /dev/null
+++ b/src/runtime/internal/atomic/asm.s
@@ -0,0 +1,8 @@
+// Copyright 2015 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·nop(SB),NOSPLIT,$0-0
+	RET
diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
new file mode 100644
index 0000000000..ce84fd83d1
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -0,0 +1,166 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13
+	MOVL	ptr+0(FP), BX
+	MOVL	old+4(FP), AX
+	MOVL	new+8(FP), CX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	SETEQ	ret+12(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-13
+	JMP	runtime∕internal∕atomic·Cas(SB)
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-8
+	JMP	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-8
+	JMP	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8
+	JMP	runtime∕internal∕atomic·Store(SB)
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Xadd(SB) // same 12-byte layout as Xadd: ptr(4) + delta(4) + ret(4)
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Load64(SB) // same 12-byte layout as Load64: ptr(4) + 64-bit result(8)
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20
+	JMP	runtime∕internal∕atomic·Xadd64(SB) // args on 386: ptr(4) + 64-bit delta(8) + 64-bit result(8) = 20 bytes
+
+
+// bool runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21
+	MOVL	ptr+0(FP), BP
+	MOVL	old_lo+4(FP), AX
+	MOVL	old_hi+8(FP), DX
+	MOVL	new_lo+12(FP), BX
+	MOVL	new_hi+16(FP), CX
+	LOCK
+	CMPXCHG8B	0(BP)
+	SETEQ	ret+20(FP)
+	RET
+
+// bool Casp1(void **p, void *old, void *new)
+// Atomically:
+//	if(*p == old){
+//		*p = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13
+	MOVL	ptr+0(FP), BX
+	MOVL	old+4(FP), AX
+	MOVL	new+8(FP), CX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	SETEQ	ret+12(FP)
+	RET
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX
+	MOVL	delta+4(FP), AX
+	MOVL	AX, CX
+	LOCK
+	XADDL	AX, 0(BX)
+	ADDL	CX, AX
+	MOVL	AX, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX
+	MOVL	new+4(FP), AX
+	XCHGL	AX, 0(BX)
+	MOVL	AX, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Xchg(SB)
+
+
+TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX
+	MOVL	val+4(FP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX
+	MOVL	val+4(FP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* addr);
+TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), AX
+	TESTL	$7, AX
+	JZ	2(PC)
+	MOVL	0, AX // crash with nil ptr deref
+	LEAL	ret_lo+4(FP), BX
+	// MOVQ (%EAX), %MM0
+	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
+	// MOVQ %MM0, 0(%EBX)
+	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
+	// EMMS
+	BYTE $0x0F; BYTE $0x77
+	RET
+
+// void runtime∕internal∕atomic·Store64(uint64 volatile* addr, uint64 v);
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), AX
+	TESTL	$7, AX
+	JZ	2(PC)
+	MOVL	0, AX // crash with nil ptr deref
+	// MOVQ and EMMS were introduced on the Pentium MMX.
+	// MOVQ 0x8(%ESP), %MM0
+	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
+	// MOVQ %MM0, (%EAX)
+	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
+	// EMMS
+	BYTE $0x0F; BYTE $0x77
+	// This is essentially a no-op, but it provides required memory fencing.
+	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
+	MOVL	$0, AX
+	LOCK
+	XADDL	AX, (SP)
+	RET
+
+// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), AX
+	MOVB	val+4(FP), BX
+	LOCK
+	ORB	BX, (AX)
+	RET
+
+// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), AX
+	MOVB	val+4(FP), BX
+	LOCK
+	ANDB	BX, (AX)
+	RET
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
new file mode 100644
index 0000000000..7463fec4a1
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_amd64.s
@@ -0,0 +1,150 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0-17
+	MOVQ	ptr+0(FP), BX	// BX = address of the word
+	MOVL	old+8(FP), AX	// expected value; CMPXCHG compares memory against AX
+	MOVL	new+12(FP), CX	// replacement value
+	LOCK
+	CMPXCHGL	CX, 0(BX)	// if 0(BX) == AX, store CX there and set ZF
+	SETEQ	ret+16(FP)	// ret = ZF, i.e. whether the swap happened
+	RET
+
+// bool	runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVQ	ptr+0(FP), BX	// BX = address of the 64-bit word
+	MOVQ	old+8(FP), AX	// expected value (passed by value); CMPXCHG compares memory against AX
+	MOVQ	new+16(FP), CX	// replacement value
+	LOCK
+	CMPXCHGQ	CX, 0(BX)	// if 0(BX) == AX, store CX there and set ZF
+	SETEQ	ret+24(FP)	// ret = ZF, i.e. whether the swap happened
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
+	JMP	runtime∕internal∕atomic·Cas64(SB)	// uintptr is handled as a 64-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Load64(SB)	// tail-jump: same argument frame as Load64
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Load64(SB)	// uint shares the 64-bit load
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Store64(SB)	// tail-jump: same argument frame as Store64
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Load64(SB)	// signed and unsigned loads share one implementation
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24	// frame: ptr(8) + delta(8) + ret(8), same as Xadd64
+	JMP	runtime∕internal∕atomic·Xadd64(SB)	// signed and unsigned 64-bit add share one implementation
+
+// bool Casp1(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
+	MOVQ	ptr+0(FP), BX	// BX = address of the pointer slot
+	MOVQ	old+8(FP), AX	// expected pointer; CMPXCHG compares memory against AX
+	MOVQ	new+16(FP), CX	// replacement pointer
+	LOCK
+	CMPXCHGQ	CX, 0(BX)	// if 0(BX) == AX, store CX there and set ZF
+	SETEQ	ret+24(FP)	// ret = ZF, i.e. whether the swap happened
+	RET
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
+	MOVQ	ptr+0(FP), BX	// BX = address of the word
+	MOVL	delta+8(FP), AX
+	MOVL	AX, CX	// keep a copy of delta: XADD overwrites AX
+	LOCK
+	XADDL	AX, 0(BX)	// AX = old value; memory = old + delta
+	ADDL	CX, AX	// AX = old + delta, the new value
+	MOVL	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+	MOVQ	ptr+0(FP), BX	// BX = address of the 64-bit word
+	MOVQ	delta+8(FP), AX
+	MOVQ	AX, CX	// keep a copy of delta: XADD overwrites AX
+	LOCK
+	XADDQ	AX, 0(BX)	// AX = old value; memory = old + delta
+	ADDQ	CX, AX	// AX = old + delta, the new value
+	MOVQ	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
+	JMP	runtime∕internal∕atomic·Xadd64(SB)	// uintptr is handled as a 64-bit word in this file
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
+	MOVQ	ptr+0(FP), BX	// BX = address of the word
+	MOVL	new+8(FP), AX	// AX = value to install
+	XCHGL	AX, 0(BX)	// swap: memory = new, AX = previous contents
+	MOVL	AX, ret+16(FP)	// return the previous contents
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+	MOVQ	ptr+0(FP), BX	// BX = address of the 64-bit word
+	MOVQ	new+8(FP), AX	// AX = value to install
+	XCHGQ	AX, 0(BX)	// swap: memory = new, AX = previous contents
+	MOVQ	AX, ret+16(FP)	// return the previous contents
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
+	JMP	runtime∕internal∕atomic·Xchg64(SB)	// uintptr is handled as a 64-bit word in this file
+
+TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-16
+	MOVQ	ptr+0(FP), BX	// BX = address to store to
+	MOVQ	val+8(FP), AX	// AX = pointer value to store
+	XCHGQ	AX, 0(BX)	// store by exchange; the previous contents are dropped
+	RET
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
+	MOVQ	ptr+0(FP), BX	// BX = address to store to
+	MOVL	val+8(FP), AX	// AX = 32-bit value to store
+	XCHGL	AX, 0(BX)	// store by exchange; the previous contents are dropped
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVQ	ptr+0(FP), BX	// BX = address to store to
+	MOVQ	val+8(FP), AX	// AX = 64-bit value to store
+	XCHGQ	AX, 0(BX)	// store by exchange; the previous contents are dropped
+	RET
+
+// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
+	MOVQ	ptr+0(FP), AX	// AX = address of the byte
+	MOVB	val+8(FP), BX	// BX = bits to set
+	LOCK	// prefix for the read-modify-write below
+	ORB	BX, (AX)	// *ptr |= val
+	RET
+
+// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
+	MOVQ	ptr+0(FP), AX	// AX = address of the byte
+	MOVB	val+8(FP), BX	// BX = mask to apply
+	LOCK	// prefix for the read-modify-write below
+	ANDB	BX, (AX)	// *ptr &= val
+	RET
diff --git a/src/runtime/internal/atomic/asm_amd64p32.s b/src/runtime/internal/atomic/asm_amd64p32.s
new file mode 100644
index 0000000000..f1e2c3aca6
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_amd64p32.s
@@ -0,0 +1,150 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
+	MOVL	ptr+0(FP), BX	// BX = address of the word (pointer argument is 4 bytes here)
+	MOVL	old+4(FP), AX	// expected value; CMPXCHG compares memory against AX
+	MOVL	new+8(FP), CX	// replacement value
+	LOCK
+	CMPXCHGL	CX, 0(BX)	// if 0(BX) == AX, store CX there and set ZF
+	SETEQ	ret+16(FP)	// ret = ZF, i.e. whether the swap happened
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-17
+	JMP	runtime∕internal∕atomic·Cas(SB)	// uintptr is handled as a 32-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Load(SB)	// tail-jump to the 32-bit load
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Load(SB)	// uint shares the 32-bit load
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8	// frame: ptr(4) + val(4), same as Store; there is no result
+	JMP	runtime∕internal∕atomic·Store(SB)	// uintptr is handled as a 32-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16	// frame: ptr(4) + padding(4) + ret(8), same as Load64
+	JMP	runtime∕internal∕atomic·Load64(SB)	// signed and unsigned loads share one implementation
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24
+	JMP	runtime∕internal∕atomic·Xadd64(SB)	// signed and unsigned 64-bit add share one implementation
+
+// bool	runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVL	ptr+0(FP), BX	// pointer argument is 4 bytes; the 64-bit operands start at offset 8
+	MOVQ	old+8(FP), AX	// expected value (passed by value); CMPXCHG compares memory against AX
+	MOVQ	new+16(FP), CX	// replacement value
+	LOCK
+	CMPXCHGQ	CX, 0(BX)	// if 0(BX) == AX, store CX there and set ZF
+	SETEQ	ret+24(FP)	// ret = ZF, i.e. whether the swap happened
+	RET
+
+// bool Casp1(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-17
+	MOVL	ptr+0(FP), BX	// BX = address of the pointer slot
+	MOVL	old+4(FP), AX	// expected pointer (4 bytes here); CMPXCHG compares memory against AX
+	MOVL	new+8(FP), CX	// replacement pointer
+	LOCK
+	CMPXCHGL	CX, 0(BX)	// if 0(BX) == AX, store CX there and set ZF
+	SETEQ	ret+16(FP)	// ret = ZF, i.e. whether the swap happened
+	RET
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX	// BX = address of the word
+	MOVL	delta+4(FP), AX
+	MOVL	AX, CX	// keep a copy of delta: XADD overwrites AX
+	LOCK
+	XADDL	AX, 0(BX)	// AX = old value; memory = old + delta
+	ADDL	CX, AX	// AX = old + delta, the new value
+	MOVL	AX, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+	MOVL	ptr+0(FP), BX	// pointer argument is 4 bytes; delta starts at offset 8
+	MOVQ	delta+8(FP), AX
+	MOVQ	AX, CX	// keep a copy of delta: XADD overwrites AX
+	LOCK
+	XADDQ	AX, 0(BX)	// AX = old value; memory = old + delta
+	ADDQ	CX, AX	// AX = old + delta, the new value
+	MOVQ	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Xadd(SB)	// uintptr is handled as a 32-bit word in this file
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX	// BX = address of the word
+	MOVL	new+4(FP), AX	// AX = value to install
+	XCHGL	AX, 0(BX)	// swap: memory = new, AX = previous contents
+	MOVL	AX, ret+8(FP)	// return the previous contents
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+	MOVL	ptr+0(FP), BX	// pointer argument is 4 bytes; new starts at offset 8
+	MOVQ	new+8(FP), AX	// AX = value to install
+	XCHGQ	AX, 0(BX)	// swap: memory = new, AX = previous contents
+	MOVQ	AX, ret+16(FP)	// return the previous contents
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Xchg(SB)	// uintptr is handled as a 32-bit word in this file
+
+TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX	// BX = address to store to
+	MOVL	val+4(FP), AX	// AX = pointer value to store (4 bytes here)
+	XCHGL	AX, 0(BX)	// store by exchange; the previous contents are dropped
+	RET
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX	// BX = address to store to
+	MOVL	val+4(FP), AX	// AX = 32-bit value to store
+	XCHGL	AX, 0(BX)	// store by exchange; the previous contents are dropped
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVL	ptr+0(FP), BX	// pointer argument is 4 bytes; val starts at offset 8
+	MOVQ	val+8(FP), AX	// AX = 64-bit value to store
+	XCHGQ	AX, 0(BX)	// store by exchange; the previous contents are dropped
+	RET
+
+// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), BX	// BX = address of the byte
+	MOVB	val+4(FP), AX	// AX = bits to set
+	LOCK	// prefix for the read-modify-write below
+	ORB	AX, 0(BX)	// *ptr |= val
+	RET
+
+// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), BX	// BX = address of the byte
+	MOVB	val+4(FP), AX	// AX = mask to apply
+	LOCK	// prefix for the read-modify-write below
+	ANDB	AX, 0(BX)	// *ptr &= val
+	RET
diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s
new file mode 100644
index 0000000000..235e8bfd20
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_arm.s
@@ -0,0 +1,71 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool armcas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+//
+// To implement runtime∕internal∕atomic·Cas in sys_$GOOS_arm.s
+// using the native instructions, use:
+//
+//	TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+//		B	runtime∕internal∕atomic·armcas(SB)
+//
+TEXT runtime∕internal∕atomic·armcas(SB),NOSPLIT,$0-13
+	MOVW	valptr+0(FP), R1	// R1 = address of the word
+	MOVW	old+4(FP), R2	// R2 = expected value
+	MOVW	new+8(FP), R3	// R3 = replacement value
+casl:
+	LDREX	(R1), R0	// exclusive load of the current value
+	CMP	R0, R2
+	BNE	casfail	// current value != old: report failure without storing
+
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)	// goarm < 7: skip the barrier instruction
+	WORD	$0xf57ff05a	// dmb ishst
+
+	STREX	R3, (R1), R0	// exclusive store; R0 receives the status
+	CMP	$0, R0
+	BNE	casl	// non-zero status: the store did not happen, retry from the load
+	MOVW	$1, R0	// success result
+
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)	// goarm < 7: skip the barrier instruction
+	WORD	$0xf57ff05b	// dmb ish
+
+	MOVB	R0, ret+12(FP)
+	RET
+casfail:
+	MOVW	$0, R0	// failure result
+	MOVB	R0, ret+12(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB),NOSPLIT,$0-13
+	B	runtime∕internal∕atomic·Cas(SB)	// uintptr is handled as a 32-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Load(SB)	// tail-branch to the 32-bit load
+
+TEXT runtime∕internal∕atomic·Loaduint(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Load(SB)	// uint shares the 32-bit load
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Store(SB)	// tail-branch to the 32-bit store
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB),NOSPLIT,$0-12	// frame: ptr(4) + delta(4) + ret(4)
+	B	runtime∕internal∕atomic·Xadd(SB)	// uintptr is handled as a 32-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loadint64(SB),NOSPLIT,$0-12	// frame: ptr(4) + ret(8)
+	B	runtime∕internal∕atomic·Load64(SB)	// signed and unsigned loads share one implementation
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20	// frame: ptr(4) + delta(8) + ret(8)
+	B	runtime∕internal∕atomic·Xadd64(SB)	// signed and unsigned 64-bit add share one implementation
diff --git a/src/runtime/internal/atomic/asm_arm64.s b/src/runtime/internal/atomic/asm_arm64.s
new file mode 100644
index 0000000000..c255677f78
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_arm64.s
@@ -0,0 +1,58 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+//	if(*ptr == old){
+//		*ptr = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
+	MOVD	ptr+0(FP), R0	// R0 = address of the word
+	MOVW	old+8(FP), R1	// R1 = expected value
+	MOVW	new+12(FP), R2	// R2 = replacement value
+again:
+	LDAXRW	(R0), R3	// exclusive load of the current value
+	CMPW	R1, R3
+	BNE	ok	// mismatch: leave with the "not equal" flags set
+	STLXRW	R2, (R0), R3	// exclusive store; R3 receives the status
+	CBNZ	R3, again	// non-zero status: the store did not happen, retry
+ok:
+	CSET	EQ, R0	// R0 = 1 when the flags from CMPW say equal, else 0
+	MOVB	R0, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
+	B	runtime∕internal∕atomic·Cas64(SB)	// uintptr is handled as a 64-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $-8-16
+	B	runtime∕internal∕atomic·Load64(SB)	// tail-branch to the 64-bit load
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $-8-16
+	B	runtime∕internal∕atomic·Load64(SB)	// uint shares the 64-bit load
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Store64(SB)	// tail-branch to the 64-bit store
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24	// frame: ptr(8) + delta(8) + ret(8), same as Xadd64
+	B	runtime∕internal∕atomic·Xadd64(SB)	// uintptr is handled as a 64-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Load64(SB)	// signed and unsigned loads share one implementation
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24	// frame: ptr(8) + delta(8) + ret(8), same as Xadd64
+	B	runtime∕internal∕atomic·Xadd64(SB)	// signed and unsigned 64-bit add share one implementation
+
+// bool Casp1(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
+	B runtime∕internal∕atomic·Cas64(SB)	// pointers are compared and swapped as 64-bit words
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
new file mode 100644
index 0000000000..87f7f5d892
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -0,0 +1,225 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
+// bool Cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+//	if(*ptr == old){
+//		*ptr = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
+	MOVD	ptr+0(FP), R3	// R3 = address of the word
+	MOVWZ	old+8(FP), R4	// R4 = expected value
+	MOVWZ	new+12(FP), R5	// R5 = replacement value
+cas_again:
+	SYNC
+	LWAR	(R3), R6	// load the current word and reserve it
+	CMPW	R6, R4
+	BNE	cas_fail	// current value != old: report failure
+	STWCCC	R5, (R3)	// conditional store of new
+	BNE	cas_again	// conditional store did not succeed: retry from the SYNC
+	MOVD	$1, R3	// success result
+	SYNC
+	ISYNC
+	MOVB	R3, ret+16(FP)
+	RET
+cas_fail:
+	MOVD	$0, R3	// failure result
+	BR	-5(PC)	// rejoin the shared exit at the SYNC above (SYNC, ISYNC, MOVB, RET)
+
+// bool	runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
+// Atomically:
+//	if(*ptr == old){
+//		*ptr = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVD	ptr+0(FP), R3	// R3 = address of the doubleword
+	MOVD	old+8(FP), R4	// R4 = expected value (passed by value)
+	MOVD	new+16(FP), R5	// R5 = replacement value
+cas64_again:
+	SYNC
+	LDAR	(R3), R6	// load the current doubleword and reserve it
+	CMP	R6, R4
+	BNE	cas64_fail	// current value != old: report failure
+	STDCCC	R5, (R3)	// conditional store of new
+	BNE	cas64_again	// conditional store did not succeed: retry from the SYNC
+	MOVD	$1, R3	// success result
+	SYNC
+	ISYNC
+	MOVB	R3, ret+24(FP)
+	RET
+cas64_fail:
+	MOVD	$0, R3	// failure result
+	BR	-5(PC)	// rejoin the shared exit at the SYNC above (SYNC, ISYNC, MOVB, RET)
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
+	BR	runtime∕internal∕atomic·Cas64(SB)	// uintptr is handled as a 64-bit word in this file
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB),  NOSPLIT|NOFRAME, $0-16
+	BR	runtime∕internal∕atomic·Load64(SB)	// tail-branch to the 64-bit load
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
+	BR	runtime∕internal∕atomic·Load64(SB)	// uint shares the 64-bit load
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·Store64(SB)	// tail-branch to the 64-bit store
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
+	BR	runtime∕internal∕atomic·Xadd64(SB)	// tail-branch to the 64-bit add
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·Load64(SB)	// signed and unsigned loads share one implementation
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24	// frame: ptr(8) + delta(8) + ret(8), same as Xadd64
+	BR	runtime∕internal∕atomic·Xadd64(SB)	// signed and unsigned 64-bit add share one implementation
+
+// bool Casp1(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
+	BR runtime∕internal∕atomic·Cas64(SB)	// pointers are compared and swapped as 64-bit words
+
+// uint32 Xadd(uint32 volatile *ptr, int32 delta)
+// Atomically:
+//	*ptr += delta;
+//	return *ptr;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4	// R4 = address of the word
+	MOVW	delta+8(FP), R5	// R5 = amount to add
+	SYNC
+	LWAR	(R4), R3	// load the current word and reserve it
+	ADD	R5, R3	// R3 = current + delta
+	STWCCC	R3, (R4)	// conditional store of the sum
+	BNE	-4(PC)	// store did not succeed: retry from the SYNC
+	SYNC
+	ISYNC
+	MOVW	R3, ret+16(FP)	// return the new value
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4	// R4 = address of the doubleword
+	MOVD	delta+8(FP), R5	// R5 = amount to add
+	SYNC
+	LDAR	(R4), R3	// load the current doubleword and reserve it
+	ADD	R5, R3	// R3 = current + delta
+	STDCCC	R3, (R4)	// conditional store of the sum
+	BNE	-4(PC)	// store did not succeed: retry from the SYNC
+	SYNC
+	ISYNC
+	MOVD	R3, ret+16(FP)	// return the new value
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4	// R4 = address of the word
+	MOVW	new+8(FP), R5	// R5 = value to install
+	SYNC
+	LWAR	(R4), R3	// R3 = previous contents, with a reservation
+	STWCCC	R5, (R4)	// conditional store of new
+	BNE	-3(PC)	// store did not succeed: retry from the SYNC
+	SYNC
+	ISYNC
+	MOVW	R3, ret+16(FP)	// return the previous contents
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4	// R4 = address of the doubleword
+	MOVD	new+8(FP), R5	// R5 = value to install
+	SYNC
+	LDAR	(R4), R3	// R3 = previous contents, with a reservation
+	STDCCC	R5, (R4)	// conditional store of new
+	BNE	-3(PC)	// store did not succeed: retry from the SYNC
+	SYNC
+	ISYNC
+	MOVD	R3, ret+16(FP)	// return the previous contents
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
+	BR	runtime∕internal∕atomic·Xchg64(SB)	// uintptr is handled as a 64-bit word in this file
+
+
+TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·Store64(SB)	// pointers are stored as 64-bit words
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R3	// R3 = address to store to
+	MOVW	val+8(FP), R4	// R4 = 32-bit value
+	SYNC	// barrier issued before the store
+	MOVW	R4, 0(R3)	// plain 32-bit store
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVD	ptr+0(FP), R3	// R3 = address to store to
+	MOVD	val+8(FP), R4	// R4 = 64-bit value
+	SYNC	// barrier issued before the store
+	MOVD	R4, 0(R3)	// plain 64-bit store
+	RET
+
+// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
+	MOVD	ptr+0(FP), R3	// R3 = address of the byte
+	MOVBZ	val+8(FP), R4	// R4 = bits to set, zero-extended
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	// R5 = (R3 << 0) & ~3
+	RLDCR	$0, R3, $~3, R5
+	// Compute val shift.
+#ifdef GOARCH_ppc64
+	// Big endian.  ptr = ptr ^ 3
+	XOR	$3, R3
+#endif
+	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
+	RLDC	$3, R3, $(3*8), R6
+	// Shift val for aligned ptr.  R4 = val << R6
+	SLD	R6, R4, R4
+
+again:
+	SYNC
+	LWAR	(R5), R6	// load the containing word and reserve it (R6 is reused; the shift is no longer needed)
+	OR	R4, R6	// set the requested bits; the other three bytes are ORed with zero
+	STWCCC	R6, (R5)	// conditional store of the updated word
+	BNE	again	// store did not succeed: retry
+	SYNC
+	ISYNC
+	RET
+
+// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
+	MOVD	ptr+0(FP), R3	// R3 = address of the byte
+	MOVBZ	val+8(FP), R4	// R4 = mask to apply, zero-extended
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	// R5 = (R3 << 0) & ~3
+	RLDCR	$0, R3, $~3, R5
+	// Compute val shift.
+#ifdef GOARCH_ppc64
+	// Big endian.  ptr = ptr ^ 3
+	XOR	$3, R3
+#endif
+	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
+	RLDC	$3, R3, $(3*8), R6
+	// Shift val for aligned ptr.  R4 = val << R6 | ^(0xFF << R6)
+	MOVD	$0xFF, R7
+	SLD	R6, R4	// R4 = val moved to the target byte's position
+	SLD	R6, R7	// R7 = 0xFF at the target byte's position
+	XOR $-1, R7	// invert: ones everywhere except the target byte
+	OR	R7, R4	// R4 now leaves the other bytes unchanged when ANDed
+again:
+	SYNC
+	LWAR	(R5), R6	// load the containing word and reserve it (R6 is reused; the shift is no longer needed)
+	AND	R4, R6	// clear bits in the target byte only
+	STWCCC	R6, (R5)	// conditional store of the updated word
+	BNE	again	// store did not succeed: retry
+	SYNC
+	ISYNC
+	RET
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
new file mode 100644
index 0000000000..ae7b58325a
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -0,0 +1,80 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386
+
+package atomic
+
+import "unsafe"
+
+// The calls to nop are to keep these functions from being inlined.
+// If they are inlined we have no guarantee that later rewrites of the
+// code by optimizers will preserve the relative order of memory accesses.
+
+//go:nosplit
+func Load(ptr *uint32) uint32 {
+	nop() // keeps Load out of line; see the note above
+	return *ptr // plain 32-bit read
+}
+
+//go:nosplit
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer {
+	nop() // keeps Loadp out of line; see the note above
+	return *(*unsafe.Pointer)(ptr) // ptr is read as the address of a pointer-sized slot
+}
+
+//go:nosplit
+func Xadd64(ptr *uint64, delta int64) uint64 {
+	// Built from Cas64: read, then try to publish read+delta; start over if the swap fails.
+	cur := *ptr
+	for !Cas64(ptr, cur, cur+uint64(delta)) {
+		cur = *ptr
+	}
+	return cur + uint64(delta)
+}
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr // no Go body: the implementation is supplied outside this file
+
+//go:nosplit
+func Xchg64(ptr *uint64, new uint64) uint64 {
+	// Built from Cas64: retry until the value that was read is the value that got replaced.
+	prev := *ptr
+	for !Cas64(ptr, prev, new) {
+		prev = *ptr
+	}
+	return prev
+}
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32 // no Go body here (assembly)
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32 // no Go body here (assembly)
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr // assembly: jumps to Xchg
+
+//go:noescape
+func Load64(ptr *uint64) uint64 // assembly: single 8-byte MMX load; faults deliberately if ptr is not 8-byte aligned
+
+//go:noescape
+func And8(ptr *uint8, val uint8) // assembly: LOCK ANDB, *ptr &= val
+
+//go:noescape
+func Or8(ptr *uint8, val uint8) // assembly: LOCK ORB, *ptr |= val
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool // no Go body here (assembly)
+
+//go:noescape
+func Store(ptr *uint32, val uint32) // assembly: stores with XCHGL
+
+//go:noescape
+func Store64(ptr *uint64, val uint64) // assembly: single 8-byte MMX store, then a locked XADDL as a fence
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func Storep1(ptr unsafe.Pointer, val unsafe.Pointer) // assembly: stores with XCHGL
diff --git a/src/runtime/internal/atomic/atomic_amd64x.go b/src/runtime/internal/atomic/atomic_amd64x.go
new file mode 100644
index 0000000000..7f6c892364
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_amd64x.go
@@ -0,0 +1,69 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 amd64p32
+
+package atomic
+
+import "unsafe"
+
+// The calls to nop are to keep these functions from being inlined.
+// If they are inlined we have no guarantee that later rewrites of the
+// code by optimizers will preserve the relative order of memory accesses.
+
+//go:nosplit
+func Load(ptr *uint32) uint32 {
+	nop() // keeps Load out of line; see the note above
+	return *ptr // plain 32-bit read
+}
+
+//go:nosplit
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer {
+	nop() // keeps Loadp out of line; see the note above
+	return *(*unsafe.Pointer)(ptr) // ptr is read as the address of a pointer-sized slot
+}
+
+//go:nosplit
+func Load64(ptr *uint64) uint64 {
+	nop() // keeps Load64 out of line; see the note above
+	return *ptr // plain 64-bit read
+}
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32 // assembly: LOCK XADDL; returns the new value
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64 // assembly: LOCK XADDQ; returns the new value
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr // assembly: jumps to the add of matching width
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32 // assembly: XCHGL; returns the previous value
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64 // assembly: XCHGQ; returns the previous value
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr // assembly: jumps to the exchange of matching width
+
+//go:noescape
+func And8(ptr *uint8, val uint8) // assembly: LOCK ANDB, *ptr &= val
+
+//go:noescape
+func Or8(ptr *uint8, val uint8) // assembly: LOCK ORB, *ptr |= val
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool // assembly: LOCK CMPXCHGQ; reports whether the swap happened
+
+//go:noescape
+func Store(ptr *uint32, val uint32) // assembly: stores with XCHGL
+
+//go:noescape
+func Store64(ptr *uint64, val uint64) // assembly: stores with XCHGQ
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func Storep1(ptr unsafe.Pointer, val unsafe.Pointer) // assembly: stores with XCHG of pointer width
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
new file mode 100644
index 0000000000..1f33eef3c4
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -0,0 +1,180 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm
+
+package atomic
+
+import "unsafe"
+
+type spinlock struct {
+	v uint32 // 0 = unlocked, 1 = locked (the values lock and unlock use)
+}
+
+//go:nosplit
+func (l *spinlock) lock() {
+	// Acquire by flipping v from 0 to 1; keep trying until this caller
+	// is the one whose Cas succeeds.
+	for !Cas(&l.v, 0, 1) {
+		// busy-wait
+	}
+}
+
+//go:nosplit
+func (l *spinlock) unlock() {
+	Store(&l.v, 0) // release: 0 is the value lock's Cas waits for
+}
+
+var locktab [57]struct { // spinlocks for the 64-bit operations below, selected by addrLock
+	l   spinlock
+	pad [_CacheLineSize - unsafe.Sizeof(spinlock{})]byte // rounds each entry up to _CacheLineSize bytes
+}
+
+func addrLock(addr *uint64) *spinlock {
+	return &locktab[(uintptr(unsafe.Pointer(addr))>>3)%uintptr(len(locktab))].l // index = (address / 8) mod table length
+}
+
+// Xadd atomically adds delta to *val and returns the new value.
+//go:nosplit
+func Xadd(val *uint32, delta int32) uint32 {
+	cur := *val
+	sum := cur + uint32(delta)
+	for !Cas(val, cur, sum) {
+		cur = *val
+		sum = cur + uint32(delta)
+	}
+	return sum
+}
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr // no Go body: asm_arm.s branches to Xadd
+
+//go:nosplit
+func Xchg(addr *uint32, v uint32) uint32 {
+	// Built from Cas: retry until the value that was read is the value that got replaced.
+	prev := *addr
+	for !Cas(addr, prev, v) {
+		prev = *addr
+	}
+	return prev
+}
+
+//go:nosplit
+func Xchguintptr(addr *uintptr, v uintptr) uintptr {
+	return uintptr(Xchg((*uint32)(unsafe.Pointer(addr)), uint32(v))) // the uintptr slot is exchanged as a uint32
+}
+
+//go:nosplit
+func Load(addr *uint32) uint32 {
+	return Xadd(addr, 0) // adding zero returns the current value via the Cas-based add
+}
+
+// Should be a built-in for unsafe.Pointer?
+//go:nosplit
+func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(p) + x) // p advanced by x bytes; nothing else in this file calls add
+}
+
+//go:nosplit
+func Loadp(addr unsafe.Pointer) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(Xadd((*uint32)(addr), 0))) // the pointer slot is read as a uint32 via Xadd(.., 0)
+}
+
+//go:nosplit
+func Storep1(addr unsafe.Pointer, v unsafe.Pointer) {
+	// Store built from Casp1: replace whatever is currently in the slot, retrying on failure.
+	slot := (*unsafe.Pointer)(addr)
+	cur := *slot
+	for !Casp1(slot, cur, v) {
+		cur = *slot
+	}
+}
+
+//go:nosplit
+func Store(addr *uint32, v uint32) {
+	// Store built from Cas: replace the current value with v, retrying
+	// if the word changed between the read and the swap.
+	cur := *addr
+	for !Cas(addr, cur, v) {
+		cur = *addr
+	}
+}
+
+//go:nosplit
+func Cas64(addr *uint64, old, new uint64) bool {
+	var ok bool
+	addrLock(addr).lock()
+	if *addr == old {
+		*addr = new
+		ok = true
+	}
+	addrLock(addr).unlock()
+	return ok
+}
+
+//go:nosplit
+func Xadd64(addr *uint64, delta int64) uint64 {
+	var r uint64
+	addrLock(addr).lock()
+	r = *addr + uint64(delta)
+	*addr = r
+	addrLock(addr).unlock()
+	return r
+}
+
+//go:nosplit
+func Xchg64(addr *uint64, v uint64) uint64 {
+	var r uint64
+	addrLock(addr).lock()
+	r = *addr
+	*addr = v
+	addrLock(addr).unlock()
+	return r
+}
+
+//go:nosplit
+func Load64(addr *uint64) uint64 {
+	var r uint64
+	addrLock(addr).lock()
+	r = *addr
+	addrLock(addr).unlock()
+	return r
+}
+
+//go:nosplit
+func Store64(addr *uint64, v uint64) {
+	addrLock(addr).lock() // take the spinlock that addr hashes to
+	*addr = v
+	addrLock(addr).unlock() // same addr, so the same lock is released
+}
+
+//go:nosplit
+func Or8(addr *uint8, v uint8) {
+	// Operate on the 4-byte-aligned word that contains the byte, using
+	// the 32-bit Cas; the other three bytes are ORed with zero.
+	p := uintptr(unsafe.Pointer(addr))
+	shift := (p & 3) * 8 // little endian
+	word32 := (*uint32)(unsafe.Pointer(p &^ 3))
+	bits := uint32(v) << shift
+	cur := *word32
+	for !Cas(word32, cur, cur|bits) {
+		cur = *word32
+	}
+}
+
+//go:nosplit
+func And8(addr *uint8, v uint8) {
+	// Operate on the 4-byte-aligned word that contains the byte, using
+	// the 32-bit Cas. The mask is widened with ones so the other three
+	// bytes of the word are left unchanged.
+	p := uintptr(unsafe.Pointer(addr))
+	shift := (p & 3) * 8 // little endian
+	word32 := (*uint32)(unsafe.Pointer(p &^ 3))
+	keep := ^(uint32(0xFF) << shift)
+	bits := uint32(v)<<shift | keep
+	cur := *word32
+	for !Cas(word32, cur, cur&bits) {
+		cur = *word32
+	}
+}
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
new file mode 100644
index 0000000000..6b32346656
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -0,0 +1,80 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm64
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32 // assembly: exclusive load/store loop; returns the new value
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64 // assembly: exclusive load/store loop; returns the new value
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr // assembly: branches to Xadd64
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32 // assembly: exclusive load/store loop; returns the previous value
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64 // assembly: exclusive load/store loop; returns the previous value
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr // assembly: branches to Xchg64
+
+//go:noescape
+func Load(ptr *uint32) uint32 // assembly: LDARW
+
+//go:noescape
+func Load64(ptr *uint64) uint64 // assembly: LDAR
+
+//go:noescape
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer // assembly: LDAR
+
+//go:nosplit
+func Or8(addr *uint8, v uint8) {
+	// TODO(dfc) implement this in asm.
+	// Operate on the 4-byte-aligned word that contains the byte, using
+	// the 32-bit Cas; the other three bytes are ORed with zero.
+	p := uintptr(unsafe.Pointer(addr))
+	shift := (p & 3) * 8 // little endian
+	word32 := (*uint32)(unsafe.Pointer(p &^ 3))
+	bits := uint32(v) << shift
+	cur := *word32
+	for !Cas(word32, cur, cur|bits) {
+		cur = *word32
+	}
+}
+
+//go:nosplit
+func And8(addr *uint8, v uint8) {
+	// TODO(dfc) implement this in asm.
+	// Operate on the 4-byte-aligned word that contains the byte, using
+	// the 32-bit Cas. The mask is widened with ones so the other three
+	// bytes of the word are left unchanged.
+	p := uintptr(unsafe.Pointer(addr))
+	shift := (p & 3) * 8 // little endian
+	word32 := (*uint32)(unsafe.Pointer(p &^ 3))
+	keep := ^(uint32(0xFF) << shift)
+	bits := uint32(v)<<shift | keep
+	cur := *word32
+	for !Cas(word32, cur, cur&bits) {
+		cur = *word32
+	}
+}
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool // assembly: LDAXR/STLXR loop; reports whether the swap happened
+
+//go:noescape
+func Store(ptr *uint32, val uint32) // assembly: STLRW
+
+//go:noescape
+func Store64(ptr *uint64, val uint64) // assembly: STLR
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func Storep1(ptr unsafe.Pointer, val unsafe.Pointer) // assembly: branches to Store64
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
new file mode 100644
index 0000000000..7b1b0efaf6
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -0,0 +1,113 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// uint32 runtime∕internal∕atomic·Load(uint32 volatile* addr)
+TEXT ·Load(SB),NOSPLIT,$-8-12
+	MOVD	ptr+0(FP), R0	// R0 = source address
+	LDARW	(R0), R0	// 32-bit load-acquire; the address register is reused for the value
+	MOVW	R0, ret+8(FP)
+	RET
+
+// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* addr)
+TEXT ·Load64(SB),NOSPLIT,$-8-16
+	MOVD	ptr+0(FP), R0	// R0 = source address
+	LDAR	(R0), R0	// 64-bit load-acquire; the address register is reused for the value
+	MOVD	R0, ret+8(FP)
+	RET
+
+// void *runtime∕internal∕atomic·Loadp(void *volatile *addr)
+TEXT ·Loadp(SB),NOSPLIT,$-8-16
+	MOVD	ptr+0(FP), R0	// R0 = address of the pointer slot
+	LDAR	(R0), R0	// 64-bit load-acquire; the address register is reused for the value
+	MOVD	R0, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Store64(SB)	// pointers are stored as 64-bit words
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R0	// R0 = address to store to
+	MOVW	val+8(FP), R1	// R1 = 32-bit value
+	STLRW	R1, (R0)	// 32-bit store-release
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVD	ptr+0(FP), R0	// R0 = address to store to
+	MOVD	val+8(FP), R1	// R1 = 64-bit value
+	STLR	R1, (R0)	// 64-bit store-release
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
+again:	// the label precedes the argument loads, so each retry reloads them
+	MOVD	ptr+0(FP), R0
+	MOVW	new+8(FP), R1
+	LDAXRW	(R0), R2	// R2 = previous contents (exclusive load)
+	STLXRW	R1, (R0), R3	// exclusive store of new; R3 receives the status
+	CBNZ	R3, again	// non-zero status: the store did not happen, retry
+	MOVW	R2, ret+16(FP)	// return the previous contents
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+again:	// the label precedes the argument loads, so each retry reloads them
+	MOVD	ptr+0(FP), R0
+	MOVD	new+8(FP), R1
+	LDAXR	(R0), R2	// R2 = previous contents (exclusive load)
+	STLXR	R1, (R0), R3	// exclusive store of new; R3 receives the status
+	CBNZ	R3, again	// non-zero status: the store did not happen, retry
+	MOVD	R2, ret+16(FP)	// return the previous contents
+	RET
+
+// bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
+// Atomically:
+//      if(*ptr == old){
+//              *ptr = new;
+//              return 1;
+//      } else {
+//              return 0;
+//      }
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVD	ptr+0(FP), R0	// R0 = address of the doubleword
+	MOVD	old+8(FP), R1	// R1 = expected value (passed by value)
+	MOVD	new+16(FP), R2	// R2 = replacement value
+again:
+	LDAXR	(R0), R3	// exclusive load of the current value
+	CMP	R1, R3
+	BNE	ok	// mismatch: leave with the "not equal" flags set
+	STLXR	R2, (R0), R3	// exclusive store; R3 receives the status
+	CBNZ	R3, again	// non-zero status: the store did not happen, retry
+ok:
+	CSET	EQ, R0	// R0 = 1 when the flags from CMP say equal, else 0
+	MOVB	R0, ret+24(FP)
+	RET
+
+// uint32 Xadd(uint32 volatile *ptr, int32 delta)
+// Atomically:
+//      *ptr += delta;
+//      return *ptr;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
+again:	// the label precedes the argument loads, so each retry reloads them
+	MOVD	ptr+0(FP), R0
+	MOVW	delta+8(FP), R1
+	LDAXRW	(R0), R2	// R2 = current value (exclusive load)
+	ADDW	R2, R1, R2	// R2 = current + delta
+	STLXRW	R2, (R0), R3	// exclusive store of the sum; R3 receives the status
+	CBNZ	R3, again	// non-zero status: the store did not happen, retry
+	MOVW	R2, ret+16(FP)	// return the new value
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+again:	// the label precedes the argument loads, so each retry reloads them
+	MOVD	ptr+0(FP), R0
+	MOVD	delta+8(FP), R1
+	LDAXR	(R0), R2	// R2 = current value (exclusive load)
+	ADD	R2, R1, R2	// R2 = current + delta
+	STLXR	R2, (R0), R3	// exclusive store of the sum; R3 receives the status
+	CBNZ	R3, again	// non-zero status: the store did not happen, retry
+	MOVD	R2, ret+16(FP)	// return the new value
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
+	B	runtime∕internal∕atomic·Xchg64(SB)	// uintptr is handled as a 64-bit word in this file
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
new file mode 100644
index 0000000000..bf82b82643
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -0,0 +1,56 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32 // assembly: LWAR/STWCCC loop; returns the new value
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64 // assembly: LDAR/STDCCC loop; returns the new value
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr // assembly: branches to Xadd64
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32 // assembly: LWAR/STWCCC loop; returns the previous value
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64 // assembly: LDAR/STDCCC loop; returns the previous value
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr // assembly: branches to Xchg64
+
+//go:noescape
+func Load(ptr *uint32) uint32 // assembly: SYNC, 32-bit load, then a compare/branch/ISYNC sequence
+
+//go:noescape
+func Load64(ptr *uint64) uint64 // assembly: SYNC, 64-bit load, then a compare/branch/ISYNC sequence
+
+//go:noescape
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer // assembly: same sequence as Load64
+
+//go:noescape
+func And8(ptr *uint8, val uint8) // assembly: updates the byte inside its aligned 32-bit word
+
+//go:noescape
+func Or8(ptr *uint8, val uint8) // assembly: updates the byte inside its aligned 32-bit word
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool // assembly: LDAR/STDCCC loop; reports whether the swap happened
+
+//go:noescape
+func Store(ptr *uint32, val uint32) // assembly: SYNC followed by a 32-bit store
+
+//go:noescape
+func Store64(ptr *uint64, val uint64) // assembly: SYNC followed by a 64-bit store
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func Storep1(ptr unsafe.Pointer, val unsafe.Pointer) // assembly: branches to Store64
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.s b/src/runtime/internal/atomic/atomic_ppc64x.s
new file mode 100644
index 0000000000..1a7537ed33
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_ppc64x.s
@@ -0,0 +1,40 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
+// uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr)
+TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12
+	MOVD	ptr+0(FP), R3	// argument name matches the Go declaration Load(ptr *uint32)
+	SYNC
+	MOVWZ	0(R3), R3	// zero-extending 32-bit load; the address register is reused for the value
+	CMPW	R3, R3, CR7	// compare the loaded value with itself, then branch on that result;
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC	// NOTE(review): load/compare/branch/ISYNC looks like the usual acquire-load idiom -- confirm
+	MOVW	R3, ret+8(FP)
+	RET
+
+// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* ptr)
+TEXT ·Load64(SB),NOSPLIT|NOFRAME,$-8-16
+	MOVD	ptr+0(FP), R3	// argument name matches the Go declaration Load64(ptr *uint64)
+	SYNC
+	MOVD	0(R3), R3	// 64-bit load; the address register is reused for the value
+	CMP	R3, R3, CR7	// compare the loaded value with itself, then branch on that result;
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC	// NOTE(review): load/compare/branch/ISYNC looks like the usual acquire-load idiom -- confirm
+	MOVD	R3, ret+8(FP)
+	RET
+
+// void *runtime∕internal∕atomic·Loadp(void *volatile *ptr)
+TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$-8-16
+	MOVD	ptr+0(FP), R3	// argument name matches the Go declaration Loadp(ptr unsafe.Pointer)
+	SYNC
+	MOVD	0(R3), R3	// 64-bit load of the pointer; the address register is reused for the value
+	CMP	R3, R3, CR7	// compare the loaded value with itself, then branch on that result;
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC	// NOTE(review): load/compare/branch/ISYNC looks like the usual acquire-load idiom -- confirm
+	MOVD	R3, ret+8(FP)
+	RET
diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go
new file mode 100644
index 0000000000..e8ec788d6a
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_test.go
@@ -0,0 +1,67 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic_test
+
+import (
+	"runtime"
+	"runtime/internal/atomic"
+	"testing"
+	"unsafe"
+)
+
+func runParallel(N, iter int, f func()) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(N)) // set to N now, restore the old value on return
+	finished := make(chan bool)
+	worker := func() {
+		for remaining := iter; remaining > 0; remaining-- {
+			f()
+		}
+		finished <- true
+	}
+	for started := 0; started < N; started++ {
+		go worker()
+	}
+	for pending := N; pending > 0; pending-- {
+		<-finished
+	}
+}
+
+func TestXadduintptr(t *testing.T) {
+	const N = 20 // goroutines
+	const iter = 100000
+	inc := uintptr(100)
+	total := uintptr(0)
+	runParallel(N, iter, func() {
+		atomic.Xadduintptr(&total, inc)
+	})
+	if want := uintptr(N * iter * inc); want != total { // a lost update would leave total short
+		t.Fatalf("xadduintptr error, want %d, got %d", want, total)
+	}
+	total = 0
+	runParallel(N, iter, func() {
+		atomic.Xadduintptr(&total, inc)
+		atomic.Xadduintptr(&total, uintptr(-int64(inc))) // adding the negated value subtracts inc (wraps around)
+	})
+	if total != 0 {
+		t.Fatalf("xadduintptr total error, want %d, got %d", 0, total)
+	}
+}
+
+// Tests that Xadduintptr correctly updates 64-bit values.  The place where
+// we actually do so is mstats.go, functions mSysStat{Inc,Dec}.
+func TestXadduintptrOnUint64(t *testing.T) {
+	/*	if runtime.BigEndian != 0 {
+		// On big endian architectures, we never use xadduintptr to update
+		// 64-bit values and hence we skip the test.  (Note that functions
+		// mSysStat{Inc,Dec} in mstats.go have explicit checks for
+		// big-endianness.)
+		return
+	}*/
+	const inc = 100
+	val := uint64(0)
+	atomic.Xadduintptr((*uintptr)(unsafe.Pointer(&val)), inc) // treat the start of val as a uintptr slot
+	if inc != val {
+		t.Fatalf("xadduintptr should increase lower-order bits, want %d, got %d", inc, val)
+	}
+}
diff --git a/src/runtime/internal/atomic/stubs.go b/src/runtime/internal/atomic/stubs.go
new file mode 100644
index 0000000000..826829938d
--- /dev/null
+++ b/src/runtime/internal/atomic/stubs.go
@@ -0,0 +1,35 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Cas(ptr *uint32, old, new uint32) bool // no Go body; on ARM the sys_*_arm.s files supply it
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func Casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool // the sys_*_arm.s files implement this as a branch to Cas
+
+func nop() // call to prevent inlining of function body
+
+//go:noescape
+func Casuintptr(ptr *uintptr, old, new uintptr) bool
+
+//go:noescape
+func Storeuintptr(ptr *uintptr, new uintptr)
+
+//go:noescape
+func Loaduintptr(ptr *uintptr) uintptr
+
+//go:noescape
+func Loaduint(ptr *uint) uint
+
+// TODO(matloob): Should these functions have the go:noescape annotation?
+
+//go:noescape
+func Loadint64(ptr *int64) int64
+
+//go:noescape
+func Xaddint64(ptr *int64, delta int64) int64 // presumably returns the updated value (mgc.go tests the result after adding -1) — TODO confirm
diff --git a/src/runtime/internal/atomic/sys_darwin_arm.s b/src/runtime/internal/atomic/sys_darwin_arm.s
new file mode 100644
index 0000000000..01b7aefd1c
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_darwin_arm.s
@@ -0,0 +1,11 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)	// plain branch, not a call: armcas runs with Cas's arguments
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)	// plain branch, not a call: Cas runs with Casp1's arguments
diff --git a/src/runtime/internal/atomic/sys_freebsd_arm.s b/src/runtime/internal/atomic/sys_freebsd_arm.s
new file mode 100644
index 0000000000..30d49b8d39
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_freebsd_arm.s
@@ -0,0 +1,19 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// TODO(minux): this is only valid for ARMv6+
+// bool armcas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)	// plain branch, not a call: armcas runs with Cas's arguments
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)	// plain branch, not a call: Cas runs with Casp1's arguments
diff --git a/src/runtime/internal/atomic/sys_linux_arm.s b/src/runtime/internal/atomic/sys_linux_arm.s
new file mode 100644
index 0000000000..f8de2a2a41
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_linux_arm.s
@@ -0,0 +1,42 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// cas<> uses the kernel's compare-and-swap instead of the native armcas in
+// asm_arm.s. See ../../../sync/atomic/asm_linux_arm.s for details.
+TEXT cas<>(SB),NOSPLIT,$0
+	MOVW	$0xffff0fc0, R15 // R15 is hardware PC, so this jumps to 0xffff0fc0.
+
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	MOVW	ptr+0(FP), R2	// R2 = ptr
+	MOVW	old+4(FP), R0	// R0 = old
+loop:
+	MOVW	new+8(FP), R1	// R1 = new; reloaded on every retry
+	BL	cas<>(SB)	// call the kernel routine through cas<>
+	BCC	check	// carry clear: no swap was reported, go verify
+	MOVW	$1, R0
+	MOVB	R0, ret+12(FP)	// return true
+	RET
+check:
+	// Kernel lies; double-check.
+	MOVW	ptr+0(FP), R2
+	MOVW	old+4(FP), R0
+	MOVW	0(R2), R3	// R3 = current *ptr
+	CMP	R0, R3
+	BEQ	loop	// *ptr still equals old, so the reported failure was spurious: retry
+	MOVW	$0, R0
+	MOVB	R0, ret+12(FP)	// return false
+	RET
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)	// plain branch, not a call: Cas runs with Casp1's arguments
+
+// As for cas, memory barriers are complicated on ARM, but the kernel
+// provides a user helper. ARMv5 does not support SMP and has no
+// memory barrier instruction at all. ARMv6 added SMP support and has
+// a memory barrier, but it requires writing to a coprocessor
+// register. ARMv7 introduced the DMB instruction, but it's expensive
+// even on single-core devices. The kernel helper takes care of all of
+// this for us. (No barrier routine is defined in this file.)
\ No newline at end of file
diff --git a/src/runtime/internal/atomic/sys_nacl_arm.s b/src/runtime/internal/atomic/sys_nacl_arm.s
new file mode 100644
index 0000000000..efa960474d
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_nacl_arm.s
@@ -0,0 +1,16 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·Casp(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)	// plain branch to this package's Cas, like the other stubs in this file
+
+// This is only valid for ARMv6+, however, NaCl/ARM is only defined
+// for ARMv7A anyway.
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)	// plain branch, not a call: armcas runs with Cas's arguments
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)	// plain branch, not a call: Cas runs with Casp1's arguments
diff --git a/src/runtime/internal/atomic/sys_netbsd_arm.s b/src/runtime/internal/atomic/sys_netbsd_arm.s
new file mode 100644
index 0000000000..3277d94af6
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_netbsd_arm.s
@@ -0,0 +1,21 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// TODO(minux): this is only valid for ARMv6+
+// bool armcas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)	// plain branch, not a call: armcas runs with Cas's arguments
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)	// plain branch, not a call: Cas runs with Casp1's arguments
+
+
diff --git a/src/runtime/internal/atomic/sys_openbsd_arm.s b/src/runtime/internal/atomic/sys_openbsd_arm.s
new file mode 100644
index 0000000000..01b7aefd1c
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_openbsd_arm.s
@@ -0,0 +1,11 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)	// plain branch, not a call: armcas runs with Cas's arguments
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)	// plain branch, not a call: Cas runs with Casp1's arguments
diff --git a/src/runtime/internal/atomic/textflag.h b/src/runtime/internal/atomic/textflag.h
new file mode 100644
index 0000000000..dbf3d9977c
--- /dev/null
+++ b/src/runtime/internal/atomic/textflag.h
@@ -0,0 +1,30 @@
+// Copyright 2013 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file defines flags attached to various functions
+// and data objects.  The compilers, assemblers, and linker must
+// all agree on these values.
+
+// Don't profile the marked routine.  This flag is deprecated.
+#define NOPROF	1
+// It is ok for the linker to get multiple of these symbols.  It will
+// pick one of the duplicates to use.
+#define DUPOK	2
+// Don't insert stack check preamble.
+#define NOSPLIT	4
+// Put this data in a read-only section.
+#define RODATA	8
+// This data contains no pointers.
+#define NOPTR	16
+// This is a wrapper function and should not count as disabling 'recover'.
+#define WRAPPER 32
+// This function uses its incoming context register.
+#define NEEDCTXT 64
+// Allocate a word of thread local storage and store the offset from the
+// thread local base to the thread local storage in this variable.
+#define TLSBSS	256
+// Do not insert instructions to allocate a stack frame for this function.
+// Only valid on functions that declare a frame size of 0.
+// TODO(mwhudson): only implemented for ppc64x at present.
+#define NOFRAME 512
diff --git a/src/runtime/lfstack.go b/src/runtime/lfstack.go
index 5838c1d14d..de3999a5fb 100644
--- a/src/runtime/lfstack.go
+++ b/src/runtime/lfstack.go
@@ -7,7 +7,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 func lfstackpush(head *uint64, node *lfnode) {
 	node.pushcnt++
@@ -17,9 +20,9 @@ func lfstackpush(head *uint64, node *lfnode) {
 		throw("lfstackpush")
 	}
 	for {
-		old := atomicload64(head)
+		old := atomic.Load64(head)
 		node.next = old
-		if cas64(head, old, new) {
+		if atomic.Cas64(head, old, new) {
 			break
 		}
 	}
@@ -27,13 +30,13 @@ func lfstackpush(head *uint64, node *lfnode) {
 
 func lfstackpop(head *uint64) unsafe.Pointer {
 	for {
-		old := atomicload64(head)
+		old := atomic.Load64(head)
 		if old == 0 {
 			return nil
 		}
 		node, _ := lfstackUnpack(old)
-		next := atomicload64(&node.next)
-		if cas64(head, old, next) {
+		next := atomic.Load64(&node.next)
+		if atomic.Cas64(head, old, next) {
 			return unsafe.Pointer(node)
 		}
 	}
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index 768fd5769f..fc480290cf 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // This implementation depends on OS-specific implementations of
 //
@@ -48,7 +51,7 @@ func lock(l *mutex) {
 	gp.m.locks++
 
 	// Speculative grab for lock.
-	v := xchg(key32(&l.key), mutex_locked)
+	v := atomic.Xchg(key32(&l.key), mutex_locked)
 	if v == mutex_unlocked {
 		return
 	}
@@ -72,7 +75,7 @@ func lock(l *mutex) {
 		// Try for lock, spinning.
 		for i := 0; i < spin; i++ {
 			for l.key == mutex_unlocked {
-				if cas(key32(&l.key), mutex_unlocked, wait) {
+				if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
 					return
 				}
 			}
@@ -82,7 +85,7 @@ func lock(l *mutex) {
 		// Try for lock, rescheduling.
 		for i := 0; i < passive_spin; i++ {
 			for l.key == mutex_unlocked {
-				if cas(key32(&l.key), mutex_unlocked, wait) {
+				if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
 					return
 				}
 			}
@@ -90,7 +93,7 @@ func lock(l *mutex) {
 		}
 
 		// Sleep.
-		v = xchg(key32(&l.key), mutex_sleeping)
+		v = atomic.Xchg(key32(&l.key), mutex_sleeping)
 		if v == mutex_unlocked {
 			return
 		}
@@ -100,7 +103,7 @@ func lock(l *mutex) {
 }
 
 func unlock(l *mutex) {
-	v := xchg(key32(&l.key), mutex_unlocked)
+	v := atomic.Xchg(key32(&l.key), mutex_unlocked)
 	if v == mutex_unlocked {
 		throw("unlock of unlocked lock")
 	}
@@ -124,7 +127,7 @@ func noteclear(n *note) {
 }
 
 func notewakeup(n *note) {
-	old := xchg(key32(&n.key), 1)
+	old := atomic.Xchg(key32(&n.key), 1)
 	if old != 0 {
 		print("notewakeup - double wakeup (", old, ")\n")
 		throw("notewakeup - double wakeup")
@@ -137,7 +140,7 @@ func notesleep(n *note) {
 	if gp != gp.m.g0 {
 		throw("notesleep not on g0")
 	}
-	for atomicload(key32(&n.key)) == 0 {
+	for atomic.Load(key32(&n.key)) == 0 {
 		gp.m.blocked = true
 		futexsleep(key32(&n.key), 0, -1)
 		gp.m.blocked = false
@@ -153,7 +156,7 @@ func notetsleep_internal(n *note, ns int64) bool {
 	gp := getg()
 
 	if ns < 0 {
-		for atomicload(key32(&n.key)) == 0 {
+		for atomic.Load(key32(&n.key)) == 0 {
 			gp.m.blocked = true
 			futexsleep(key32(&n.key), 0, -1)
 			gp.m.blocked = false
@@ -161,7 +164,7 @@ func notetsleep_internal(n *note, ns int64) bool {
 		return true
 	}
 
-	if atomicload(key32(&n.key)) != 0 {
+	if atomic.Load(key32(&n.key)) != 0 {
 		return true
 	}
 
@@ -170,7 +173,7 @@ func notetsleep_internal(n *note, ns int64) bool {
 		gp.m.blocked = true
 		futexsleep(key32(&n.key), 0, ns)
 		gp.m.blocked = false
-		if atomicload(key32(&n.key)) != 0 {
+		if atomic.Load(key32(&n.key)) != 0 {
 			break
 		}
 		now := nanotime()
@@ -179,7 +182,7 @@ func notetsleep_internal(n *note, ns int64) bool {
 		}
 		ns = deadline - now
 	}
-	return atomicload(key32(&n.key)) != 0
+	return atomic.Load(key32(&n.key)) != 0
 }
 
 func notetsleep(n *note, ns int64) bool {
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index 531f1861e9..ebf786f0af 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // This implementation depends on OS-specific implementations of
 //
@@ -39,7 +42,7 @@ func lock(l *mutex) {
 	gp.m.locks++
 
 	// Speculative grab for lock.
-	if casuintptr(&l.key, 0, locked) {
+	if atomic.Casuintptr(&l.key, 0, locked) {
 		return
 	}
 	if gp.m.waitsema == 0 {
@@ -54,10 +57,10 @@ func lock(l *mutex) {
 	}
 Loop:
 	for i := 0; ; i++ {
-		v := atomicloaduintptr(&l.key)
+		v := atomic.Loaduintptr(&l.key)
 		if v&locked == 0 {
 			// Unlocked. Try to lock.
-			if casuintptr(&l.key, v, v|locked) {
+			if atomic.Casuintptr(&l.key, v, v|locked) {
 				return
 			}
 			i = 0
@@ -73,10 +76,10 @@ Loop:
 			// Queue this M.
 			for {
 				gp.m.nextwaitm = v &^ locked
-				if casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) {
+				if atomic.Casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) {
 					break
 				}
-				v = atomicloaduintptr(&l.key)
+				v = atomic.Loaduintptr(&l.key)
 				if v&locked == 0 {
 					continue Loop
 				}
@@ -96,16 +99,16 @@ func unlock(l *mutex) {
 	gp := getg()
 	var mp *m
 	for {
-		v := atomicloaduintptr(&l.key)
+		v := atomic.Loaduintptr(&l.key)
 		if v == locked {
-			if casuintptr(&l.key, locked, 0) {
+			if atomic.Casuintptr(&l.key, locked, 0) {
 				break
 			}
 		} else {
 			// Other M's are waiting for the lock.
 			// Dequeue an M.
 			mp = (*m)(unsafe.Pointer(v &^ locked))
-			if casuintptr(&l.key, v, mp.nextwaitm) {
+			if atomic.Casuintptr(&l.key, v, mp.nextwaitm) {
 				// Dequeued an M.  Wake it.
 				semawakeup(mp)
 				break
@@ -129,8 +132,8 @@ func noteclear(n *note) {
 func notewakeup(n *note) {
 	var v uintptr
 	for {
-		v = atomicloaduintptr(&n.key)
-		if casuintptr(&n.key, v, locked) {
+		v = atomic.Loaduintptr(&n.key)
+		if atomic.Casuintptr(&n.key, v, locked) {
 			break
 		}
 	}
@@ -157,7 +160,7 @@ func notesleep(n *note) {
 	if gp.m.waitsema == 0 {
 		gp.m.waitsema = semacreate()
 	}
-	if !casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
+	if !atomic.Casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
 		// Must be locked (got wakeup).
 		if n.key != locked {
 			throw("notesleep - waitm out of sync")
@@ -179,7 +182,7 @@ func notetsleep_internal(n *note, ns int64, gp *g, deadline int64) bool {
 	gp = getg()
 
 	// Register for wakeup on n->waitm.
-	if !casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
+	if !atomic.Casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
 		// Must be locked (got wakeup).
 		if n.key != locked {
 			throw("notetsleep - waitm out of sync")
@@ -218,11 +221,11 @@ func notetsleep_internal(n *note, ns int64, gp *g, deadline int64) bool {
 	// so that any notewakeup racing with the return does not
 	// try to grant us the semaphore when we don't expect it.
 	for {
-		v := atomicloaduintptr(&n.key)
+		v := atomic.Loaduintptr(&n.key)
 		switch v {
 		case uintptr(unsafe.Pointer(gp.m)):
 			// No wakeup yet; unregister if possible.
-			if casuintptr(&n.key, v, 0) {
+			if atomic.Casuintptr(&n.key, v, 0) {
 				return false
 			}
 		case locked:
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 42afdf4390..11cb6e2c6c 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -66,7 +66,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	bitPointer = 1 << 0
@@ -302,7 +305,7 @@ func (h heapBits) setMarked() {
 	// Might be racing with other updates, so use atomic update always.
 	// We used to be clever here and use a non-atomic update in certain
 	// cases, but it's not worth the risk.
-	atomicor8(h.bitp, bitMarked<<h.shift)
+	atomic.Or8(h.bitp, bitMarked<<h.shift)
 }
 
 // setMarkedNonAtomic sets the marked bit in the heap bits, non-atomically.
@@ -367,10 +370,10 @@ func (h heapBits) isCheckmarked(size uintptr) bool {
 // h must describe the initial word of the object.
 func (h heapBits) setCheckmarked(size uintptr) {
 	if size == ptrSize {
-		atomicor8(h.bitp, bitPointer<<h.shift)
+		atomic.Or8(h.bitp, bitPointer<<h.shift)
 		return
 	}
-	atomicor8(h.bitp, bitMarked<<(heapBitsShift+h.shift))
+	atomic.Or8(h.bitp, bitMarked<<(heapBitsShift+h.shift))
 }
 
 // heapBitsBulkBarrier executes writebarrierptr_nostore
@@ -724,14 +727,14 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 				if gcphase == _GCoff {
 					*h.bitp |= bitPointer << h.shift
 				} else {
-					atomicor8(h.bitp, bitPointer<<h.shift)
+					atomic.Or8(h.bitp, bitPointer<<h.shift)
 				}
 			} else {
 				// 2-element slice of pointer.
 				if gcphase == _GCoff {
 					*h.bitp |= (bitPointer | bitPointer<<heapBitsShift) << h.shift
 				} else {
-					atomicor8(h.bitp, (bitPointer|bitPointer<<heapBitsShift)<<h.shift)
+					atomic.Or8(h.bitp, (bitPointer|bitPointer<<heapBitsShift)<<h.shift)
 				}
 			}
 			return
@@ -748,7 +751,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		if gcphase == _GCoff {
 			*h.bitp |= uint8(hb << h.shift)
 		} else {
-			atomicor8(h.bitp, uint8(hb<<h.shift))
+			atomic.Or8(h.bitp, uint8(hb<<h.shift))
 		}
 		return
 	}
@@ -960,7 +963,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		if gcphase == _GCoff {
 			*hbitp |= uint8(hb)
 		} else {
-			atomicor8(hbitp, uint8(hb))
+			atomic.Or8(hbitp, uint8(hb))
 		}
 		hbitp = subtract1(hbitp)
 		if w += 2; w >= nw {
@@ -1080,8 +1083,8 @@ Phase3:
 		if gcphase == _GCoff {
 			*hbitp = *hbitp&^(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift) | uint8(hb)
 		} else {
-			atomicand8(hbitp, ^uint8(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift))
-			atomicor8(hbitp, uint8(hb))
+			atomic.And8(hbitp, ^uint8(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift))
+			atomic.Or8(hbitp, uint8(hb))
 		}
 	}
 
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index 22c13e0568..1e1f6fd13d 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -12,6 +12,8 @@
 
 package runtime
 
+import "runtime/internal/atomic"
+
 // Central list of free objects of a given size.
 type mcentral struct {
 	lock      mutex
@@ -37,7 +39,7 @@ func mCentral_CacheSpan(c *mcentral) *mspan {
 retry:
 	var s *mspan
 	for s = c.nonempty.first; s != nil; s = s.next {
-		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
 			mSpanList_Remove(&c.nonempty, s)
 			mSpanList_InsertBack(&c.empty, s)
 			unlock(&c.lock)
@@ -56,7 +58,7 @@ retry:
 	}
 
 	for s = c.empty.first; s != nil; s = s.next {
-		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
 			// we have an empty span that requires sweeping,
 			// sweep it and see if we can free some space in it
 			mSpanList_Remove(&c.empty, s)
@@ -148,7 +150,7 @@ func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start gclinkptr, end gcli
 		if !mSpan_InList(s) {
 			throw("can't preserve unlinked span")
 		}
-		atomicstore(&s.sweepgen, mheap_.sweepgen)
+		atomic.Store(&s.sweepgen, mheap_.sweepgen)
 		return false
 	}
 
@@ -164,7 +166,7 @@ func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start gclinkptr, end gcli
 	// the span may be used in an MCache, so it must come after the
 	// linked list operations above (actually, just after the
 	// lock of c above.)
-	atomicstore(&s.sweepgen, mheap_.sweepgen)
+	atomic.Store(&s.sweepgen, mheap_.sweepgen)
 
 	if s.ref != 0 {
 		unlock(&c.lock)
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index 92b3cf523c..8f0f31e7e8 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 type finblock struct {
 	alllink *finblock
@@ -131,7 +134,7 @@ var (
 
 func createfing() {
 	// start the finalizer goroutine exactly once
-	if fingCreate == 0 && cas(&fingCreate, 0, 1) {
+	if fingCreate == 0 && atomic.Cas(&fingCreate, 0, 1) {
 		go runfinq()
 	}
 }
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index b987bcc833..abebe10660 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -120,7 +120,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	_DebugGC         = 0
@@ -236,7 +239,7 @@ const (
 
 //go:nosplit
 func setGCPhase(x uint32) {
-	atomicstore(&gcphase, x)
+	atomic.Store(&gcphase, x)
 	writeBarrierEnabled = gcphase == _GCmark || gcphase == _GCmarktermination
 }
 
@@ -632,11 +635,11 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
 
 	decIfPositive := func(ptr *int64) bool {
 		if *ptr > 0 {
-			if xaddint64(ptr, -1) >= 0 {
+			if atomic.Xaddint64(ptr, -1) >= 0 {
 				return true
 			}
 			// We lost a race
-			xaddint64(ptr, +1)
+			atomic.Xaddint64(ptr, +1)
 		}
 		return false
 	}
@@ -690,7 +693,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
 		timeUsed := c.fractionalMarkTime + gcForcePreemptNS
 		if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal {
 			// Nope, we'd overshoot the utilization goal
-			xaddint64(&c.fractionalMarkWorkersNeeded, +1)
+			atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1)
 			return nil
 		}
 		_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
@@ -983,7 +986,7 @@ func gcStart(mode gcMode, forceTrigger bool) {
 		// black invariant. Enable mutator assists to
 		// put back-pressure on fast allocating
 		// mutators.
-		atomicstore(&gcBlackenEnabled, 1)
+		atomic.Store(&gcBlackenEnabled, 1)
 
 		// Assists and workers can start the moment we start
 		// the world.
@@ -1031,8 +1034,8 @@ func gcMarkDone() {
 	//
 	// TODO(austin): Should dedicated workers keep an eye on this
 	// and exit gcDrain promptly?
-	xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
-	xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff)
+	atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
+	atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff)
 
 	if !gcBlackenPromptly {
 		// Transition from mark 1 to mark 2.
@@ -1049,14 +1052,14 @@ func gcMarkDone() {
 
 		// Prevent completion of mark 2 until we've flushed
 		// cached workbufs.
-		xadd(&work.nwait, -1)
+		atomic.Xadd(&work.nwait, -1)
 
 		// Rescan global data and BSS. There may still work
 		// workers running at this point, so bump "jobs" down
 		// before "next" so they won't try running root jobs
 		// until we set next.
-		atomicstore(&work.markrootJobs, uint32(fixedRootCount+work.nDataRoots+work.nBSSRoots))
-		atomicstore(&work.markrootNext, fixedRootCount)
+		atomic.Store(&work.markrootJobs, uint32(fixedRootCount+work.nDataRoots+work.nBSSRoots))
+		atomic.Store(&work.markrootNext, fixedRootCount)
 
 		// GC is set up for mark 2. Let Gs blocked on the
 		// transition lock go while we flush caches.
@@ -1075,10 +1078,10 @@ func gcMarkDone() {
 		})
 
 		// Now we can start up mark 2 workers.
-		xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
-		xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff)
+		atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
+		atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff)
 
-		incnwait := xadd(&work.nwait, +1)
+		incnwait := atomic.Xadd(&work.nwait, +1)
 		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
 			// This recursion is safe because the call
 			// can't take this same "if" branch.
@@ -1122,7 +1125,7 @@ func gcMarkDone() {
 func gcMarkTermination() {
 	// World is stopped.
 	// Start marktermination which includes enabling the write barrier.
-	atomicstore(&gcBlackenEnabled, 0)
+	atomic.Store(&gcBlackenEnabled, 0)
 	gcBlackenPromptly = false
 	setGCPhase(_GCmarktermination)
 
@@ -1205,7 +1208,7 @@ func gcMarkTermination() {
 	now, unixNow := nanotime(), unixnanotime()
 	work.pauseNS += now - work.pauseStart
 	work.tEnd = now
-	atomicstore64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
+	atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
 	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
 	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
 	memstats.pause_total_ns += uint64(work.pauseNS)
@@ -1372,7 +1375,7 @@ func gcBgMarkWorker(p *p) {
 
 		startTime := nanotime()
 
-		decnwait := xadd(&work.nwait, -1)
+		decnwait := atomic.Xadd(&work.nwait, -1)
 		if decnwait == work.nproc {
 			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
 			throw("work.nwait was > work.nproc")
@@ -1401,18 +1404,18 @@ func gcBgMarkWorker(p *p) {
 		duration := nanotime() - startTime
 		switch p.gcMarkWorkerMode {
 		case gcMarkWorkerDedicatedMode:
-			xaddint64(&gcController.dedicatedMarkTime, duration)
-			xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
+			atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
+			atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
 		case gcMarkWorkerFractionalMode:
-			xaddint64(&gcController.fractionalMarkTime, duration)
-			xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
+			atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
+			atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
 		case gcMarkWorkerIdleMode:
-			xaddint64(&gcController.idleMarkTime, duration)
+			atomic.Xaddint64(&gcController.idleMarkTime, duration)
 		}
 
 		// Was this the last worker and did we run out
 		// of work?
-		incnwait := xadd(&work.nwait, +1)
+		incnwait := atomic.Xadd(&work.nwait, +1)
 		if incnwait > work.nproc {
 			println("runtime: p.gcMarkWorkerMode=", p.gcMarkWorkerMode,
 				"work.nwait=", incnwait, "work.nproc=", work.nproc)
@@ -1452,7 +1455,7 @@ func gcMarkWorkAvailable(p *p) bool {
 	if p != nil && !p.gcw.empty() {
 		return true
 	}
-	if atomicload64(&work.full) != 0 {
+	if atomic.Load64(&work.full) != 0 {
 		return true // global work available
 	}
 	if work.markrootNext < work.markrootJobs {
@@ -1773,7 +1776,7 @@ func gchelper() {
 	}
 
 	nproc := work.nproc // work.nproc can change right after we increment work.ndone
-	if xadd(&work.ndone, +1) == nproc-1 {
+	if atomic.Xadd(&work.ndone, +1) == nproc-1 {
 		notewakeup(&work.alldone)
 	}
 	_g_.m.traceback = 0
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 6dd9747483..f03441b2f9 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	fixedRootFinalizers = iota
@@ -61,7 +64,7 @@ func gcMarkRootPrepare() {
 	// roots they create during the concurrent phase will be
 	// scanned during mark termination. During mark termination,
 	// allglen isn't changing, so we'll scan all Gs.
-	work.nStackRoots = int(atomicloaduintptr(&allglen))
+	work.nStackRoots = int(atomic.Loaduintptr(&allglen))
 
 	work.markrootNext = 0
 	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
@@ -332,7 +335,7 @@ retry:
 	// will just cause steals to fail until credit is accumulated
 	// again, so in the long run it doesn't really matter, but we
 	// do have to handle the negative credit case.
-	bgScanCredit := atomicloadint64(&gcController.bgScanCredit)
+	bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
 	stolen := int64(0)
 	if bgScanCredit > 0 {
 		if bgScanCredit < scanWork {
@@ -342,7 +345,7 @@ retry:
 			stolen = scanWork
 			gp.gcAssistBytes += debtBytes
 		}
-		xaddint64(&gcController.bgScanCredit, -stolen)
+		atomic.Xaddint64(&gcController.bgScanCredit, -stolen)
 
 		scanWork -= stolen
 
@@ -356,7 +359,7 @@ retry:
 	// Perform assist work
 	completed := false
 	systemstack(func() {
-		if atomicload(&gcBlackenEnabled) == 0 {
+		if atomic.Load(&gcBlackenEnabled) == 0 {
 			// The gcBlackenEnabled check in malloc races with the
 			// store that clears it but an atomic check in every malloc
 			// would be a performance hit.
@@ -372,7 +375,7 @@ retry:
 		// just measure start and end time.
 		startTime := nanotime()
 
-		decnwait := xadd(&work.nwait, -1)
+		decnwait := atomic.Xadd(&work.nwait, -1)
 		if decnwait == work.nproc {
 			println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
 			throw("nwait > work.nprocs")
@@ -398,7 +401,7 @@ retry:
 
 		// If this is the last worker and we ran out of work,
 		// signal a completion point.
-		incnwait := xadd(&work.nwait, +1)
+		incnwait := atomic.Xadd(&work.nwait, +1)
 		if incnwait > work.nproc {
 			println("runtime: work.nwait=", incnwait,
 				"work.nproc=", work.nproc,
@@ -415,7 +418,7 @@ retry:
 		_p_ := gp.m.p.ptr()
 		_p_.gcAssistTime += duration
 		if _p_.gcAssistTime > gcAssistTimeSlack {
-			xaddint64(&gcController.assistTime, _p_.gcAssistTime)
+			atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
 			_p_.gcAssistTime = 0
 		}
 	})
@@ -452,7 +455,7 @@ retry:
 		// likely path if we completed above. We do this
 		// under the lock to prevent a GC cycle from ending
 		// between this check and queuing the assist.
-		if atomicload(&gcBlackenEnabled) == 0 {
+		if atomic.Load(&gcBlackenEnabled) == 0 {
 			unlock(&work.assistQueue.lock)
 			return
 		}
@@ -469,7 +472,7 @@ retry:
 		// the queue, but can still back out. This avoids a
 		// race in case background marking has flushed more
 		// credit since we checked above.
-		if atomicloadint64(&gcController.bgScanCredit) > 0 {
+		if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
 			work.assistQueue.head = oldHead
 			work.assistQueue.tail = oldTail
 			if oldTail != 0 {
@@ -506,7 +509,7 @@ func gcFlushBgCredit(scanWork int64) {
 		// small window here where an assist may add itself to
 		// the blocked queue and park. If that happens, we'll
 		// just get it on the next flush.
-		xaddint64(&gcController.bgScanCredit, scanWork)
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
 		return
 	}
 
@@ -553,7 +556,7 @@ func gcFlushBgCredit(scanWork int64) {
 	if scanBytes > 0 {
 		// Convert from scan bytes back to work.
 		scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte)
-		xaddint64(&gcController.bgScanCredit, scanWork)
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
 	}
 	unlock(&work.assistQueue.lock)
 }
@@ -788,7 +791,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
 	// Drain root marking jobs.
 	if work.markrootNext < work.markrootJobs {
 		for blocking || !gp.preempt {
-			job := xadd(&work.markrootNext, +1) - 1
+			job := atomic.Xadd(&work.markrootNext, +1) - 1
 			if job >= work.markrootJobs {
 				break
 			}
@@ -828,7 +831,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
 		// account if we've accumulated enough locally so
 		// mutator assists can draw on it.
 		if gcw.scanWork >= gcCreditSlack {
-			xaddint64(&gcController.scanWork, gcw.scanWork)
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
 			if flushBgCredit {
 				gcFlushBgCredit(gcw.scanWork - initScanWork)
 				initScanWork = 0
@@ -839,7 +842,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
 
 	// Flush remaining scan work credit.
 	if gcw.scanWork > 0 {
-		xaddint64(&gcController.scanWork, gcw.scanWork)
+		atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
 		if flushBgCredit {
 			gcFlushBgCredit(gcw.scanWork - initScanWork)
 		}
@@ -877,7 +880,7 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
 
 		// Flush background scan work credit.
 		if gcw.scanWork >= gcCreditSlack {
-			xaddint64(&gcController.scanWork, gcw.scanWork)
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
 			workFlushed += gcw.scanWork
 			gcw.scanWork = 0
 		}
@@ -1114,7 +1117,7 @@ func gcmarknewobject_m(obj, size uintptr) {
 		throw("gcmarknewobject called while doing checkmark")
 	}
 	heapBitsForAddr(obj).setMarked()
-	xadd64(&work.bytesMarked, int64(size))
+	atomic.Xadd64(&work.bytesMarked, int64(size))
 }
 
 // Checkmarking
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index ace5618213..fc92301857 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 var sweep sweepdata
 
@@ -87,7 +90,7 @@ func sweepone() uintptr {
 	_g_.m.locks++
 	sg := mheap_.sweepgen
 	for {
-		idx := xadd(&sweep.spanidx, 1) - 1
+		idx := atomic.Xadd(&sweep.spanidx, 1) - 1
 		if idx >= uint32(len(work.spans)) {
 			mheap_.sweepdone = 1
 			_g_.m.locks--
@@ -98,7 +101,7 @@ func sweepone() uintptr {
 			s.sweepgen = sg
 			continue
 		}
-		if s.sweepgen != sg-2 || !cas(&s.sweepgen, sg-2, sg-1) {
+		if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
 			continue
 		}
 		npages := s.npages
@@ -136,16 +139,16 @@ func mSpan_EnsureSwept(s *mspan) {
 	}
 
 	sg := mheap_.sweepgen
-	if atomicload(&s.sweepgen) == sg {
+	if atomic.Load(&s.sweepgen) == sg {
 		return
 	}
 	// The caller must be sure that the span is a MSpanInUse span.
-	if cas(&s.sweepgen, sg-2, sg-1) {
+	if atomic.Cas(&s.sweepgen, sg-2, sg-1) {
 		mSpan_Sweep(s, false)
 		return
 	}
 	// unfortunate condition, and we don't have efficient means to wait
-	for atomicload(&s.sweepgen) != sg {
+	for atomic.Load(&s.sweepgen) != sg {
 		osyield()
 	}
 }
@@ -173,7 +176,7 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
 		traceGCSweepStart()
 	}
 
-	xadd64(&mheap_.pagesSwept, int64(s.npages))
+	atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
 
 	cl := s.sizeclass
 	size := s.elemsize
@@ -305,7 +308,7 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
 			print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
 			throw("MSpan_Sweep: bad span state after sweep")
 		}
-		atomicstore(&s.sweepgen, sweepgen)
+		atomic.Store(&s.sweepgen, sweepgen)
 	}
 	if nfree > 0 {
 		c.local_nsmallfree[cl] += uintptr(nfree)
@@ -364,11 +367,11 @@ func deductSweepCredit(spanBytes uintptr, callerSweepPages uintptr) {
 	}
 
 	// Account for this span allocation.
-	spanBytesAlloc := xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
+	spanBytesAlloc := atomic.Xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
 
 	// Fix debt if necessary.
 	pagesOwed := int64(mheap_.sweepPagesPerByte * float64(spanBytesAlloc))
-	for pagesOwed-int64(atomicload64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
+	for pagesOwed-int64(atomic.Load64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
 		if gosweepone() == ^uintptr(0) {
 			mheap_.sweepPagesPerByte = 0
 			break
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 1d66200bec..3654778c94 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	_Debugwbufs  = false // if true check wbufs consistency
@@ -218,11 +221,11 @@ func (w *gcWork) dispose() {
 		// atomic becomes a problem, we should first try to
 		// dispose less and if necessary aggregate in a per-P
 		// counter.
-		xadd64(&work.bytesMarked, int64(w.bytesMarked))
+		atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked))
 		w.bytesMarked = 0
 	}
 	if w.scanWork != 0 {
-		xaddint64(&gcController.scanWork, w.scanWork)
+		atomic.Xaddint64(&gcController.scanWork, w.scanWork)
 		w.scanWork = 0
 	}
 }
@@ -404,14 +407,14 @@ func getfull(entry int) *workbuf {
 		return b
 	}
 
-	incnwait := xadd(&work.nwait, +1)
+	incnwait := atomic.Xadd(&work.nwait, +1)
 	if incnwait > work.nproc {
 		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
 		throw("work.nwait > work.nproc")
 	}
 	for i := 0; ; i++ {
 		if work.full != 0 {
-			decnwait := xadd(&work.nwait, -1)
+			decnwait := atomic.Xadd(&work.nwait, -1)
 			if decnwait == work.nproc {
 				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
 				throw("work.nwait > work.nproc")
@@ -422,7 +425,7 @@ func getfull(entry int) *workbuf {
 				b.checknonempty()
 				return b
 			}
-			incnwait := xadd(&work.nwait, +1)
+			incnwait := atomic.Xadd(&work.nwait, +1)
 			if incnwait > work.nproc {
 				println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
 				throw("work.nwait > work.nproc")
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 907c27b3a6..359e62fc01 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -8,7 +8,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Main malloc heap.
 // The heap itself is the "free[]" and "large" arrays,
@@ -334,7 +337,7 @@ func mHeap_ReclaimList(h *mheap, list *mSpanList, npages uintptr) uintptr {
 	sg := mheap_.sweepgen
 retry:
 	for s := list.first; s != nil; s = s.next {
-		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
 			mSpanList_Remove(list, s)
 			// swept spans are at the end of the list
 			mSpanList_InsertBack(list, s)
@@ -436,7 +439,7 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan
 	if s != nil {
 		// Record span info, because gc needs to be
 		// able to map interior pointer to containing span.
-		atomicstore(&s.sweepgen, h.sweepgen)
+		atomic.Store(&s.sweepgen, h.sweepgen)
 		s.state = _MSpanInUse
 		s.freelist = 0
 		s.ref = 0
@@ -667,7 +670,7 @@ func mHeap_Grow(h *mheap, npage uintptr) bool {
 	for i := p; i < p+s.npages; i++ {
 		h_spans[i] = s
 	}
-	atomicstore(&s.sweepgen, h.sweepgen)
+	atomic.Store(&s.sweepgen, h.sweepgen)
 	s.state = _MSpanInUse
 	h.pagesInUse += uint64(npage)
 	mHeap_FreeSpanLocked(h, s, false, true, 0)
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index d4ffb3eafd..ff4b9e9103 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -8,6 +8,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -280,14 +281,14 @@ func SetBlockProfileRate(rate int) {
 		}
 	}
 
-	atomicstore64(&blockprofilerate, uint64(r))
+	atomic.Store64(&blockprofilerate, uint64(r))
 }
 
 func blockevent(cycles int64, skip int) {
 	if cycles <= 0 {
 		cycles = 1
 	}
-	rate := int64(atomicload64(&blockprofilerate))
+	rate := int64(atomic.Load64(&blockprofilerate))
 	if rate <= 0 || (rate > cycles && int64(fastrand1())%rate > cycles) {
 		return
 	}
@@ -488,7 +489,7 @@ func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
 // Most clients should use the runtime/pprof package instead
 // of calling ThreadCreateProfile directly.
 func ThreadCreateProfile(p []StackRecord) (n int, ok bool) {
-	first := (*m)(atomicloadp(unsafe.Pointer(&allm)))
+	first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
 	for mp := first; mp != nil; mp = mp.alllink {
 		n++
 	}
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 08b82e021a..6e1116bc59 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Statistics.
 // If you edit this structure, also edit type MemStats below.
@@ -367,10 +370,10 @@ func purgecachedstats(c *mcache) {
 //go:nosplit
 func mSysStatInc(sysStat *uint64, n uintptr) {
 	if _BigEndian != 0 {
-		xadd64(sysStat, int64(n))
+		atomic.Xadd64(sysStat, int64(n))
 		return
 	}
-	if val := xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n {
+	if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n {
 		print("runtime: stat overflow: val ", val, ", n ", n, "\n")
 		exit(2)
 	}
@@ -381,10 +384,10 @@ func mSysStatInc(sysStat *uint64, n uintptr) {
 //go:nosplit
 func mSysStatDec(sysStat *uint64, n uintptr) {
 	if _BigEndian != 0 {
-		xadd64(sysStat, -int64(n))
+		atomic.Xadd64(sysStat, -int64(n))
 		return
 	}
-	if val := xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n {
+	if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n {
 		print("runtime: stat underflow: val ", val, ", n ", n, "\n")
 		exit(2)
 	}
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index 7c6e3fa93e..19adeff787 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Integrated network poller (platform-independent part).
 // A particular implementation (epoll/kqueue) must define the following functions:
@@ -77,11 +80,11 @@ var (
 //go:linkname net_runtime_pollServerInit net.runtime_pollServerInit
 func net_runtime_pollServerInit() {
 	netpollinit()
-	atomicstore(&netpollInited, 1)
+	atomic.Store(&netpollInited, 1)
 }
 
 func netpollinited() bool {
-	return atomicload(&netpollInited) != 0
+	return atomic.Load(&netpollInited) != 0
 }
 
 //go:linkname net_runtime_pollOpen net.runtime_pollOpen
@@ -305,7 +308,7 @@ func netpollcheckerr(pd *pollDesc, mode int32) int {
 }
 
 func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
-	return casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
+	return atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
 }
 
 // returns true if IO is ready, or false if timedout or closed
@@ -326,7 +329,7 @@ func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
 		if old != 0 {
 			throw("netpollblock: double wait")
 		}
-		if casuintptr(gpp, 0, pdWait) {
+		if atomic.Casuintptr(gpp, 0, pdWait) {
 			break
 		}
 	}
@@ -338,7 +341,7 @@ func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
 		gopark(netpollblockcommit, unsafe.Pointer(gpp), "IO wait", traceEvGoBlockNet, 5)
 	}
 	// be careful to not lose concurrent READY notification
-	old := xchguintptr(gpp, 0)
+	old := atomic.Xchguintptr(gpp, 0)
 	if old > pdWait {
 		throw("netpollblock: corrupted state")
 	}
@@ -365,7 +368,7 @@ func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g {
 		if ioready {
 			new = pdReady
 		}
-		if casuintptr(gpp, old, new) {
+		if atomic.Casuintptr(gpp, old, new) {
 			if old == pdReady || old == pdWait {
 				old = 0
 			}
diff --git a/src/runtime/os1_netbsd.go b/src/runtime/os1_netbsd.go
index cacd60620b..6e6a77f799 100644
--- a/src/runtime/os1_netbsd.go
+++ b/src/runtime/os1_netbsd.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	_ESRCH     = 3
@@ -57,9 +60,9 @@ func semasleep(ns int64) int32 {
 	}
 
 	for {
-		v := atomicload(&_g_.m.waitsemacount)
+		v := atomic.Load(&_g_.m.waitsemacount)
 		if v > 0 {
-			if cas(&_g_.m.waitsemacount, v, v-1) {
+			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
 				return 0 // semaphore acquired
 			}
 			continue
@@ -75,7 +78,7 @@ func semasleep(ns int64) int32 {
 
 //go:nosplit
 func semawakeup(mp *m) {
-	xadd(&mp.waitsemacount, 1)
+	atomic.Xadd(&mp.waitsemacount, 1)
 	// From NetBSD's _lwp_unpark(2) manual:
 	// "If the target LWP is not currently waiting, it will return
 	// immediately upon the next call to _lwp_park()."
diff --git a/src/runtime/os1_openbsd.go b/src/runtime/os1_openbsd.go
index 24a095b9d6..2eb6e8bedd 100644
--- a/src/runtime/os1_openbsd.go
+++ b/src/runtime/os1_openbsd.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	_ESRCH       = 3
@@ -64,9 +67,9 @@ func semasleep(ns int64) int32 {
 	}
 
 	for {
-		v := atomicload(&_g_.m.waitsemacount)
+		v := atomic.Load(&_g_.m.waitsemacount)
 		if v > 0 {
-			if cas(&_g_.m.waitsemacount, v, v-1) {
+			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
 				return 0 // semaphore acquired
 			}
 			continue
@@ -88,7 +91,7 @@ func semasleep(ns int64) int32 {
 
 //go:nosplit
 func semawakeup(mp *m) {
-	xadd(&mp.waitsemacount, 1)
+	atomic.Xadd(&mp.waitsemacount, 1)
 	ret := thrwakeup(uintptr(unsafe.Pointer(&mp.waitsemacount)), 1)
 	if ret != 0 && ret != _ESRCH {
 		// semawakeup can be called on signal stack.
diff --git a/src/runtime/os1_plan9.go b/src/runtime/os1_plan9.go
index 43ebfa30a4..07ad498fbc 100644
--- a/src/runtime/os1_plan9.go
+++ b/src/runtime/os1_plan9.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Called to initialize a new m (including the bootstrap m).
 // Called on the parent thread (main thread in case of bootstrap), can allocate memory.
@@ -143,7 +146,7 @@ func goexitsall(status *byte) {
 	n := copy(buf[:], goexits)
 	n = copy(buf[n:], gostringnocopy(status))
 	pid := getpid()
-	for mp := (*m)(atomicloadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+	for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
 		if mp.procid != pid {
 			postnote(mp.procid, buf[:])
 		}
diff --git a/src/runtime/os1_windows.go b/src/runtime/os1_windows.go
index 99c6df4008..feece9fcc2 100644
--- a/src/runtime/os1_windows.go
+++ b/src/runtime/os1_windows.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -311,7 +312,7 @@ func msigsave(mp *m) {
 func minit() {
 	var thandle uintptr
 	stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS)
-	atomicstoreuintptr(&getg().m.thread, thandle)
+	atomic.Storeuintptr(&getg().m.thread, thandle)
 }
 
 // Called from dropm to undo the effect of an minit.
@@ -510,9 +511,9 @@ func profileloop1(param uintptr) uint32 {
 
 	for {
 		stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
-		first := (*m)(atomicloadp(unsafe.Pointer(&allm)))
+		first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
 		for mp := first; mp != nil; mp = mp.alllink {
-			thread := atomicloaduintptr(&mp.thread)
+			thread := atomic.Loaduintptr(&mp.thread)
 			// Do not profile threads blocked on Notes,
 			// this includes idle worker threads,
 			// idle timer thread, idle heap scavenger, etc.
@@ -534,7 +535,7 @@ func resetcpuprofiler(hz int32) {
 	lock(&cpuprofilerlock)
 	if profiletimer == 0 {
 		timer := stdcall3(_CreateWaitableTimerA, 0, 0, 0)
-		atomicstoreuintptr(&profiletimer, timer)
+		atomic.Storeuintptr(&profiletimer, timer)
 		thread := stdcall6(_CreateThread, 0, 0, funcPC(profileloop), 0, 0, 0)
 		stdcall2(_SetThreadPriority, thread, _THREAD_PRIORITY_HIGHEST)
 		stdcall1(_CloseHandle, thread)
@@ -551,7 +552,7 @@ func resetcpuprofiler(hz int32) {
 		due = int64(ms) * -10000
 	}
 	stdcall6(_SetWaitableTimer, profiletimer, uintptr(unsafe.Pointer(&due)), uintptr(ms), 0, 0, 0)
-	atomicstore((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
+	atomic.Store((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
 }
 
 func memlimit() uintptr {
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 8d858e851c..ba07330e35 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 var indexError = error(errorString("index out of range"))
 
@@ -569,7 +572,7 @@ func startpanic_m() {
 	case 0:
 		_g_.m.dying = 1
 		_g_.writebuf = nil
-		xadd(&panicking, 1)
+		atomic.Xadd(&panicking, 1)
 		lock(&paniclk)
 		if debug.schedtrace > 0 || debug.scheddetail > 0 {
 			schedtrace(true)
@@ -626,7 +629,7 @@ func dopanic_m(gp *g, pc, sp uintptr) {
 	}
 	unlock(&paniclk)
 
-	if xadd(&panicking, -1) != 0 {
+	if atomic.Xadd(&panicking, -1) != 0 {
 		// Some other m is panicking too.
 		// Let it print what it needs to print.
 		// Wait forever without chewing up cpu.
diff --git a/src/runtime/parfor.go b/src/runtime/parfor.go
index c82beee3fd..2db43bd424 100644
--- a/src/runtime/parfor.go
+++ b/src/runtime/parfor.go
@@ -6,6 +6,8 @@
 
 package runtime
 
+import "runtime/internal/atomic"
+
 // A parfor holds state for the parallel for operation.
 type parfor struct {
 	body   func(*parfor, uint32) // executed for each element
@@ -82,7 +84,7 @@ func parforsetup(desc *parfor, nthr, n uint32, wait bool, body func(*parfor, uin
 
 func parfordo(desc *parfor) {
 	// Obtain 0-based thread index.
-	tid := xadd(&desc.thrseq, 1) - 1
+	tid := atomic.Xadd(&desc.thrseq, 1) - 1
 	if tid >= desc.nthr {
 		print("tid=", tid, " nthr=", desc.nthr, "\n")
 		throw("parfor: invalid tid")
@@ -103,7 +105,7 @@ func parfordo(desc *parfor) {
 		for {
 			// While there is local work,
 			// bump low index and execute the iteration.
-			pos := xadd64(mypos, 1)
+			pos := atomic.Xadd64(mypos, 1)
 			begin := uint32(pos) - 1
 			end := uint32(pos >> 32)
 			if begin < end {
@@ -120,7 +122,7 @@ func parfordo(desc *parfor) {
 			// increment the done counter...
 			if try > desc.nthr*4 && !idle {
 				idle = true
-				xadd(&desc.done, 1)
+				atomic.Xadd(&desc.done, 1)
 			}
 
 			// ...if all threads have incremented the counter,
@@ -131,7 +133,7 @@ func parfordo(desc *parfor) {
 			}
 			if desc.done+extra == desc.nthr {
 				if !idle {
-					xadd(&desc.done, 1)
+					atomic.Xadd(&desc.done, 1)
 				}
 				goto exit
 			}
@@ -145,7 +147,7 @@ func parfordo(desc *parfor) {
 			victimpos := &desc.thr[victim].pos
 			for {
 				// See if it has any work.
-				pos := atomicload64(victimpos)
+				pos := atomic.Load64(victimpos)
 				begin = uint32(pos)
 				end = uint32(pos >> 32)
 				if begin+1 >= end {
@@ -154,12 +156,12 @@ func parfordo(desc *parfor) {
 					break
 				}
 				if idle {
-					xadd(&desc.done, -1)
+					atomic.Xadd(&desc.done, -1)
 					idle = false
 				}
 				begin2 := begin + (end-begin)/2
 				newpos := uint64(begin) | uint64(begin2)<<32
-				if cas64(victimpos, pos, newpos) {
+				if atomic.Cas64(victimpos, pos, newpos) {
 					begin = begin2
 					break
 				}
@@ -169,7 +171,7 @@ func parfordo(desc *parfor) {
 				if idle {
 					throw("parfor: should not be idle")
 				}
-				atomicstore64(mypos, uint64(begin)|uint64(end)<<32)
+				atomic.Store64(mypos, uint64(begin)|uint64(end)<<32)
 				me.nsteal++
 				me.nstealcnt += uint64(end) - uint64(begin)
 				break
@@ -185,7 +187,7 @@ func parfordo(desc *parfor) {
 				// If a caller asked not to wait for the others, exit now
 				// (assume that most work is already done at this point).
 				if !idle {
-					xadd(&desc.done, 1)
+					atomic.Xadd(&desc.done, 1)
 				}
 				goto exit
 			} else if try < 6*desc.nthr {
@@ -199,11 +201,11 @@ func parfordo(desc *parfor) {
 	}
 
 exit:
-	xadd64(&desc.nsteal, int64(me.nsteal))
-	xadd64(&desc.nstealcnt, int64(me.nstealcnt))
-	xadd64(&desc.nprocyield, int64(me.nprocyield))
-	xadd64(&desc.nosyield, int64(me.nosyield))
-	xadd64(&desc.nsleep, int64(me.nsleep))
+	atomic.Xadd64(&desc.nsteal, int64(me.nsteal))
+	atomic.Xadd64(&desc.nstealcnt, int64(me.nstealcnt))
+	atomic.Xadd64(&desc.nprocyield, int64(me.nprocyield))
+	atomic.Xadd64(&desc.nosyield, int64(me.nosyield))
+	atomic.Xadd64(&desc.nsleep, int64(me.nsleep))
 	me.nsteal = 0
 	me.nstealcnt = 0
 	me.nprocyield = 0
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index cc2134fc1b..4dba0cabe9 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Goroutine scheduler
 // The scheduler's job is to distribute ready-to-run goroutines over worker threads.
@@ -165,7 +168,7 @@ func forcegchelper() {
 		if forcegc.idle != 0 {
 			throw("forcegc: phase error")
 		}
-		atomicstore(&forcegc.idle, 1)
+		atomic.Store(&forcegc.idle, 1)
 		goparkunlock(&forcegc.lock, "force gc (idle)", traceEvGoBlock, 1)
 		// this goroutine is explicitly resumed by sysmon
 		if debug.gctrace > 0 {
@@ -462,11 +465,11 @@ func ready(gp *g, traceskip int) {
 	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
 	casgstatus(gp, _Gwaiting, _Grunnable)
 	runqput(_g_.m.p.ptr(), gp, true)
-	if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 { // TODO: fast atomic
+	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 { // TODO: fast atomic
 		wakep()
 	}
 	_g_.m.locks--
-	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
 		_g_.stackguard0 = stackPreempt
 	}
 }
@@ -538,7 +541,7 @@ func freezetheworld() {
 	for i := 0; i < 5; i++ {
 		// this should tell the scheduler to not start any new goroutines
 		sched.stopwait = freezeStopWait
-		atomicstore(&sched.gcwaiting, 1)
+		atomic.Store(&sched.gcwaiting, 1)
 		// this should stop running goroutines
 		if !preemptall() {
 			break // no running goroutines
@@ -562,7 +565,7 @@ func isscanstatus(status uint32) bool {
 // castogscanstatus, casfrom_Gscanstatus.
 //go:nosplit
 func readgstatus(gp *g) uint32 {
-	return atomicload(&gp.atomicstatus)
+	return atomic.Load(&gp.atomicstatus)
 }
 
 // Ownership of gscanvalid:
@@ -595,11 +598,11 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
 		_Gscanrunning,
 		_Gscansyscall:
 		if newval == oldval&^_Gscan {
-			success = cas(&gp.atomicstatus, oldval, newval)
+			success = atomic.Cas(&gp.atomicstatus, oldval, newval)
 		}
 	case _Gscanenqueue:
 		if newval == _Gwaiting {
-			success = cas(&gp.atomicstatus, oldval, newval)
+			success = atomic.Cas(&gp.atomicstatus, oldval, newval)
 		}
 	}
 	if !success {
@@ -620,11 +623,11 @@ func castogscanstatus(gp *g, oldval, newval uint32) bool {
 		_Gwaiting,
 		_Gsyscall:
 		if newval == oldval|_Gscan {
-			return cas(&gp.atomicstatus, oldval, newval)
+			return atomic.Cas(&gp.atomicstatus, oldval, newval)
 		}
 	case _Grunning:
 		if newval == _Gscanrunning || newval == _Gscanenqueue {
-			return cas(&gp.atomicstatus, oldval, newval)
+			return atomic.Cas(&gp.atomicstatus, oldval, newval)
 		}
 	}
 	print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n")
@@ -656,7 +659,7 @@ func casgstatus(gp *g, oldval, newval uint32) {
 
 	// loop if gp->atomicstatus is in a scan state giving
 	// GC time to finish and change the state to oldval.
-	for !cas(&gp.atomicstatus, oldval, newval) {
+	for !atomic.Cas(&gp.atomicstatus, oldval, newval) {
 		if oldval == _Gwaiting && gp.atomicstatus == _Grunnable {
 			systemstack(func() {
 				throw("casgstatus: waiting for Gwaiting but is Grunnable")
@@ -687,7 +690,7 @@ func casgcopystack(gp *g) uint32 {
 		if oldstatus != _Gwaiting && oldstatus != _Grunnable {
 			throw("copystack: bad status, not Gwaiting or Grunnable")
 		}
-		if cas(&gp.atomicstatus, oldstatus, _Gcopystack) {
+		if atomic.Cas(&gp.atomicstatus, oldstatus, _Gcopystack) {
 			return oldstatus
 		}
 	}
@@ -733,11 +736,11 @@ func scang(gp *g) {
 				if !gp.gcscandone {
 					// Coordinate with traceback
 					// in sigprof.
-					for !cas(&gp.stackLock, 0, 1) {
+					for !atomic.Cas(&gp.stackLock, 0, 1) {
 						osyield()
 					}
 					scanstack(gp)
-					atomicstore(&gp.stackLock, 0)
+					atomic.Store(&gp.stackLock, 0)
 					gp.gcscandone = true
 				}
 				restartg(gp)
@@ -866,7 +869,7 @@ func stopTheWorldWithSema() {
 
 	lock(&sched.lock)
 	sched.stopwait = gomaxprocs
-	atomicstore(&sched.gcwaiting, 1)
+	atomic.Store(&sched.gcwaiting, 1)
 	preemptall()
 	// stop current P
 	_g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
@@ -875,7 +878,7 @@ func stopTheWorldWithSema() {
 	for i := 0; i < int(gomaxprocs); i++ {
 		p := allp[i]
 		s := p.status
-		if s == _Psyscall && cas(&p.status, s, _Pgcstop) {
+		if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) {
 			if trace.enabled {
 				traceGoSysBlock(p)
 				traceProcStop(p)
@@ -966,7 +969,7 @@ func startTheWorldWithSema() {
 	// Wakeup an additional proc in case we have excessive runnable goroutines
 	// in local queues or in the global queue. If we don't, the proc will park itself.
 	// If we have lots of excessive work, resetspinning will unpark additional procs as necessary.
-	if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 {
+	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
 		wakep()
 	}
 
@@ -1073,7 +1076,7 @@ func forEachP(fn func(*p)) {
 	// Ask all Ps to run the safe point function.
 	for _, p := range allp[:gomaxprocs] {
 		if p != _p_ {
-			atomicstore(&p.runSafePointFn, 1)
+			atomic.Store(&p.runSafePointFn, 1)
 		}
 	}
 	preemptall()
@@ -1085,7 +1088,7 @@ func forEachP(fn func(*p)) {
 	// Run safe point function for all idle Ps. sched.pidle will
 	// not change because we hold sched.lock.
 	for p := sched.pidle.ptr(); p != nil; p = p.link.ptr() {
-		if cas(&p.runSafePointFn, 1, 0) {
+		if atomic.Cas(&p.runSafePointFn, 1, 0) {
 			fn(p)
 			sched.safePointWait--
 		}
@@ -1102,7 +1105,7 @@ func forEachP(fn func(*p)) {
 	for i := 0; i < int(gomaxprocs); i++ {
 		p := allp[i]
 		s := p.status
-		if s == _Psyscall && p.runSafePointFn == 1 && cas(&p.status, s, _Pidle) {
+		if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) {
 			if trace.enabled {
 				traceGoSysBlock(p)
 				traceProcStop(p)
@@ -1158,7 +1161,7 @@ func runSafePointFn() {
 	// Resolve the race between forEachP running the safe-point
 	// function on this P's behalf and this P running the
 	// safe-point function directly.
-	if !cas(&p.runSafePointFn, 1, 0) {
+	if !atomic.Cas(&p.runSafePointFn, 1, 0) {
 		return
 	}
 	sched.safePointFn(p)
@@ -1318,7 +1321,7 @@ func newextram() {
 	mp.locked = _LockInternal
 	mp.lockedg = gp
 	gp.lockedm = mp
-	gp.goid = int64(xadd64(&sched.goidgen, 1))
+	gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1))
 	if raceenabled {
 		gp.racectx = racegostart(funcPC(newextram))
 	}
@@ -1381,7 +1384,7 @@ func lockextra(nilokay bool) *m {
 	const locked = 1
 
 	for {
-		old := atomicloaduintptr(&extram)
+		old := atomic.Loaduintptr(&extram)
 		if old == locked {
 			yield := osyield
 			yield()
@@ -1391,7 +1394,7 @@ func lockextra(nilokay bool) *m {
 			usleep(1)
 			continue
 		}
-		if casuintptr(&extram, old, locked) {
+		if atomic.Casuintptr(&extram, old, locked) {
 			return (*m)(unsafe.Pointer(old))
 		}
 		yield := osyield
@@ -1402,7 +1405,7 @@ func lockextra(nilokay bool) *m {
 
 //go:nosplit
 func unlockextra(mp *m) {
-	atomicstoreuintptr(&extram, uintptr(unsafe.Pointer(mp)))
+	atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
 }
 
 // Create a new m.  It will start off with a call to fn, or else the scheduler.
@@ -1440,7 +1443,7 @@ func stopm() {
 	}
 	if _g_.m.spinning {
 		_g_.m.spinning = false
-		xadd(&sched.nmspinning, -1)
+		atomic.Xadd(&sched.nmspinning, -1)
 	}
 
 retry:
@@ -1466,7 +1469,7 @@ func mspinning() {
 		// Something (presumably the GC) was readied while the
 		// runtime was starting up this M, so the M is no
 		// longer spinning.
-		if int32(xadd(&sched.nmspinning, -1)) < 0 {
+		if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
 			throw("mspinning: nmspinning underflowed")
 		}
 	} else {
@@ -1485,7 +1488,7 @@ func startm(_p_ *p, spinning bool) {
 		if _p_ == nil {
 			unlock(&sched.lock)
 			if spinning {
-				xadd(&sched.nmspinning, -1)
+				atomic.Xadd(&sched.nmspinning, -1)
 			}
 			return
 		}
@@ -1525,7 +1528,7 @@ func handoffp(_p_ *p) {
 	}
 	// no local work, check that there are no spinning/idle M's,
 	// otherwise our help is not required
-	if atomicload(&sched.nmspinning)+atomicload(&sched.npidle) == 0 && cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
+	if atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) == 0 && atomic.Cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
 		startm(_p_, true)
 		return
 	}
@@ -1539,7 +1542,7 @@ func handoffp(_p_ *p) {
 		unlock(&sched.lock)
 		return
 	}
-	if _p_.runSafePointFn != 0 && cas(&_p_.runSafePointFn, 1, 0) {
+	if _p_.runSafePointFn != 0 && atomic.Cas(&_p_.runSafePointFn, 1, 0) {
 		sched.safePointFn(_p_)
 		sched.safePointWait--
 		if sched.safePointWait == 0 {
@@ -1553,7 +1556,7 @@ func handoffp(_p_ *p) {
 	}
 	// If this is the last running P and nobody is polling network,
 	// need to wakeup another M to poll network.
-	if sched.npidle == uint32(gomaxprocs-1) && atomicload64(&sched.lastpoll) != 0 {
+	if sched.npidle == uint32(gomaxprocs-1) && atomic.Load64(&sched.lastpoll) != 0 {
 		unlock(&sched.lock)
 		startm(_p_, false)
 		return
@@ -1566,7 +1569,7 @@ func handoffp(_p_ *p) {
 // Called when a G is made runnable (newproc, ready).
 func wakep() {
 	// be conservative about spinning threads
-	if !cas(&sched.nmspinning, 0, 1) {
+	if !atomic.Cas(&sched.nmspinning, 0, 1) {
 		return
 	}
 	startm(nil, true)
@@ -1630,7 +1633,7 @@ func gcstopm() {
 	}
 	if _g_.m.spinning {
 		_g_.m.spinning = false
-		xadd(&sched.nmspinning, -1)
+		atomic.Xadd(&sched.nmspinning, -1)
 	}
 	_p_ := releasep()
 	lock(&sched.lock)
@@ -1749,12 +1752,12 @@ top:
 	// If number of spinning M's >= number of busy P's, block.
 	// This is necessary to prevent excessive CPU consumption
 	// when GOMAXPROCS>>1 but the program parallelism is low.
-	if !_g_.m.spinning && 2*atomicload(&sched.nmspinning) >= uint32(gomaxprocs)-atomicload(&sched.npidle) { // TODO: fast atomic
+	if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= uint32(gomaxprocs)-atomic.Load(&sched.npidle) { // TODO: fast atomic
 		goto stop
 	}
 	if !_g_.m.spinning {
 		_g_.m.spinning = true
-		xadd(&sched.nmspinning, 1)
+		atomic.Xadd(&sched.nmspinning, 1)
 	}
 	// random steal from other P's
 	for i := 0; i < int(4*gomaxprocs); i++ {
@@ -1805,7 +1808,7 @@ stop:
 	unlock(&sched.lock)
 	if _g_.m.spinning {
 		_g_.m.spinning = false
-		xadd(&sched.nmspinning, -1)
+		atomic.Xadd(&sched.nmspinning, -1)
 	}
 
 	// check all runqueues once again
@@ -1824,7 +1827,7 @@ stop:
 	}
 
 	// poll network
-	if netpollinited() && xchg64(&sched.lastpoll, 0) != 0 {
+	if netpollinited() && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
 		if _g_.m.p != 0 {
 			throw("findrunnable: netpoll with p")
 		}
@@ -1832,7 +1835,7 @@ stop:
 			throw("findrunnable: netpoll with spinning")
 		}
 		gp := netpoll(true) // block until new work is available
-		atomicstore64(&sched.lastpoll, uint64(nanotime()))
+		atomic.Store64(&sched.lastpoll, uint64(nanotime()))
 		if gp != nil {
 			lock(&sched.lock)
 			_p_ = pidleget()
@@ -1859,17 +1862,17 @@ func resetspinning() {
 	var nmspinning uint32
 	if _g_.m.spinning {
 		_g_.m.spinning = false
-		nmspinning = xadd(&sched.nmspinning, -1)
+		nmspinning = atomic.Xadd(&sched.nmspinning, -1)
 		if int32(nmspinning) < 0 {
 			throw("findrunnable: negative nmspinning")
 		}
 	} else {
-		nmspinning = atomicload(&sched.nmspinning)
+		nmspinning = atomic.Load(&sched.nmspinning)
 	}
 
 	// M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
 	// so see if we need to wakeup another P here.
-	if nmspinning == 0 && atomicload(&sched.npidle) > 0 {
+	if nmspinning == 0 && atomic.Load(&sched.npidle) > 0 {
 		wakep()
 	}
 }
@@ -2171,7 +2174,7 @@ func reentersyscall(pc, sp uintptr) {
 		save(pc, sp)
 	}
 
-	if atomicload(&sched.sysmonwait) != 0 { // TODO: fast atomic
+	if atomic.Load(&sched.sysmonwait) != 0 { // TODO: fast atomic
 		systemstack(entersyscall_sysmon)
 		save(pc, sp)
 	}
@@ -2186,7 +2189,7 @@ func reentersyscall(pc, sp uintptr) {
 	_g_.sysblocktraced = true
 	_g_.m.mcache = nil
 	_g_.m.p.ptr().m = 0
-	atomicstore(&_g_.m.p.ptr().status, _Psyscall)
+	atomic.Store(&_g_.m.p.ptr().status, _Psyscall)
 	if sched.gcwaiting != 0 {
 		systemstack(entersyscall_gcwait)
 		save(pc, sp)
@@ -2207,8 +2210,8 @@ func entersyscall(dummy int32) {
 
 func entersyscall_sysmon() {
 	lock(&sched.lock)
-	if atomicload(&sched.sysmonwait) != 0 {
-		atomicstore(&sched.sysmonwait, 0)
+	if atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
 		notewakeup(&sched.sysmonnote)
 	}
 	unlock(&sched.lock)
@@ -2219,7 +2222,7 @@ func entersyscall_gcwait() {
 	_p_ := _g_.m.p.ptr()
 
 	lock(&sched.lock)
-	if sched.stopwait > 0 && cas(&_p_.status, _Psyscall, _Pgcstop) {
+	if sched.stopwait > 0 && atomic.Cas(&_p_.status, _Psyscall, _Pgcstop) {
 		if trace.enabled {
 			traceGoSysBlock(_p_)
 			traceProcStop(_p_)
@@ -2374,7 +2377,7 @@ func exitsyscallfast() bool {
 	}
 
 	// Try to re-acquire the last P.
-	if _g_.m.p != 0 && _g_.m.p.ptr().status == _Psyscall && cas(&_g_.m.p.ptr().status, _Psyscall, _Prunning) {
+	if _g_.m.p != 0 && _g_.m.p.ptr().status == _Psyscall && atomic.Cas(&_g_.m.p.ptr().status, _Psyscall, _Prunning) {
 		// There's a cpu for us, so we can run.
 		_g_.m.mcache = _g_.m.p.ptr().mcache
 		_g_.m.p.ptr().m.set(_g_.m)
@@ -2424,8 +2427,8 @@ func exitsyscallfast() bool {
 func exitsyscallfast_pidle() bool {
 	lock(&sched.lock)
 	_p_ := pidleget()
-	if _p_ != nil && atomicload(&sched.sysmonwait) != 0 {
-		atomicstore(&sched.sysmonwait, 0)
+	if _p_ != nil && atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
 		notewakeup(&sched.sysmonnote)
 	}
 	unlock(&sched.lock)
@@ -2447,8 +2450,8 @@ func exitsyscall0(gp *g) {
 	_p_ := pidleget()
 	if _p_ == nil {
 		globrunqput(gp)
-	} else if atomicload(&sched.sysmonwait) != 0 {
-		atomicstore(&sched.sysmonwait, 0)
+	} else if atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
 		notewakeup(&sched.sysmonnote)
 	}
 	unlock(&sched.lock)
@@ -2602,7 +2605,7 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr
 		// Sched.goidgen is the last allocated id,
 		// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
 		// At startup sched.goidgen=0, so main goroutine receives goid=1.
-		_p_.goidcache = xadd64(&sched.goidgen, _GoidCacheBatch)
+		_p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
 		_p_.goidcache -= _GoidCacheBatch - 1
 		_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
 	}
@@ -2616,7 +2619,7 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr
 	}
 	runqput(_p_, newg, true)
 
-	if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 && unsafe.Pointer(fn.fn) != unsafe.Pointer(funcPC(main)) { // TODO: fast atomic
+	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && unsafe.Pointer(fn.fn) != unsafe.Pointer(funcPC(main)) { // TODO: fast atomic
 		wakep()
 	}
 	_g_.m.locks--
@@ -2828,7 +2831,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
 	mp.mallocing++
 
 	// Coordinate with stack barrier insertion in scanstack.
-	for !cas(&gp.stackLock, 0, 1) {
+	for !atomic.Cas(&gp.stackLock, 0, 1) {
 		osyield()
 	}
 
@@ -2935,17 +2938,17 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
 			}
 		}
 	}
-	atomicstore(&gp.stackLock, 0)
+	atomic.Store(&gp.stackLock, 0)
 
 	if prof.hz != 0 {
 		// Simple cas-lock to coordinate with setcpuprofilerate.
-		for !cas(&prof.lock, 0, 1) {
+		for !atomic.Cas(&prof.lock, 0, 1) {
 			osyield()
 		}
 		if prof.hz != 0 {
 			cpuprof.add(stk[:n])
 		}
-		atomicstore(&prof.lock, 0)
+		atomic.Store(&prof.lock, 0)
 	}
 	mp.mallocing--
 }
@@ -2991,11 +2994,11 @@ func setcpuprofilerate_m(hz int32) {
 	// it would deadlock.
 	resetcpuprofiler(0)
 
-	for !cas(&prof.lock, 0, 1) {
+	for !atomic.Cas(&prof.lock, 0, 1) {
 		osyield()
 	}
 	prof.hz = hz
-	atomicstore(&prof.lock, 0)
+	atomic.Store(&prof.lock, 0)
 
 	lock(&sched.lock)
 	sched.profilehz = hz
@@ -3139,7 +3142,7 @@ func procresize(nprocs int32) *p {
 		}
 	}
 	var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
-	atomicstore((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
+	atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
 	return runnablePs
 }
 
@@ -3317,10 +3320,10 @@ func sysmon() {
 			delay = 10 * 1000
 		}
 		usleep(delay)
-		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs)) { // TODO: fast atomic
+		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { // TODO: fast atomic
 			lock(&sched.lock)
-			if atomicload(&sched.gcwaiting) != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs) {
-				atomicstore(&sched.sysmonwait, 1)
+			if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
+				atomic.Store(&sched.sysmonwait, 1)
 				unlock(&sched.lock)
 				// Make wake-up period small enough
 				// for the sampling to be correct.
@@ -3330,7 +3333,7 @@ func sysmon() {
 				}
 				notetsleep(&sched.sysmonnote, maxsleep)
 				lock(&sched.lock)
-				atomicstore(&sched.sysmonwait, 0)
+				atomic.Store(&sched.sysmonwait, 0)
 				noteclear(&sched.sysmonnote)
 				idle = 0
 				delay = 20
@@ -3338,11 +3341,11 @@ func sysmon() {
 			unlock(&sched.lock)
 		}
 		// poll network if not polled for more than 10ms
-		lastpoll := int64(atomicload64(&sched.lastpoll))
+		lastpoll := int64(atomic.Load64(&sched.lastpoll))
 		now := nanotime()
 		unixnow := unixnanotime()
 		if lastpoll != 0 && lastpoll+10*1000*1000 < now {
-			cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
+			atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
 			gp := netpoll(false) // non-blocking - returns list of goroutines
 			if gp != nil {
 				// Need to decrement number of idle locked M's
@@ -3365,8 +3368,8 @@ func sysmon() {
 			idle++
 		}
 		// check if we need to force a GC
-		lastgc := int64(atomicload64(&memstats.last_gc))
-		if lastgc != 0 && unixnow-lastgc > forcegcperiod && atomicload(&forcegc.idle) != 0 {
+		lastgc := int64(atomic.Load64(&memstats.last_gc))
+		if lastgc != 0 && unixnow-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
 			lock(&forcegc.lock)
 			forcegc.idle = 0
 			forcegc.g.schedlink = 0
@@ -3417,7 +3420,7 @@ func retake(now int64) uint32 {
 			// On the one hand we don't want to retake Ps if there is no other work to do,
 			// but on the other hand we want to retake them eventually
 			// because they can prevent the sysmon thread from deep sleep.
-			if runqempty(_p_) && atomicload(&sched.nmspinning)+atomicload(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
+			if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
 				continue
 			}
 			// Need to decrement number of idle locked M's
@@ -3425,7 +3428,7 @@ func retake(now int64) uint32 {
 			// Otherwise the M from which we retake can exit the syscall,
 			// increment nmidle and report deadlock.
 			incidlelocked(-1)
-			if cas(&_p_.status, s, _Pidle) {
+			if atomic.Cas(&_p_.status, s, _Pidle) {
 				if trace.enabled {
 					traceGoSysBlock(_p_)
 					traceProcStop(_p_)
@@ -3523,8 +3526,8 @@ func schedtrace(detailed bool) {
 			continue
 		}
 		mp := _p_.m.ptr()
-		h := atomicload(&_p_.runqhead)
-		t := atomicload(&_p_.runqtail)
+		h := atomic.Load(&_p_.runqhead)
+		t := atomic.Load(&_p_.runqtail)
 		if detailed {
 			id := int32(-1)
 			if mp != nil {
@@ -3697,7 +3700,7 @@ func pidleput(_p_ *p) {
 	}
 	_p_.link = sched.pidle
 	sched.pidle.set(_p_)
-	xadd(&sched.npidle, 1) // TODO: fast atomic
+	atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic
 }
 
 // Try get a p from _Pidle list.
@@ -3708,7 +3711,7 @@ func pidleget() *p {
 	_p_ := sched.pidle.ptr()
 	if _p_ != nil {
 		sched.pidle = _p_.link
-		xadd(&sched.npidle, -1) // TODO: fast atomic
+		atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic
 	}
 	return _p_
 }
@@ -3754,11 +3757,11 @@ func runqput(_p_ *p, gp *g, next bool) {
 	}
 
 retry:
-	h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers
+	h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers
 	t := _p_.runqtail
 	if t-h < uint32(len(_p_.runq)) {
 		_p_.runq[t%uint32(len(_p_.runq))].set(gp)
-		atomicstore(&_p_.runqtail, t+1) // store-release, makes the item available for consumption
+		atomic.Store(&_p_.runqtail, t+1) // store-release, makes the item available for consumption
 		return
 	}
 	if runqputslow(_p_, gp, h, t) {
@@ -3782,7 +3785,7 @@ func runqputslow(_p_ *p, gp *g, h, t uint32) bool {
 	for i := uint32(0); i < n; i++ {
 		batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr()
 	}
-	if !cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+	if !atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
 		return false
 	}
 	batch[n] = gp
@@ -3823,13 +3826,13 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
 	}
 
 	for {
-		h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
+		h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers
 		t := _p_.runqtail
 		if t == h {
 			return nil, false
 		}
 		gp := _p_.runq[h%uint32(len(_p_.runq))].ptr()
-		if cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume
+		if atomic.Cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume
 			return gp, false
 		}
 	}
@@ -3841,8 +3844,8 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
 // Can be executed by any P.
 func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 {
 	for {
-		h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
-		t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer
+		h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers
+		t := atomic.Load(&_p_.runqtail) // load-acquire, synchronize with the producer
 		n := t - h
 		n = n - n/2
 		if n == 0 {
@@ -3873,7 +3876,7 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
 			g := _p_.runq[(h+i)%uint32(len(_p_.runq))]
 			batch[(batchHead+i)%uint32(len(batch))] = g
 		}
-		if cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+		if atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
 			return n
 		}
 	}
@@ -3893,11 +3896,11 @@ func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
 	if n == 0 {
 		return gp
 	}
-	h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers
+	h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers
 	if t-h+n >= uint32(len(_p_.runq)) {
 		throw("runqsteal: runq overflow")
 	}
-	atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
+	atomic.Store(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
 	return gp
 }
 
diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go
index 81d3e5b3c3..0bbe42739d 100644
--- a/src/runtime/runtime.go
+++ b/src/runtime/runtime.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import _ "unsafe" // for go:linkname
+import (
+	"runtime/internal/atomic"
+	_ "unsafe" // for go:linkname
+)
 
 //go:generate go run wincallback.go
 //go:generate go run mkduff.go
@@ -20,7 +23,7 @@ var tls0 [8]uintptr // available storage for m0's TLS; not necessarily used; opa
 
 // Note: Called by runtime/pprof in addition to runtime code.
 func tickspersecond() int64 {
-	r := int64(atomicload64(&ticks.val))
+	r := int64(atomic.Load64(&ticks.val))
 	if r != 0 {
 		return r
 	}
@@ -39,7 +42,7 @@ func tickspersecond() int64 {
 		if r == 0 {
 			r++
 		}
-		atomicstore64(&ticks.val, uint64(r))
+		atomic.Store64(&ticks.val, uint64(r))
 	}
 	unlock(&ticks.lock)
 	return r
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 8878817aab..f9b11b4de1 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Keep a cached value to make gotraceback fast,
 // since we call it on every call to gentraceback.
@@ -99,36 +102,36 @@ func testAtomic64() {
 	prefetcht1(uintptr(unsafe.Pointer(&test_z64)))
 	prefetcht2(uintptr(unsafe.Pointer(&test_z64)))
 	prefetchnta(uintptr(unsafe.Pointer(&test_z64)))
-	if cas64(&test_z64, test_x64, 1) {
+	if atomic.Cas64(&test_z64, test_x64, 1) {
 		throw("cas64 failed")
 	}
 	if test_x64 != 0 {
 		throw("cas64 failed")
 	}
 	test_x64 = 42
-	if !cas64(&test_z64, test_x64, 1) {
+	if !atomic.Cas64(&test_z64, test_x64, 1) {
 		throw("cas64 failed")
 	}
 	if test_x64 != 42 || test_z64 != 1 {
 		throw("cas64 failed")
 	}
-	if atomicload64(&test_z64) != 1 {
+	if atomic.Load64(&test_z64) != 1 {
 		throw("load64 failed")
 	}
-	atomicstore64(&test_z64, (1<<40)+1)
-	if atomicload64(&test_z64) != (1<<40)+1 {
+	atomic.Store64(&test_z64, (1<<40)+1)
+	if atomic.Load64(&test_z64) != (1<<40)+1 {
 		throw("store64 failed")
 	}
-	if xadd64(&test_z64, (1<<40)+1) != (2<<40)+2 {
+	if atomic.Xadd64(&test_z64, (1<<40)+1) != (2<<40)+2 {
 		throw("xadd64 failed")
 	}
-	if atomicload64(&test_z64) != (2<<40)+2 {
+	if atomic.Load64(&test_z64) != (2<<40)+2 {
 		throw("xadd64 failed")
 	}
-	if xchg64(&test_z64, (3<<40)+3) != (2<<40)+2 {
+	if atomic.Xchg64(&test_z64, (3<<40)+3) != (2<<40)+2 {
 		throw("xchg64 failed")
 	}
-	if atomicload64(&test_z64) != (3<<40)+3 {
+	if atomic.Load64(&test_z64) != (3<<40)+3 {
 		throw("xchg64 failed")
 	}
 }
@@ -211,7 +214,7 @@ func check() {
 
 	var z uint32
 	z = 1
-	if !cas(&z, 1, 2) {
+	if !atomic.Cas(&z, 1, 2) {
 		throw("cas1")
 	}
 	if z != 2 {
@@ -219,7 +222,7 @@ func check() {
 	}
 
 	z = 4
-	if cas(&z, 5, 6) {
+	if atomic.Cas(&z, 5, 6) {
 		throw("cas3")
 	}
 	if z != 4 {
@@ -227,7 +230,7 @@ func check() {
 	}
 
 	z = 0xffffffff
-	if !cas(&z, 0xffffffff, 0xfffffffe) {
+	if !atomic.Cas(&z, 0xffffffff, 0xfffffffe) {
 		throw("cas5")
 	}
 	if z != 0xfffffffe {
@@ -250,7 +253,7 @@ func check() {
 	}
 
 	m = [4]byte{1, 1, 1, 1}
-	atomicor8(&m[1], 0xf0)
+	atomic.Or8(&m[1], 0xf0)
 	if m[0] != 1 || m[1] != 0xf1 || m[2] != 1 || m[3] != 1 {
 		throw("atomicor8")
 	}
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index a1ea3e78f0..be43e42540 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 /*
  * defined constants
@@ -129,7 +132,7 @@ type guintptr uintptr
 func (gp guintptr) ptr() *g   { return (*g)(unsafe.Pointer(gp)) }
 func (gp *guintptr) set(g *g) { *gp = guintptr(unsafe.Pointer(g)) }
 func (gp *guintptr) cas(old, new guintptr) bool {
-	return casuintptr((*uintptr)(unsafe.Pointer(gp)), uintptr(old), uintptr(new))
+	return atomic.Casuintptr((*uintptr)(unsafe.Pointer(gp)), uintptr(old), uintptr(new))
 }
 
 type puintptr uintptr
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index 8ae51b4311..d9bf4c1cfd 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -19,7 +19,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Asynchronous semaphore for sync.Mutex.
 
@@ -87,10 +90,10 @@ func semacquire(addr *uint32, profile bool) {
 	for {
 		lock(&root.lock)
 		// Add ourselves to nwait to disable "easy case" in semrelease.
-		xadd(&root.nwait, 1)
+		atomic.Xadd(&root.nwait, 1)
 		// Check cansemacquire to avoid missed wakeup.
 		if cansemacquire(addr) {
-			xadd(&root.nwait, -1)
+			atomic.Xadd(&root.nwait, -1)
 			unlock(&root.lock)
 			break
 		}
@@ -110,18 +113,18 @@ func semacquire(addr *uint32, profile bool) {
 
 func semrelease(addr *uint32) {
 	root := semroot(addr)
-	xadd(addr, 1)
+	atomic.Xadd(addr, 1)
 
 	// Easy case: no waiters?
 	// This check must happen after the xadd, to avoid a missed wakeup
 	// (see loop in semacquire).
-	if atomicload(&root.nwait) == 0 {
+	if atomic.Load(&root.nwait) == 0 {
 		return
 	}
 
 	// Harder case: search for a waiter and wake it.
 	lock(&root.lock)
-	if atomicload(&root.nwait) == 0 {
+	if atomic.Load(&root.nwait) == 0 {
 		// The count is already consumed by another goroutine,
 		// so no need to wake up another goroutine.
 		unlock(&root.lock)
@@ -130,7 +133,7 @@ func semrelease(addr *uint32) {
 	s := root.head
 	for ; s != nil; s = s.next {
 		if s.elem == unsafe.Pointer(addr) {
-			xadd(&root.nwait, -1)
+			atomic.Xadd(&root.nwait, -1)
 			root.dequeue(s)
 			break
 		}
@@ -150,11 +153,11 @@ func semroot(addr *uint32) *semaRoot {
 
 func cansemacquire(addr *uint32) bool {
 	for {
-		v := atomicload(addr)
+		v := atomic.Load(addr)
 		if v == 0 {
 			return false
 		}
-		if cas(addr, v, v-1) {
+		if atomic.Cas(addr, v, v-1) {
 			return true
 		}
 	}
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index f28067f3f9..8c9951ae1d 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -28,7 +28,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 var sig struct {
 	note   note
@@ -59,7 +62,7 @@ func sigsend(s uint32) bool {
 		if mask&bit != 0 {
 			return true // signal already in queue
 		}
-		if cas(&sig.mask[s/32], mask, mask|bit) {
+		if atomic.Cas(&sig.mask[s/32], mask, mask|bit) {
 			break
 		}
 	}
@@ -67,18 +70,18 @@ func sigsend(s uint32) bool {
 	// Notify receiver that queue has new bit.
 Send:
 	for {
-		switch atomicload(&sig.state) {
+		switch atomic.Load(&sig.state) {
 		default:
 			throw("sigsend: inconsistent state")
 		case sigIdle:
-			if cas(&sig.state, sigIdle, sigSending) {
+			if atomic.Cas(&sig.state, sigIdle, sigSending) {
 				break Send
 			}
 		case sigSending:
 			// notification already pending
 			break Send
 		case sigReceiving:
-			if cas(&sig.state, sigReceiving, sigIdle) {
+			if atomic.Cas(&sig.state, sigReceiving, sigIdle) {
 				notewakeup(&sig.note)
 				break Send
 			}
@@ -104,17 +107,17 @@ func signal_recv() uint32 {
 		// Wait for updates to be available from signal sender.
 	Receive:
 		for {
-			switch atomicload(&sig.state) {
+			switch atomic.Load(&sig.state) {
 			default:
 				throw("signal_recv: inconsistent state")
 			case sigIdle:
-				if cas(&sig.state, sigIdle, sigReceiving) {
+				if atomic.Cas(&sig.state, sigIdle, sigReceiving) {
 					notetsleepg(&sig.note, -1)
 					noteclear(&sig.note)
 					break Receive
 				}
 			case sigSending:
-				if cas(&sig.state, sigSending, sigIdle) {
+				if atomic.Cas(&sig.state, sigSending, sigIdle) {
 					break Receive
 				}
 			}
@@ -122,7 +125,7 @@ func signal_recv() uint32 {
 
 		// Incorporate updates from sender into local copy.
 		for i := range sig.mask {
-			sig.recv[i] = xchg(&sig.mask[i], 0)
+			sig.recv[i] = atomic.Xchg(&sig.mask[i], 0)
 		}
 	}
 }
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index e3087af940..cce371c883 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 /*
 Stack layout parameters.
@@ -806,7 +809,7 @@ func newstack() {
 	// NOTE: stackguard0 may change underfoot, if another thread
 	// is about to try to preempt gp. Read it just once and use that same
 	// value now and below.
-	preempt := atomicloaduintptr(&gp.stackguard0) == stackPreempt
+	preempt := atomic.Loaduintptr(&gp.stackguard0) == stackPreempt
 
 	// Be conservative about where we preempt.
 	// We are interested in preempting user Go code, not runtime code.
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 03230a8b3e..f8ccd41b1d 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -289,7 +290,7 @@ func rawstring(size int) (s string, b []byte) {
 
 	for {
 		ms := maxstring
-		if uintptr(size) <= uintptr(ms) || casuintptr((*uintptr)(unsafe.Pointer(&maxstring)), uintptr(ms), uintptr(size)) {
+		if uintptr(size) <= uintptr(ms) || atomic.Casuintptr((*uintptr)(unsafe.Pointer(&maxstring)), uintptr(ms), uintptr(size)) {
 			return
 		}
 	}
@@ -413,7 +414,7 @@ func gostringnocopy(str *byte) string {
 	s := *(*string)(unsafe.Pointer(&ss))
 	for {
 		ms := maxstring
-		if uintptr(len(s)) <= ms || casuintptr(&maxstring, ms, uintptr(len(s))) {
+		if uintptr(len(s)) <= ms || atomic.Casuintptr(&maxstring, ms, uintptr(len(s))) {
 			break
 		}
 	}
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index d725bb11f5..58ea474c70 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -151,42 +151,6 @@ func goexit(neverCallThisFunction)
 // See the assembly implementations for more details.
 func cgocallback_gofunc(fv uintptr, frame uintptr, framesize uintptr)
 
-//go:noescape
-func cas(ptr *uint32, old, new uint32) bool
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
-
-func nop() // call to prevent inlining of function body
-
-//go:noescape
-func casuintptr(ptr *uintptr, old, new uintptr) bool
-
-//go:noescape
-func atomicstoreuintptr(ptr *uintptr, new uintptr)
-
-//go:noescape
-func atomicloaduintptr(ptr *uintptr) uintptr
-
-//go:noescape
-func atomicloaduint(ptr *uint) uint
-
-// TODO: Write native implementations of int64 atomic ops (or improve
-// inliner). These portable ones can't be inlined right now, so we're
-// taking an extra function call hit.
-
-func atomicstoreint64(ptr *int64, new int64) {
-	atomicstore64((*uint64)(unsafe.Pointer(ptr)), uint64(new))
-}
-
-func atomicloadint64(ptr *int64) int64 {
-	return int64(atomicload64((*uint64)(unsafe.Pointer(ptr))))
-}
-
-func xaddint64(ptr *int64, delta int64) int64 {
-	return int64(xadd64((*uint64)(unsafe.Pointer(ptr)), delta))
-}
-
 // publicationBarrier performs a store/store barrier (a "publication"
 // or "export" barrier). Some form of synchronization is required
 // between initializing an object and making that object accessible to
diff --git a/src/runtime/sys_darwin_arm.s b/src/runtime/sys_darwin_arm.s
index a9dbf8bdbb..fdbb27826a 100644
--- a/src/runtime/sys_darwin_arm.s
+++ b/src/runtime/sys_darwin_arm.s
@@ -297,12 +297,6 @@ TEXT runtime·usleep(SB),NOSPLIT,$12
 	SWI	$0x80
 	RET
 
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B	runtime·armcas(SB)
-
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
 
diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s
index bd6ff96f31..b755ec70a8 100644
--- a/src/runtime/sys_freebsd_arm.s
+++ b/src/runtime/sys_freebsd_arm.s
@@ -377,20 +377,6 @@ TEXT runtime·closeonexec(SB),NOSPLIT,$0
 	SWI $0
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
-// TODO(minux): this is only valid for ARMv6+
-// bool armcas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B runtime·armcas(SB)
-
 // TODO: this is only valid for ARMv7+
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index 29eb8eb077..216781ef7a 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -390,35 +390,6 @@ TEXT runtime·usleep(SB),NOSPLIT,$12
 	SWI	$0
 	RET
 
-// Use kernel version instead of native armcas in asm_arm.s.
-// See ../sync/atomic/asm_linux_arm.s for details.
-TEXT cas<>(SB),NOSPLIT,$0
-	MOVW	$0xffff0fc0, R15 // R15 is hardware PC.
-
-TEXT runtime·cas(SB),NOSPLIT,$0
-	MOVW	ptr+0(FP), R2
-	MOVW	old+4(FP), R0
-loop:
-	MOVW	new+8(FP), R1
-	BL	cas<>(SB)
-	BCC	check
-	MOVW	$1, R0
-	MOVB	R0, ret+12(FP)
-	RET
-check:
-	// Kernel lies; double-check.
-	MOVW	ptr+0(FP), R2
-	MOVW	old+4(FP), R0
-	MOVW	0(R2), R3
-	CMP	R0, R3
-	BEQ	loop
-	MOVW	$0, R0
-	MOVB	R0, ret+12(FP)
-	RET
-
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
 // As for cas, memory barriers are complicated on ARM, but the kernel
 // provides a user helper. ARMv5 does not support SMP and has no
 // memory barrier instruction at all. ARMv6 added SMP support and has
diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s
index cf4804fe14..474d9fe174 100644
--- a/src/runtime/sys_nacl_arm.s
+++ b/src/runtime/sys_nacl_arm.s
@@ -308,21 +308,6 @@ TEXT runtime·getRandomData(SB),NOSPLIT,$0-12
 	NACL_SYSCALL(SYS_get_random_bytes)
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
-// This is only valid for ARMv6+, however, NaCl/ARM is only defined
-// for ARMv7A anyway.
-// bool armcas(int32 *val, int32 old, int32 new)
-// AtomiBLy:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B runtime·armcas(SB)
-
 // Likewise, this is only valid for ARMv7+, but that's okay.
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s
index ae669ce76b..d6628848f7 100644
--- a/src/runtime/sys_netbsd_arm.s
+++ b/src/runtime/sys_netbsd_arm.s
@@ -338,20 +338,6 @@ TEXT runtime·closeonexec(SB),NOSPLIT,$0
 	SWI $0xa0005c	// sys_fcntl
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
-// TODO(minux): this is only valid for ARMv6+
-// bool armcas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B runtime·armcas(SB)
-
 // TODO: this is only valid for ARMv7+
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_openbsd_arm.s b/src/runtime/sys_openbsd_arm.s
index 60deb8f38a..8c951e46d4 100644
--- a/src/runtime/sys_openbsd_arm.s
+++ b/src/runtime/sys_openbsd_arm.s
@@ -373,13 +373,6 @@ TEXT runtime·closeonexec(SB),NOSPLIT,$0
 	MOVW	R0, ret+4(FP)
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	//B	runtime·armcas(SB)
-	B	runtime·cas(SB)
-
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B	runtime·armcas(SB)
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
 
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 7ea4e8a61f..f9e9a1f763 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -12,7 +12,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Event types in the trace, args are given in square brackets.
 const (
@@ -123,12 +126,12 @@ var traceseq uint64 // global trace sequence number
 // that used to call xadd64 and cputicks are sensitive to that.
 //go:nosplit
 func tracestamp() (seq uint64, ts int64) {
-	seq = atomicload64(&traceseq)
-	for seq&1 != 0 || !cas64(&traceseq, seq, seq+1) {
-		seq = atomicload64(&traceseq)
+	seq = atomic.Load64(&traceseq)
+	for seq&1 != 0 || !atomic.Cas64(&traceseq, seq, seq+1) {
+		seq = atomic.Load64(&traceseq)
 	}
 	ts = cputicks()
-	atomicstore64(&traceseq, seq+2)
+	atomic.Store64(&traceseq, seq+2)
 	return seq >> 1, ts
 }
 
-- 
cgit v1.3