| author | Katie Hockman <katie@golang.org> | 2020-12-14 10:03:05 -0500 |
|---|---|---|
| committer | Katie Hockman <katie@golang.org> | 2020-12-14 10:06:13 -0500 |
| commit | 0345ede87ee12698988973884cfc0fd3d499dffd (patch) | |
| tree | 7123cff141ee5661208d2f5f437b8f5252ac7f6a /src/runtime/memmove_arm64.s | |
| parent | 4651d6b267818b0e0d128a5443289717c4bb8cbc (diff) | |
| parent | 0a02371b0576964e81c3b40d328db9a3ef3b031b (diff) | |
| download | go-0345ede87ee12698988973884cfc0fd3d499dffd.tar.xz | |
[dev.fuzz] all: merge master into dev.fuzz
Change-Id: I5d8c8329ccc9d747bd81ade6b1cb7cb8ae2e94b2
Diffstat (limited to 'src/runtime/memmove_arm64.s')
| -rw-r--r-- | src/runtime/memmove_arm64.s | 318 |
1 file changed, 201 insertions, 117 deletions
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index dbb7e9a28a..43d27629e5 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -6,152 +6,236 @@
 // See memmove Go doc for important implementation constraints.
 
+// Register map
+//
+// dstin  R0
+// src    R1
+// count  R2
+// dst    R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data   R6-R17
+// tmp1   R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// The destination pointer is 16-byte aligned to minimize unaligned accesses.
+// The loop tail is handled by always copying 64 bytes from the end.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
-	MOVD	to+0(FP), R3
-	MOVD	from+8(FP), R4
-	MOVD	n+16(FP), R5
-	CBNZ	R5, check
-	RET
+	MOVD	to+0(FP), R0
+	MOVD	from+8(FP), R1
+	MOVD	n+16(FP), R2
+	CBZ	R2, copy0
 
-check:
-	CMP	$16, R5
+	// Small copies: 1..16 bytes
+	CMP	$16, R2
 	BLE	copy16
 
-	AND	$~31, R5, R7	// R7 is N&~31
-	SUB	R7, R5, R6	// R6 is N&31
-
-	CMP	R3, R4
-	BLT	backward
-
-	// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
-	// R3 and R4 are advanced as we copy.
-
-	// (There may be implementations of armv8 where copying by bytes until
-	// at least one of source or dest is word aligned is a worthwhile
-	// optimization, but the on the one tested so far (xgene) it did not
-	// make a significance difference.)
+	// Large copies
+	CMP	$128, R2
+	BHI	copy_long
+	CMP	$32, R2
+	BHI	copy32_128
 
-	CBZ	R7, noforwardlarge	// Do we need to do any quadword copying?
-
-	ADD	R3, R7, R9	// R9 points just past where we copy by word
-
-forwardlargeloop:
-	// Copy 32 bytes at a time.
-	LDP.P	32(R4), (R8, R10)
-	STP.P	(R8, R10), 32(R3)
-	LDP	-16(R4), (R11, R12)
-	STP	(R11, R12), -16(R3)
-	SUB	$32, R7, R7
-	CBNZ	R7, forwardlargeloop
-
-noforwardlarge:
-	CBNZ	R6, forwardtail		// Do we need to copy any tail bytes?
+	// Small copies: 17..32 bytes.
+	LDP	(R1), (R6, R7)
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	LDP	-16(R4), (R12, R13)
+	STP	(R6, R7), (R0)
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	STP	(R12, R13), -16(R5)
 	RET
 
-forwardtail:
-	// There are R6 <= 31 bytes remaining to copy.
-	// This is large enough to still contain pointers,
-	// which must be copied atomically.
-	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
-	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
-	LDP.P	16(R4), (R8, R10)
-	STP.P	(R8, R10), 16(R3)
-
-	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
-	MOVD.P	8(R4), R8
-	MOVD.P	R8, 8(R3)
-
-	AND	$7, R6
-	CBNZ	R6, 2(PC)
-	RET
-
-	ADD	R3, R6, R9	// R9 points just past the destination memory
-
-forwardtailloop:
-	MOVBU.P	1(R4), R8
-	MOVBU.P	R8, 1(R3)
-	CMP	R3, R9
-	BNE	forwardtailloop
-	RET
-
-	// Small copies: 1..16 bytes.
+// Small copies: 1..16 bytes.
 copy16:
-	ADD	R4, R5, R8	// R8 points just past the last source byte
-	ADD	R3, R5, R9	// R9 points just past the last destination byte
-	CMP	$8, R5
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	CMP	$8, R2
 	BLT	copy7
-	MOVD	(R4), R6
-	MOVD	-8(R8), R7
-	MOVD	R6, (R3)
-	MOVD	R7, -8(R9)
+	MOVD	(R1), R6
+	MOVD	-8(R4), R7
+	MOVD	R6, (R0)
+	MOVD	R7, -8(R5)
 	RET
 
 copy7:
-	TBZ	$2, R5, copy3
-	MOVWU	(R4), R6
-	MOVWU	-4(R8), R7
-	MOVW	R6, (R3)
-	MOVW	R7, -4(R9)
+	TBZ	$2, R2, copy3
+	MOVWU	(R1), R6
+	MOVWU	-4(R4), R7
+	MOVW	R6, (R0)
+	MOVW	R7, -4(R5)
 	RET
 
 copy3:
-	TBZ	$1, R5, copy1
-	MOVHU	(R4), R6
-	MOVHU	-2(R8), R7
-	MOVH	R6, (R3)
-	MOVH	R7, -2(R9)
+	TBZ	$1, R2, copy1
+	MOVHU	(R1), R6
+	MOVHU	-2(R4), R7
+	MOVH	R6, (R0)
+	MOVH	R7, -2(R5)
 	RET
 
 copy1:
-	MOVBU	(R4), R6
-	MOVB	R6, (R3)
+	MOVBU	(R1), R6
+	MOVB	R6, (R0)
+
+copy0:
 	RET
 
-backward:
-	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
-	// R3 and R4 are advanced to the end of the destination/source buffers
-	// respectively and moved back as we copy.
+	// Medium copies: 33..128 bytes.
+copy32_128:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	LDP	(R1), (R6, R7)
+	LDP	16(R1), (R8, R9)
+	LDP	-32(R4), (R10, R11)
+	LDP	-16(R4), (R12, R13)
+	CMP	$64, R2
+	BHI	copy128
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
+	RET
 
-	ADD	R4, R5, R4	// R4 points just past the last source byte
-	ADD	R3, R5, R3	// R3 points just past the last destination byte
+	// Copy 65..128 bytes.
+copy128:
+	LDP	32(R1), (R14, R15)
+	LDP	48(R1), (R16, R17)
+	CMP	$96, R2
+	BLS	copy96
+	LDP	-64(R4), (R2, R3)
+	LDP	-48(R4), (R1, R4)
+	STP	(R2, R3), -64(R5)
+	STP	(R1, R4), -48(R5)
 
-	CBZ	R6, nobackwardtail	// Do we need to do any byte-by-byte copying?
+copy96:
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R14, R15), 32(R0)
+	STP	(R16, R17), 48(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
+	RET
 
-	AND	$7, R6, R12
-	CBZ	R12, backwardtaillarge
+	// Copy more than 128 bytes.
+copy_long:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	MOVD	ZR, R7
+	MOVD	ZR, R8
 
-	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
-backwardtailloop:
-	// Copy sub-pointer-size tail.
-	MOVBU.W	-1(R4), R8
-	MOVBU.W	R8, -1(R3)
-	CMP	R9, R3
-	BNE	backwardtailloop
+	CMP	$1024, R2
+	BLT	backward_check
+	// feature detect to decide how to align
+	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
+	CBNZ	R6, use_aligned_loads
+	MOVD	R0, R7
+	MOVD	R5, R8
+	B	backward_check
+use_aligned_loads:
+	MOVD	R1, R7
+	MOVD	R4, R8
+	// R7 and R8 are used here for the realignment calculation. In
+	// the use_aligned_loads case, R7 is the src pointer and R8 is
+	// srcend pointer, which is used in the backward copy case.
+	// When doing aligned stores, R7 is the dst pointer and R8 is
+	// the dstend pointer.
 
-backwardtaillarge:
-	// Do 8/16-byte write if possible.
-	// See comment at forwardtail.
-	TBZ	$3, R6, 3(PC)
-	MOVD.W	-8(R4), R8
-	MOVD.W	R8, -8(R3)
+backward_check:
+	// Use backward copy if there is an overlap.
+	SUB	R1, R0, R14
+	CBZ	R14, copy0
+	CMP	R2, R14
+	BCC	copy_long_backward
 
-	TBZ	$4, R6, 3(PC)
-	LDP.W	-16(R4), (R8, R10)
-	STP.W	(R8, R10), -16(R3)
+	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+	LDP	(R1), (R12, R13)	// Load A
+	AND	$15, R7, R14	// Calculate the realignment offset
+	SUB	R14, R1, R1
+	SUB	R14, R0, R3	// move dst back same amount as src
+	ADD	R14, R2, R2
+	LDP	16(R1), (R6, R7)	// Load B
+	STP	(R12, R13), (R0)	// Store A
+	LDP	32(R1), (R8, R9)	// Load C
+	LDP	48(R1), (R10, R11)	// Load D
+	LDP.W	64(R1), (R12, R13)	// Load E
+	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+	SUBS	$144, R2, R2
+	BLS	copy64_from_end
 
-nobackwardtail:
-	CBNZ	R7, backwardlarge	// Do we need to do any doubleword-by-doubleword copying?
+loop64:
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	16(R1), (R6, R7)	// Load B (next iteration)
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	32(R1), (R8, R9)	// Load C
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	48(R1), (R10, R11)	// Load D
+	STP.W	(R12, R13), 64(R3)	// Store E
+	LDP.W	64(R1), (R12, R13)	// Load E
+	SUBS	$64, R2, R2
+	BHI	loop64
+
+	// Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
+	LDP	-64(R4), (R14, R15)	// Load F
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	-48(R4), (R6, R7)	// Load G
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	-32(R4), (R8, R9)	// Load H
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	-16(R4), (R10, R11)	// Load I
+	STP	(R12, R13), 64(R3)	// Store E
+	STP	(R14, R15), -64(R5)	// Store F
+	STP	(R6, R7), -48(R5)	// Store G
+	STP	(R8, R9), -32(R5)	// Store H
+	STP	(R10, R11), -16(R5)	// Store I
 	RET
 
-backwardlarge:
-	SUB	R7, R3, R9	// R9 points at the lowest destination byte
+	// Large backward copy for overlapping copies.
+	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+	LDP	-16(R4), (R12, R13)
+	AND	$15, R8, R14
+	SUB	R14, R4, R4
+	SUB	R14, R2, R2
+	LDP	-16(R4), (R6, R7)
+	STP	(R12, R13), -16(R5)
+	LDP	-32(R4), (R8, R9)
+	LDP	-48(R4), (R10, R11)
+	LDP.W	-64(R4), (R12, R13)
+	SUB	R14, R5, R5
+	SUBS	$128, R2, R2
+	BLS	copy64_from_start
+
+loop64_backward:
+	STP	(R6, R7), -16(R5)
+	LDP	-16(R4), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	-32(R4), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	-48(R4), (R10, R11)
+	STP.W	(R12, R13), -64(R5)
+	LDP.W	-64(R4), (R12, R13)
+	SUBS	$64, R2, R2
+	BHI	loop64_backward
 
-backwardlargeloop:
-	LDP	-16(R4), (R8, R10)
-	STP	(R8, R10), -16(R3)
-	LDP.W	-32(R4), (R11, R12)
-	STP.W	(R11, R12), -32(R3)
-	CMP	R9, R3
-	BNE	backwardlargeloop
+	// Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
+	LDP	48(R1), (R2, R3)
+	STP	(R6, R7), -16(R5)
+	LDP	32(R1), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	16(R1), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	(R1), (R10, R11)
+	STP	(R12, R13), -64(R5)
+	STP	(R2, R3), 48(R0)
+	STP	(R6, R7), 32(R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), (R0)
 	RET
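For readers skimming the new layout: the behavioural core is the overlap check at `backward_check` — the backward path (`copy_long_backward`) is taken only when the destination starts inside the source range, otherwise the copy runs forward. The Go sketch below is not part of this commit; `moveWithin` and its offset parameters are invented purely to illustrate that direction choice on a single byte slice.

```go
package main

import "fmt"

// moveWithin copies n bytes within buf from srcOff to dstOff, choosing the
// copy direction the way the assembly's backward_check does: copy backward
// only when the destination starts inside the source range
// (0 < dstOff-srcOff < n), forward otherwise.
func moveWithin(buf []byte, dstOff, srcOff, n int) {
	if n == 0 || dstOff == srcOff {
		return
	}
	if d := dstOff - srcOff; d > 0 && d < n {
		// dst overlaps src from above: copy from the end so every byte
		// is read before it is overwritten.
		for i := n - 1; i >= 0; i-- {
			buf[dstOff+i] = buf[srcOff+i]
		}
		return
	}
	// No harmful overlap: a forward copy is safe.
	for i := 0; i < n; i++ {
		buf[dstOff+i] = buf[srcOff+i]
	}
}

func main() {
	buf := []byte("0123456789")
	moveWithin(buf, 2, 0, 6)  // overlapping move, exercises the backward path
	fmt.Println(string(buf))  // 0101234589
}
```

The real assembly additionally splits by size (1..16, 17..32, 33..128 bytes, and large) and finishes large forward copies by unconditionally storing the final 64 bytes from the end, which is how it avoids any byte-by-byte tail loop.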
