1 files changed, 70 insertions, 39 deletions
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index ea73b455b4..26dabd9e69 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	MOVD	to+0(FP), R3
 	MOVD	from+8(FP), R4
 	MOVD	n+16(FP), R5
-	CMP	R5, $0
-	BNE	check
-	RET
 
+	// Determine if there are doublewords to
+	// copy so a more efficient move can be done
 check:
-	ANDCC	$7, R5, R7	// R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
-	SRAD	$3, R5, R6	// R6 is the number of words to copy
-	CMP	R6, $0, CR1	// CR1[EQ] is set if there are no words to copy.
+	ANDCC	$7, R5, R7	// R7: bytes to copy
+	SRAD	$3, R5, R6	// R6: double words to copy
+	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
+
+	// Determine overlap by subtracting dest - src and comparing against the
+	// length.  The catches the cases where src and dest are in different types
+	// of storage such as stack and static to avoid doing backward move when not
+	// necessary.
 
-	CMP	R3, R4, CR2
-	BC	12, 9, backward	// I think you should be able to write this as "BGT CR2, backward"
+	SUB	R4, R3, R8	// dest - src
+	CMPU	R8, R5, CR2	// < len?
+	BC	12, 8, backward // BLT CR2 backward
 
-	// Copying forward proceeds by copying R6 words then copying R7 bytes.
-	// R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment
-	// load/store, R3 and R4 point before the bytes that are to be copied.
+	// Copying forward if no overlap.
 
 	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
+	MOVD	R6,CTR			// R6 = number of double words
+	SRADCC	$2,R6,R8		// 32 byte chunks?
+	BNE	forward32setup		//
 
-	MOVD	R6, CTR
+	// Move double words
 
-	SUB	$8, R3
-	SUB	$8, R4
+forward8:
+	MOVD    0(R4), R8		// double word
+	ADD     $8,R4
+	MOVD    R8, 0(R3)		//
+	ADD     $8,R3
+	BC      16, 0, forward8
+	BR	noforwardlarge		// handle remainder
 
-forwardlargeloop:
-	MOVDU	8(R4), R8
-	MOVDU	R8, 8(R3)
-	BC	16, 0, forwardlargeloop // "BDNZ"
+	// Prepare for moves of 32 bytes at a time.
 
-	ADD	$8, R3
-	ADD	$8, R4
+forward32setup:
+	DCBTST	(R3)			// prepare data cache
+	DCBT	(R4)
+	MOVD	R8, CTR			// double work count
+
+forward32:
+	MOVD	0(R4), R8		// load 4 double words
+	MOVD	8(R4), R9
+	MOVD	16(R4), R14
+	MOVD	24(R4), R15
+	ADD	$32,R4
+	MOVD	R8, 0(R3)		// store those 4
+	MOVD	R9, 8(R3)
+	MOVD	R14,16(R3)
+	MOVD	R15,24(R3)
+	ADD	$32,R3			// bump up for next set
+	BC	16, 0, forward32	// continue
+	RLDCLCC	$61,R5,$3,R6		// remaining doublewords
+	BEQ	noforwardlarge
+	MOVD	R6,CTR			// set up the CTR
+	BR	forward8
 
 noforwardlarge:
-	BNE	forwardtail	// Tests the bit set by ANDCC above
-	RET
+	CMP	R7,$0			// any remaining bytes
+	BC	4, 1, LR
 
 forwardtail:
-	SUB	$1, R3
-	SUB	$1, R4
-	MOVD	R7, CTR
+	MOVD	R7, CTR			// move tail bytes
 
 forwardtailloop:
-	MOVBZU	1(R4), R8
-	MOVBZU	R8, 1(R3)
+	MOVBZ	0(R4), R8		// move single bytes
+	ADD	$1,R4
+	MOVBZ	R8, 0(R3)
+	ADD	$1,R3
 	BC	16, 0, forwardtailloop
 	RET
 
 backward:
-	// Copying backwards proceeds by copying R7 bytes then copying R6 words.
+	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.
 
-	ADD	R5, R4, R4
-	ADD	R3, R5, R3
+	ADD	R5, R4, R4		// end of source
+	ADD	R3, R5, R3		// end of dest
 
-	BEQ	nobackwardtail
+	BEQ	nobackwardtail		// earlier condition
 
-	MOVD	R7, CTR
+	MOVD	R7, CTR			// bytes to move
 
 backwardtailloop:
-	MOVBZU	-1(R4), R8
-	MOVBZU	R8, -1(R3)
+	MOVBZ 	-1(R4), R8		// point to last byte
+	SUB	$1,R4
+	MOVBZ 	R8, -1(R3)
+	SUB	$1,R3
 	BC	16, 0, backwardtailloop
 
 nobackwardtail:
-	BC	4, 6, backwardlarge		// "BNE CR1"
-	RET
+	CMP	R6,$0
+	BC	4, 5, LR
 
 backwardlarge:
 	MOVD	R6, CTR
 
 backwardlargeloop:
-	MOVDU	-8(R4), R8
-	MOVDU	R8, -8(R3)
-	BC	16, 0, backwardlargeloop	// "BDNZ"
+	MOVD 	-8(R4), R8
+	SUB	$8,R4
+	MOVD 	R8, -8(R3)
+	SUB	$8,R3
+	BC	16, 0, backwardlargeloop	//
 	RET