diff options
Diffstat (limited to 'src/runtime/memmove_ppc64x.s')
| -rw-r--r-- | src/runtime/memmove_ppc64x.s | 109 |
1 files changed, 70 insertions, 39 deletions
// Scraped unified diff of runtime·memmove (Go assembler, ppc64x), collapsed onto long lines; "-" and "+" mark removed and added lines.
// The routine loads to, from and n into R3, R4 and R5 and copies n bytes. After the patch it copies backward only when the
// unsigned difference dest - src is below n; otherwise it copies forward, 32 bytes at a time when at least four double words remain.
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s index ea73b455b4..26dabd9e69 100644 --- a/src/runtime/memmove_ppc64x.s +++ b/src/runtime/memmove_ppc64x.s @@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24 MOVD to+0(FP), R3 MOVD from+8(FP), R4 MOVD n+16(FP), R5 - CMP R5, $0 - BNE check - RET + // Determine if there are doublewords to + // copy so a more efficient move can be done check: - ANDCC $7, R5, R7 // R7 is the number of bytes to copy and CR0[EQ] is set if there are none. - SRAD $3, R5, R6 // R6 is the number of words to copy - CMP R6, $0, CR1 // CR1[EQ] is set if there are no words to copy. + ANDCC $7, R5, R7 // R7: bytes to copy + SRAD $3, R5, R6 // R6: double words to copy + CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy + + // Determine overlap by subtracting dest - src and comparing against the + // length. This catches the cases where src and dest are in different types + // of storage such as stack and static to avoid doing backward move when not + // necessary. - CMP R3, R4, CR2 - BC 12, 9, backward // I think you should be able to write this as "BGT CR2, backward" + SUB R4, R3, R8 // dest - src + CMPU R8, R5, CR2 // < len? + BC 12, 8, backward // BLT CR2 backward - // Copying forward proceeds by copying R6 words then copying R7 bytes. - // R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment - // load/store, R3 and R4 point before the bytes that are to be copied. + // Copying forward if no overlap. BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge" + MOVD R6,CTR // R6 = number of double words + SRADCC $2,R6,R8 // 32 byte chunks? 
+ BNE forward32setup // - MOVD R6, CTR + // Move double words - SUB $8, R3 - SUB $8, R4 +forward8: + MOVD 0(R4), R8 // double word + ADD $8,R4 + MOVD R8, 0(R3) // + ADD $8,R3 + BC 16, 0, forward8 + BR noforwardlarge // handle remainder -forwardlargeloop: - MOVDU 8(R4), R8 - MOVDU R8, 8(R3) - BC 16, 0, forwardlargeloop // "BDNZ" + // Prepare for moves of 32 bytes at a time. - ADD $8, R3 - ADD $8, R4 +forward32setup: + DCBTST (R3) // prepare data cache + DCBT (R4) + MOVD R8, CTR // CTR = number of 32 byte chunks (R8 from SRADCC above) + +forward32: + MOVD 0(R4), R8 // load 4 double words + MOVD 8(R4), R9 + MOVD 16(R4), R14 + MOVD 24(R4), R15 + ADD $32,R4 + MOVD R8, 0(R3) // store those 4 + MOVD R9, 8(R3) + MOVD R14,16(R3) + MOVD R15,24(R3) + ADD $32,R3 // bump up for next set + BC 16, 0, forward32 // continue + RLDCLCC $61,R5,$3,R6 // remaining doublewords + BEQ noforwardlarge + MOVD R6,CTR // set up the CTR + BR forward8 noforwardlarge: - BNE forwardtail // Tests the bit set by ANDCC above - RET + CMP R7,$0 // any remaining bytes + BC 4, 1, LR // return via LR when CR0[GT] is clear, i.e. R7 is zero forwardtail: - SUB $1, R3 - SUB $1, R4 - MOVD R7, CTR + MOVD R7, CTR // move tail bytes forwardtailloop: - MOVBZU 1(R4), R8 - MOVBZU R8, 1(R3) + MOVBZ 0(R4), R8 // move single bytes + ADD $1,R4 + MOVBZ R8, 0(R3) + ADD $1,R3 BC 16, 0, forwardtailloop RET backward: - // Copying backwards proceeds by copying R7 bytes then copying R6 words. + // Copying backwards proceeds by copying R7 bytes then copying R6 double words. // R3 and R4 are advanced to the end of the destination/source buffers // respectively and moved back as we copy. 
- ADD R5, R4, R4 - ADD R3, R5, R3 + ADD R5, R4, R4 // end of source + ADD R3, R5, R3 // end of dest - BEQ nobackwardtail + BEQ nobackwardtail // earlier condition - MOVD R7, CTR + MOVD R7, CTR // bytes to move backwardtailloop: - MOVBZU -1(R4), R8 - MOVBZU R8, -1(R3) + MOVBZ -1(R4), R8 // point to last byte + SUB $1,R4 + MOVBZ R8, -1(R3) + SUB $1,R3 BC 16, 0, backwardtailloop nobackwardtail: - BC 4, 6, backwardlarge // "BNE CR1" - RET + CMP R6,$0 + BC 4, 5, LR // return via LR when CR1[GT] is clear; NOTE(review): CR1 was set by the CMP into CR1 under check, while the CMP just above names no CR field; confirm it is still needed backwardlarge: MOVD R6, CTR backwardlargeloop: - MOVDU -8(R4), R8 - MOVDU R8, -8(R3) - BC 16, 0, backwardlargeloop // "BDNZ" + MOVD -8(R4), R8 + SUB $8,R4 + MOVD R8, -8(R3) + SUB $8,R3 + BC 16, 0, backwardlargeloop // RET |
