| author | Katie Hockman <katie@golang.org> | 2020-12-14 10:03:05 -0500 |
|---|---|---|
| committer | Katie Hockman <katie@golang.org> | 2020-12-14 10:06:13 -0500 |
| commit | 0345ede87ee12698988973884cfc0fd3d499dffd (patch) | |
| tree | 7123cff141ee5661208d2f5f437b8f5252ac7f6a /src/runtime/memmove_arm64.s | |
| parent | 4651d6b267818b0e0d128a5443289717c4bb8cbc (diff) | |
| parent | 0a02371b0576964e81c3b40d328db9a3ef3b031b (diff) | |
| download | go-0345ede87ee12698988973884cfc0fd3d499dffd.tar.xz | |
[dev.fuzz] all: merge master into dev.fuzz
Change-Id: I5d8c8329ccc9d747bd81ade6b1cb7cb8ae2e94b2
Diffstat (limited to 'src/runtime/memmove_arm64.s')
| -rw-r--r-- | src/runtime/memmove_arm64.s | 318 |
1 file changed, 201 insertions, 117 deletions
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index dbb7e9a28a..43d27629e5 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -6,152 +6,236 @@
 // See memmove Go doc for important implementation constraints.
 
+// Register map
+//
+// dstin  R0
+// src    R1
+// count  R2
+// dst    R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data   R6-R17
+// tmp1   R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// The destination pointer is 16-byte aligned to minimize unaligned accesses.
+// The loop tail is handled by always copying 64 bytes from the end.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
-	MOVD	to+0(FP), R3
-	MOVD	from+8(FP), R4
-	MOVD	n+16(FP), R5
-	CBNZ	R5, check
-	RET
+	MOVD	to+0(FP), R0
+	MOVD	from+8(FP), R1
+	MOVD	n+16(FP), R2
+	CBZ	R2, copy0
 
-check:
-	CMP	$16, R5
+	// Small copies: 1..16 bytes
+	CMP	$16, R2
 	BLE	copy16
 
-	AND	$~31, R5, R7	// R7 is N&~31
-	SUB	R7, R5, R6	// R6 is N&31
-
-	CMP	R3, R4
-	BLT	backward
-
-	// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
-	// R3 and R4 are advanced as we copy.
-
-	// (There may be implementations of armv8 where copying by bytes until
-	// at least one of source or dest is word aligned is a worthwhile
-	// optimization, but the on the one tested so far (xgene) it did not
-	// make a significance difference.)
+	// Large copies
+	CMP	$128, R2
+	BHI	copy_long
+	CMP	$32, R2
+	BHI	copy32_128
 
-	CBZ	R7, noforwardlarge	// Do we need to do any quadword copying?
-
-	ADD	R3, R7, R9	// R9 points just past where we copy by word
-
-forwardlargeloop:
-	// Copy 32 bytes at a time.
-	LDP.P	32(R4), (R8, R10)
-	STP.P	(R8, R10), 32(R3)
-	LDP	-16(R4), (R11, R12)
-	STP	(R11, R12), -16(R3)
-	SUB	$32, R7, R7
-	CBNZ	R7, forwardlargeloop
-
-noforwardlarge:
-	CBNZ	R6, forwardtail		// Do we need to copy any tail bytes?
+	// Small copies: 17..32 bytes.
+	LDP	(R1), (R6, R7)
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	LDP	-16(R4), (R12, R13)
+	STP	(R6, R7), (R0)
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	STP	(R12, R13), -16(R5)
 	RET
 
-forwardtail:
-	// There are R6 <= 31 bytes remaining to copy.
-	// This is large enough to still contain pointers,
-	// which must be copied atomically.
-	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
-	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
-	LDP.P	16(R4), (R8, R10)
-	STP.P	(R8, R10), 16(R3)
-
-	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
-	MOVD.P	8(R4), R8
-	MOVD.P	R8, 8(R3)
-
-	AND	$7, R6
-	CBNZ	R6, 2(PC)
-	RET
-
-	ADD	R3, R6, R9	// R9 points just past the destination memory
-
-forwardtailloop:
-	MOVBU.P	1(R4), R8
-	MOVBU.P	R8, 1(R3)
-	CMP	R3, R9
-	BNE	forwardtailloop
-	RET
-
-	// Small copies: 1..16 bytes.
+// Small copies: 1..16 bytes.
 copy16:
-	ADD	R4, R5, R8	// R8 points just past the last source byte
-	ADD	R3, R5, R9	// R9 points just past the last destination byte
-	CMP	$8, R5
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	CMP	$8, R2
 	BLT	copy7
-	MOVD	(R4), R6
-	MOVD	-8(R8), R7
-	MOVD	R6, (R3)
-	MOVD	R7, -8(R9)
+	MOVD	(R1), R6
+	MOVD	-8(R4), R7
+	MOVD	R6, (R0)
+	MOVD	R7, -8(R5)
 	RET
 
 copy7:
-	TBZ	$2, R5, copy3
-	MOVWU	(R4), R6
-	MOVWU	-4(R8), R7
-	MOVW	R6, (R3)
-	MOVW	R7, -4(R9)
+	TBZ	$2, R2, copy3
+	MOVWU	(R1), R6
+	MOVWU	-4(R4), R7
+	MOVW	R6, (R0)
+	MOVW	R7, -4(R5)
 	RET
 
 copy3:
-	TBZ	$1, R5, copy1
-	MOVHU	(R4), R6
-	MOVHU	-2(R8), R7
-	MOVH	R6, (R3)
-	MOVH	R7, -2(R9)
+	TBZ	$1, R2, copy1
+	MOVHU	(R1), R6
+	MOVHU	-2(R4), R7
+	MOVH	R6, (R0)
+	MOVH	R7, -2(R5)
 	RET
 
 copy1:
-	MOVBU	(R4), R6
-	MOVB	R6, (R3)
+	MOVBU	(R1), R6
+	MOVB	R6, (R0)
+
+copy0:
 	RET
 
-backward:
-	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
-	// R3 and R4 are advanced to the end of the destination/source buffers
-	// respectively and moved back as we copy.
+	// Medium copies: 33..128 bytes.
+copy32_128:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	LDP	(R1), (R6, R7)
+	LDP	16(R1), (R8, R9)
+	LDP	-32(R4), (R10, R11)
+	LDP	-16(R4), (R12, R13)
+	CMP	$64, R2
+	BHI	copy128
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
+	RET
 
-	ADD	R4, R5, R4	// R4 points just past the last source byte
-	ADD	R3, R5, R3	// R3 points just past the last destination byte
+	// Copy 65..128 bytes.
+copy128:
+	LDP	32(R1), (R14, R15)
+	LDP	48(R1), (R16, R17)
+	CMP	$96, R2
+	BLS	copy96
+	LDP	-64(R4), (R2, R3)
+	LDP	-48(R4), (R1, R4)
+	STP	(R2, R3), -64(R5)
+	STP	(R1, R4), -48(R5)
 
-	CBZ	R6, nobackwardtail	// Do we need to do any byte-by-byte copying?
+copy96:
+	STP	(R6, R7), (R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R14, R15), 32(R0)
+	STP	(R16, R17), 48(R0)
+	STP	(R10, R11), -32(R5)
+	STP	(R12, R13), -16(R5)
+	RET
 
-	AND	$7, R6, R12
-	CBZ	R12, backwardtaillarge
+	// Copy more than 128 bytes.
+copy_long:
+	ADD	R1, R2, R4	// R4 points just past the last source byte
+	ADD	R0, R2, R5	// R5 points just past the last destination byte
+	MOVD	ZR, R7
+	MOVD	ZR, R8
 
-	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
-backwardtailloop:
-	// Copy sub-pointer-size tail.
-	MOVBU.W	-1(R4), R8
-	MOVBU.W	R8, -1(R3)
-	CMP	R9, R3
-	BNE	backwardtailloop
+	CMP	$1024, R2
+	BLT	backward_check
+	// feature detect to decide how to align
+	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
+	CBNZ	R6, use_aligned_loads
+	MOVD	R0, R7
+	MOVD	R5, R8
+	B	backward_check
+use_aligned_loads:
+	MOVD	R1, R7
+	MOVD	R4, R8
+	// R7 and R8 are used here for the realignment calculation. In
+	// the use_aligned_loads case, R7 is the src pointer and R8 is
+	// srcend pointer, which is used in the backward copy case.
+	// When doing aligned stores, R7 is the dst pointer and R8 is
+	// the dstend pointer.
 
-backwardtaillarge:
-	// Do 8/16-byte write if possible.
-	// See comment at forwardtail.
-	TBZ	$3, R6, 3(PC)
-	MOVD.W	-8(R4), R8
-	MOVD.W	R8, -8(R3)
+backward_check:
+	// Use backward copy if there is an overlap.
+	SUB	R1, R0, R14
+	CBZ	R14, copy0
+	CMP	R2, R14
+	BCC	copy_long_backward
 
-	TBZ	$4, R6, 3(PC)
-	LDP.W	-16(R4), (R8, R10)
-	STP.W	(R8, R10), -16(R3)
+	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+	LDP	(R1), (R12, R13)	// Load A
+	AND	$15, R7, R14	// Calculate the realignment offset
+	SUB	R14, R1, R1
+	SUB	R14, R0, R3	// move dst back same amount as src
+	ADD	R14, R2, R2
+	LDP	16(R1), (R6, R7)	// Load B
+	STP	(R12, R13), (R0)	// Store A
+	LDP	32(R1), (R8, R9)	// Load C
+	LDP	48(R1), (R10, R11)	// Load D
+	LDP.W	64(R1), (R12, R13)	// Load E
+	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+	SUBS	$144, R2, R2
+	BLS	copy64_from_end
 
-nobackwardtail:
-	CBNZ	R7, backwardlarge	// Do we need to do any doubleword-by-doubleword copying?
+loop64:
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	16(R1), (R6, R7)	// Load B (next iteration)
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	32(R1), (R8, R9)	// Load C
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	48(R1), (R10, R11)	// Load D
+	STP.W	(R12, R13), 64(R3)	// Store E
+	LDP.W	64(R1), (R12, R13)	// Load E
+	SUBS	$64, R2, R2
+	BHI	loop64
+
+	// Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
+	LDP	-64(R4), (R14, R15)	// Load F
+	STP	(R6, R7), 16(R3)	// Store B
+	LDP	-48(R4), (R6, R7)	// Load G
+	STP	(R8, R9), 32(R3)	// Store C
+	LDP	-32(R4), (R8, R9)	// Load H
+	STP	(R10, R11), 48(R3)	// Store D
+	LDP	-16(R4), (R10, R11)	// Load I
+	STP	(R12, R13), 64(R3)	// Store E
+	STP	(R14, R15), -64(R5)	// Store F
+	STP	(R6, R7), -48(R5)	// Store G
+	STP	(R8, R9), -32(R5)	// Store H
+	STP	(R10, R11), -16(R5)	// Store I
 	RET
 
-backwardlarge:
-	SUB	R7, R3, R9	// R9 points at the lowest destination byte
+	// Large backward copy for overlapping copies.
+	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+	LDP	-16(R4), (R12, R13)
+	AND	$15, R8, R14
+	SUB	R14, R4, R4
+	SUB	R14, R2, R2
+	LDP	-16(R4), (R6, R7)
+	STP	(R12, R13), -16(R5)
+	LDP	-32(R4), (R8, R9)
+	LDP	-48(R4), (R10, R11)
+	LDP.W	-64(R4), (R12, R13)
+	SUB	R14, R5, R5
+	SUBS	$128, R2, R2
+	BLS	copy64_from_start
+
+loop64_backward:
+	STP	(R6, R7), -16(R5)
+	LDP	-16(R4), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	-32(R4), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	-48(R4), (R10, R11)
+	STP.W	(R12, R13), -64(R5)
+	LDP.W	-64(R4), (R12, R13)
+	SUBS	$64, R2, R2
+	BHI	loop64_backward
 
-backwardlargeloop:
-	LDP	-16(R4), (R8, R10)
-	STP	(R8, R10), -16(R3)
-	LDP.W	-32(R4), (R11, R12)
-	STP.W	(R11, R12), -32(R3)
-	CMP	R9, R3
-	BNE	backwardlargeloop
+	// Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
+	LDP	48(R1), (R2, R3)
+	STP	(R6, R7), -16(R5)
+	LDP	32(R1), (R6, R7)
+	STP	(R8, R9), -32(R5)
+	LDP	16(R1), (R8, R9)
+	STP	(R10, R11), -48(R5)
+	LDP	(R1), (R10, R11)
+	STP	(R12, R13), -64(R5)
+	STP	(R2, R3), 48(R0)
+	STP	(R6, R7), 32(R0)
+	STP	(R8, R9), 16(R0)
+	STP	(R10, R11), (R0)
 	RET
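For readers skimming the new layout: the behavioural core is the overlap check at `backward_check` — the backward path (`copy_long_backward`) is taken only when the destination starts inside the source range, otherwise the copy runs forward. The Go sketch below is not part of this commit; `moveWithin` and its offset parameters are invented purely to illustrate that direction choice on a single byte slice.

```go
package main

import "fmt"

// moveWithin copies n bytes within buf from srcOff to dstOff, choosing the
// copy direction the way the assembly's backward_check does: copy backward
// only when the destination starts inside the source range
// (0 < dstOff-srcOff < n), forward otherwise.
func moveWithin(buf []byte, dstOff, srcOff, n int) {
	if n == 0 || dstOff == srcOff {
		return
	}
	if d := dstOff - srcOff; d > 0 && d < n {
		// dst overlaps src from above: copy from the end so every byte
		// is read before it is overwritten.
		for i := n - 1; i >= 0; i-- {
			buf[dstOff+i] = buf[srcOff+i]
		}
		return
	}
	// No harmful overlap: a forward copy is safe.
	for i := 0; i < n; i++ {
		buf[dstOff+i] = buf[srcOff+i]
	}
}

func main() {
	buf := []byte("0123456789")
	moveWithin(buf, 2, 0, 6)  // overlapping move, exercises the backward path
	fmt.Println(string(buf))  // 0101234589
}
```

The real assembly additionally splits by size (1..16, 17..32, 33..128 bytes, and large) and finishes large forward copies by unconditionally storing the final 64 bytes from the end, which is how it avoids any byte-by-byte tail loop.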
