-rw-r--r--	src/internal/bytealg/compare_riscv64.s	173
1 file changed, 103 insertions, 70 deletions
diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
index 44a743d3af..68cba2a37f 100644
--- a/src/internal/bytealg/compare_riscv64.s
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -40,13 +40,13 @@ use_a_len:
 	BEQZ	X5, cmp_len
 
 	MOV	$32, X6
-	BLT	X5, X6, loop4_check
+	BLT	X5, X6, check8_unaligned
 
 	// Check alignment - if alignment differs we have to do one byte at a time.
 	AND	$7, X10, X7
 	AND	$7, X12, X8
-	BNE	X7, X8, loop4_check
-	BEQZ	X7, loop32_check
+	BNE	X7, X8, check8_unaligned
+	BEQZ	X7, compare32
 
 	// Check one byte at a time until we reach 8 byte alignment.
 	SUB	X7, X5, X5
@@ -59,94 +59,99 @@ align:
 	ADD	$1, X12
 	BNEZ	X7, align
 
-loop32_check:
-	MOV	$32, X7
-	BLT	X5, X7, loop16_check
-loop32:
+check32:
+	MOV	$32, X6
+	BLT	X5, X6, compare16
+compare32:
 	MOV	0(X10), X15
 	MOV	0(X12), X16
 	MOV	8(X10), X17
 	MOV	8(X12), X18
-	BEQ	X15, X16, loop32a
-	JMP	cmp8a
-loop32a:
-	BEQ	X17, X18, loop32b
-	JMP	cmp8b
-loop32b:
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
 	MOV	16(X10), X15
 	MOV	16(X12), X16
 	MOV	24(X10), X17
 	MOV	24(X12), X18
-	BEQ	X15, X16, loop32c
-	JMP	cmp8a
-loop32c:
-	BEQ	X17, X18, loop32d
-	JMP	cmp8b
-loop32d:
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
 	ADD	$32, X10
 	ADD	$32, X12
 	ADD	$-32, X5
-	BGE	X5, X7, loop32
+	BGE	X5, X6, compare32
 	BEQZ	X5, cmp_len
 
-loop16_check:
+check16:
 	MOV	$16, X6
-	BLT	X5, X6, loop4_check
-loop16:
+	BLT	X5, X6, check8_unaligned
+compare16:
 	MOV	0(X10), X15
 	MOV	0(X12), X16
 	MOV	8(X10), X17
 	MOV	8(X12), X18
-	BEQ	X15, X16, loop16a
-	JMP	cmp8a
-loop16a:
-	BEQ	X17, X18, loop16b
-	JMP	cmp8b
-loop16b:
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
 	ADD	$16, X10
 	ADD	$16, X12
 	ADD	$-16, X5
-	BGE	X5, X6, loop16
 	BEQZ	X5, cmp_len
 
-loop4_check:
-	MOV	$4, X6
-	BLT	X5, X6, loop1
-loop4:
+check8_unaligned:
+	MOV	$8, X6
+	BLT	X5, X6, check4_unaligned
+compare8_unaligned:
 	MOVBU	0(X10), X8
+	MOVBU	1(X10), X15
+	MOVBU	2(X10), X17
+	MOVBU	3(X10), X19
+	MOVBU	4(X10), X21
+	MOVBU	5(X10), X23
+	MOVBU	6(X10), X25
+	MOVBU	7(X10), X29
 	MOVBU	0(X12), X9
+	MOVBU	1(X12), X16
+	MOVBU	2(X12), X18
+	MOVBU	3(X12), X20
+	MOVBU	4(X12), X22
+	MOVBU	5(X12), X24
+	MOVBU	6(X12), X28
+	MOVBU	7(X12), X30
+	BNE	X8, X9, cmp1a
+	BNE	X15, X16, cmp1b
+	BNE	X17, X18, cmp1c
+	BNE	X19, X20, cmp1d
+	BNE	X21, X22, cmp1e
+	BNE	X23, X24, cmp1f
+	BNE	X25, X28, cmp1g
+	BNE	X29, X30, cmp1h
+	ADD	$8, X10
+	ADD	$8, X12
+	ADD	$-8, X5
+	BGE	X5, X6, compare8_unaligned
+	BEQZ	X5, cmp_len
+
+check4_unaligned:
+	MOV	$4, X6
+	BLT	X5, X6, compare1
+compare4_unaligned:
+	MOVBU	0(X10), X8
 	MOVBU	1(X10), X15
+	MOVBU	2(X10), X17
+	MOVBU	3(X10), X19
+	MOVBU	0(X12), X9
 	MOVBU	1(X12), X16
-	BEQ	X8, X9, loop4a
-	SLTU	X9, X8, X5
-	SLTU	X8, X9, X6
-	JMP	cmp_ret
-loop4a:
-	BEQ	X15, X16, loop4b
-	SLTU	X16, X15, X5
-	SLTU	X15, X16, X6
-	JMP	cmp_ret
-loop4b:
-	MOVBU	2(X10), X21
-	MOVBU	2(X12), X22
-	MOVBU	3(X10), X23
-	MOVBU	3(X12), X24
-	BEQ	X21, X22, loop4c
-	SLTU	X22, X21, X5
-	SLTU	X21, X22, X6
-	JMP	cmp_ret
-loop4c:
-	BEQ	X23, X24, loop4d
-	SLTU	X24, X23, X5
-	SLTU	X23, X24, X6
-	JMP	cmp_ret
-loop4d:
+	MOVBU	2(X12), X18
+	MOVBU	3(X12), X20
+	BNE	X8, X9, cmp1a
+	BNE	X15, X16, cmp1b
+	BNE	X17, X18, cmp1c
+	BNE	X19, X20, cmp1d
 	ADD	$4, X10
 	ADD	$4, X12
 	ADD	$-4, X5
-	BGE	X5, X6, loop4
+	BGE	X5, X6, compare4_unaligned
 
-loop1:
+compare1:
 	BEQZ	X5, cmp_len
 	MOVBU	0(X10), X8
 	MOVBU	0(X12), X9
@@ -154,27 +159,55 @@ loop1:
 	ADD	$1, X10
 	ADD	$1, X12
 	ADD	$-1, X5
-	JMP	loop1
+	JMP	compare1
 
 	// Compare 8 bytes of memory in X15/X16 that are known to differ.
 cmp8a:
-	MOV	$0xff, X19
-cmp8a_loop:
-	AND	X15, X19, X8
-	AND	X16, X19, X9
-	BNE	X8, X9, cmp
-	SLLI	$8, X19
-	JMP	cmp8a_loop
+	MOV	X15, X17
+	MOV	X16, X18
 
 	// Compare 8 bytes of memory in X17/X18 that are known to differ.
 cmp8b:
 	MOV	$0xff, X19
-cmp8b_loop:
+cmp8_loop:
 	AND	X17, X19, X8
 	AND	X18, X19, X9
 	BNE	X8, X9, cmp
 	SLLI	$8, X19
-	JMP	cmp8b_loop
+	JMP	cmp8_loop
+
+cmp1a:
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
+	JMP	cmp_ret
+cmp1b:
+	SLTU	X16, X15, X5
+	SLTU	X15, X16, X6
+	JMP	cmp_ret
+cmp1c:
+	SLTU	X18, X17, X5
+	SLTU	X17, X18, X6
+	JMP	cmp_ret
+cmp1d:
+	SLTU	X20, X19, X5
+	SLTU	X19, X20, X6
+	JMP	cmp_ret
+cmp1e:
+	SLTU	X22, X21, X5
+	SLTU	X21, X22, X6
+	JMP	cmp_ret
+cmp1f:
+	SLTU	X24, X23, X5
+	SLTU	X23, X24, X6
+	JMP	cmp_ret
+cmp1g:
+	SLTU	X28, X25, X5
+	SLTU	X25, X28, X6
+	JMP	cmp_ret
+cmp1h:
+	SLTU	X30, X29, X5
+	SLTU	X29, X30, X6
+	JMP	cmp_ret
 
 cmp_len:
 	MOV	X11, X8
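For context, this routine backs bytes.Compare and string comparison: it compares the
common prefix of the two operands byte by byte and, when the prefixes match, orders
the shorter operand first. The change above only affects how many bytes are examined
per iteration. A minimal Go sketch of those semantics (not the runtime's actual
fallback code) is:

	package main

	import "fmt"

	// compare reports -1, 0 or +1, ordering a against b the way the
	// assembly does: first differing byte wins, then length.
	func compare(a, b []byte) int {
		n := len(a)
		if len(b) < n {
			n = len(b)
		}
		for i := 0; i < n; i++ {
			switch {
			case a[i] < b[i]:
				return -1
			case a[i] > b[i]:
				return +1
			}
		}
		// Equal prefixes: the shorter slice sorts first (the cmp_len block).
		switch {
		case len(a) < len(b):
			return -1
		case len(a) > len(b):
			return +1
		}
		return 0
	}

	func main() {
		fmt.Println(compare([]byte("abc"), []byte("abd"))) // -1
		fmt.Println(compare([]byte("abc"), []byte("ab")))  // 1
		fmt.Println(compare([]byte("abc"), []byte("abc"))) // 0
	}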

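One detail worth noting in the reworked cmp8a/cmp8b path: once two 8-byte words are
known to differ, the code walks a 0xff mask up the word (SLLI $8) until the masked
lanes differ. Because RISC-V loads are little-endian, that lane is the first differing
byte in memory order, and an unsigned compare of the two masked words then orders the
operands. A rough Go rendering of the idea (cmp8 is a hypothetical helper, not code
from this change):

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// cmp8 orders two little-endian 8-byte words that are known to differ,
	// by the first differing byte in memory order.
	func cmp8(a, b uint64) int {
		for mask := uint64(0xff); ; mask <<= 8 {
			x, y := a&mask, b&mask
			if x == y {
				continue // this byte lane matches; try the next one
			}
			// Only one (and the same) byte lane is set in x and y, so an
			// unsigned compare of the masked words orders the bytes.
			if x < y {
				return -1
			}
			return 1
		}
	}

	func main() {
		a := binary.LittleEndian.Uint64([]byte("abcdefgh"))
		b := binary.LittleEndian.Uint64([]byte("abcdefgz"))
		fmt.Println(cmp8(a, b)) // -1: the words first differ at 'h' < 'z'
	}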