aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJoel Sing <joel@sing.id.au>2022-09-18 01:43:20 +1000
committerJoel Sing <joel@sing.id.au>2023-02-11 16:16:34 +0000
commit261fe25c83a94fc3defe064baed3944cd3d16959 (patch)
treec941b8c2abc0c3912a2f9e2240eccb3dcdaa2f3c /src
parente03ee85ef434f307500a71927dfb3e876161847a (diff)
downloadgo-261fe25c83a94fc3defe064baed3944cd3d16959.tar.xz
internal/bytealg: simplify and improve compare on riscv64
Remove some unnecessary loops and pull the comparison code out from the compare/loop code. Add an unaligned 8 byte comparison, which reads 8 bytes from each input before comparing them. This gives a reasonable gain in performance for the large unaligned case. Updates #50615 name old time/op new time/op delta CompareBytesEqual-4 116ns _ 0% 111ns _ 0% -4.10% (p=0.000 n=5+5) CompareBytesToNil-4 34.9ns _ 0% 35.0ns _ 0% +0.45% (p=0.002 n=5+5) CompareBytesEmpty-4 29.6ns _ 1% 29.8ns _ 0% +0.71% (p=0.016 n=5+5) CompareBytesIdentical-4 29.8ns _ 0% 29.9ns _ 1% +0.50% (p=0.036 n=5+5) CompareBytesSameLength-4 66.1ns _ 0% 60.4ns _ 0% -8.59% (p=0.000 n=5+5) CompareBytesDifferentLength-4 63.1ns _ 0% 60.5ns _ 0% -4.20% (p=0.000 n=5+5) CompareBytesBigUnaligned/offset=1-4 6.84ms _ 3% 6.04ms _ 5% -11.70% (p=0.001 n=5+5) CompareBytesBigUnaligned/offset=2-4 6.99ms _ 4% 5.93ms _ 6% -15.22% (p=0.000 n=5+5) CompareBytesBigUnaligned/offset=3-4 6.74ms _ 1% 6.00ms _ 5% -10.94% (p=0.001 n=5+5) CompareBytesBigUnaligned/offset=4-4 7.20ms _ 6% 5.97ms _ 6% -17.05% (p=0.000 n=5+5) CompareBytesBigUnaligned/offset=5-4 6.75ms _ 1% 5.81ms _ 8% -13.93% (p=0.001 n=5+5) CompareBytesBigUnaligned/offset=6-4 6.89ms _ 5% 5.75ms _ 2% -16.58% (p=0.000 n=5+4) CompareBytesBigUnaligned/offset=7-4 6.91ms _ 6% 6.13ms _ 6% -11.27% (p=0.001 n=5+5) CompareBytesBig-4 2.75ms _ 5% 2.71ms _ 8% ~ (p=0.651 n=5+5) CompareBytesBigIdentical-4 29.9ns _ 1% 29.8ns _ 0% ~ (p=0.751 n=5+5) name old speed new speed delta CompareBytesBigUnaligned/offset=1-4 153MB/s _ 3% 174MB/s _ 6% +13.40% (p=0.003 n=5+5) CompareBytesBigUnaligned/offset=2-4 150MB/s _ 4% 177MB/s _ 6% +18.06% (p=0.001 n=5+5) CompareBytesBigUnaligned/offset=3-4 156MB/s _ 1% 175MB/s _ 5% +12.39% (p=0.002 n=5+5) CompareBytesBigUnaligned/offset=4-4 146MB/s _ 6% 176MB/s _ 6% +20.67% (p=0.001 n=5+5) CompareBytesBigUnaligned/offset=5-4 155MB/s _ 1% 181MB/s _ 7% +16.35% (p=0.002 n=5+5) CompareBytesBigUnaligned/offset=6-4 152MB/s _ 5% 182MB/s _ 2% +19.74% (p=0.000 n=5+4) CompareBytesBigUnaligned/offset=7-4 152MB/s _ 6% 171MB/s _ 6% +12.70% (p=0.001 n=5+5) CompareBytesBig-4 382MB/s _ 5% 388MB/s _ 9% ~ (p=0.616 n=5+5) CompareBytesBigIdentical-4 35.1TB/s _ 1% 35.1TB/s _ 0% ~ (p=0.800 n=5+5) Change-Id: I127edc376e62a2c529719a4ab172f481e0a81357 Reviewed-on: https://go-review.googlesource.com/c/go/+/431100 Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meng Zhuo <mzh@golangcn.org> Reviewed-by: Bryan Mills <bcmills@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Joedian Reid <joedian@golang.org> Run-TryBot: Joel Sing <joel@sing.id.au>
Diffstat (limited to 'src')
-rw-r--r--src/internal/bytealg/compare_riscv64.s173
1 files changed, 103 insertions, 70 deletions
diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
index 44a743d3af..68cba2a37f 100644
--- a/src/internal/bytealg/compare_riscv64.s
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -40,13 +40,13 @@ use_a_len:
BEQZ X5, cmp_len
MOV $32, X6
- BLT X5, X6, loop4_check
+ BLT X5, X6, check8_unaligned
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X7
AND $7, X12, X8
- BNE X7, X8, loop4_check
- BEQZ X7, loop32_check
+ BNE X7, X8, check8_unaligned
+ BEQZ X7, compare32
// Check one byte at a time until we reach 8 byte alignment.
SUB X7, X5, X5
@@ -59,94 +59,99 @@ align:
ADD $1, X12
BNEZ X7, align
-loop32_check:
- MOV $32, X7
- BLT X5, X7, loop16_check
-loop32:
+check32:
+ MOV $32, X6
+ BLT X5, X6, compare16
+compare32:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
- BEQ X15, X16, loop32a
- JMP cmp8a
-loop32a:
- BEQ X17, X18, loop32b
- JMP cmp8b
-loop32b:
+ BNE X15, X16, cmp8a
+ BNE X17, X18, cmp8b
MOV 16(X10), X15
MOV 16(X12), X16
MOV 24(X10), X17
MOV 24(X12), X18
- BEQ X15, X16, loop32c
- JMP cmp8a
-loop32c:
- BEQ X17, X18, loop32d
- JMP cmp8b
-loop32d:
+ BNE X15, X16, cmp8a
+ BNE X17, X18, cmp8b
ADD $32, X10
ADD $32, X12
ADD $-32, X5
- BGE X5, X7, loop32
+ BGE X5, X6, compare32
BEQZ X5, cmp_len
-loop16_check:
+check16:
MOV $16, X6
- BLT X5, X6, loop4_check
-loop16:
+ BLT X5, X6, check8_unaligned
+compare16:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
- BEQ X15, X16, loop16a
- JMP cmp8a
-loop16a:
- BEQ X17, X18, loop16b
- JMP cmp8b
-loop16b:
+ BNE X15, X16, cmp8a
+ BNE X17, X18, cmp8b
ADD $16, X10
ADD $16, X12
ADD $-16, X5
- BGE X5, X6, loop16
BEQZ X5, cmp_len
-loop4_check:
- MOV $4, X6
- BLT X5, X6, loop1
-loop4:
+check8_unaligned:
+ MOV $8, X6
+ BLT X5, X6, check4_unaligned
+compare8_unaligned:
MOVBU 0(X10), X8
+ MOVBU 1(X10), X15
+ MOVBU 2(X10), X17
+ MOVBU 3(X10), X19
+ MOVBU 4(X10), X21
+ MOVBU 5(X10), X23
+ MOVBU 6(X10), X25
+ MOVBU 7(X10), X29
MOVBU 0(X12), X9
+ MOVBU 1(X12), X16
+ MOVBU 2(X12), X18
+ MOVBU 3(X12), X20
+ MOVBU 4(X12), X22
+ MOVBU 5(X12), X24
+ MOVBU 6(X12), X28
+ MOVBU 7(X12), X30
+ BNE X8, X9, cmp1a
+ BNE X15, X16, cmp1b
+ BNE X17, X18, cmp1c
+ BNE X19, X20, cmp1d
+ BNE X21, X22, cmp1e
+ BNE X23, X24, cmp1f
+ BNE X25, X28, cmp1g
+ BNE X29, X30, cmp1h
+ ADD $8, X10
+ ADD $8, X12
+ ADD $-8, X5
+ BGE X5, X6, compare8_unaligned
+ BEQZ X5, cmp_len
+
+check4_unaligned:
+ MOV $4, X6
+ BLT X5, X6, compare1
+compare4_unaligned:
+ MOVBU 0(X10), X8
MOVBU 1(X10), X15
+ MOVBU 2(X10), X17
+ MOVBU 3(X10), X19
+ MOVBU 0(X12), X9
MOVBU 1(X12), X16
- BEQ X8, X9, loop4a
- SLTU X9, X8, X5
- SLTU X8, X9, X6
- JMP cmp_ret
-loop4a:
- BEQ X15, X16, loop4b
- SLTU X16, X15, X5
- SLTU X15, X16, X6
- JMP cmp_ret
-loop4b:
- MOVBU 2(X10), X21
- MOVBU 2(X12), X22
- MOVBU 3(X10), X23
- MOVBU 3(X12), X24
- BEQ X21, X22, loop4c
- SLTU X22, X21, X5
- SLTU X21, X22, X6
- JMP cmp_ret
-loop4c:
- BEQ X23, X24, loop4d
- SLTU X24, X23, X5
- SLTU X23, X24, X6
- JMP cmp_ret
-loop4d:
+ MOVBU 2(X12), X18
+ MOVBU 3(X12), X20
+ BNE X8, X9, cmp1a
+ BNE X15, X16, cmp1b
+ BNE X17, X18, cmp1c
+ BNE X19, X20, cmp1d
ADD $4, X10
ADD $4, X12
ADD $-4, X5
- BGE X5, X6, loop4
+ BGE X5, X6, compare4_unaligned
-loop1:
+compare1:
BEQZ X5, cmp_len
MOVBU 0(X10), X8
MOVBU 0(X12), X9
@@ -154,27 +159,55 @@ loop1:
ADD $1, X10
ADD $1, X12
ADD $-1, X5
- JMP loop1
+ JMP compare1
// Compare 8 bytes of memory in X15/X16 that are known to differ.
cmp8a:
- MOV $0xff, X19
-cmp8a_loop:
- AND X15, X19, X8
- AND X16, X19, X9
- BNE X8, X9, cmp
- SLLI $8, X19
- JMP cmp8a_loop
+ MOV X15, X17
+ MOV X16, X18
// Compare 8 bytes of memory in X17/X18 that are known to differ.
cmp8b:
MOV $0xff, X19
-cmp8b_loop:
+cmp8_loop:
AND X17, X19, X8
AND X18, X19, X9
BNE X8, X9, cmp
SLLI $8, X19
- JMP cmp8b_loop
+ JMP cmp8_loop
+
+cmp1a:
+ SLTU X9, X8, X5
+ SLTU X8, X9, X6
+ JMP cmp_ret
+cmp1b:
+ SLTU X16, X15, X5
+ SLTU X15, X16, X6
+ JMP cmp_ret
+cmp1c:
+ SLTU X18, X17, X5
+ SLTU X17, X18, X6
+ JMP cmp_ret
+cmp1d:
+ SLTU X20, X19, X5
+ SLTU X19, X20, X6
+ JMP cmp_ret
+cmp1e:
+ SLTU X22, X21, X5
+ SLTU X21, X22, X6
+ JMP cmp_ret
+cmp1f:
+ SLTU X24, X23, X5
+ SLTU X23, X24, X6
+ JMP cmp_ret
+cmp1g:
+ SLTU X28, X25, X5
+ SLTU X25, X28, X6
+ JMP cmp_ret
+cmp1h:
+ SLTU X30, X29, X5
+ SLTU X29, X30, X6
+ JMP cmp_ret
cmp_len:
MOV X11, X8