aboutsummaryrefslogtreecommitdiff
path: root/src/internal/bytealg
diff options
context:
space:
mode:
authorfanzha02 <fannie.zhang@arm.com>2018-01-26 09:18:31 +0000
committerCherry Zhang <cherryyz@google.com>2018-03-20 00:06:34 +0000
commitbfa8b6f8ffa49b3e5796af45a2f0bf9714a37bb9 (patch)
tree0fabed3fb136fa3aaa9adc646affc50367cd7198 /src/internal/bytealg
parent910c3a9dfc4ff8ea4c25d725783bce4887d790f2 (diff)
downloadgo-bfa8b6f8ffa49b3e5796af45a2f0bf9714a37bb9.tar.xz
bytes: add optimized Compare for arm64
Use LDP instructions to load 16 bytes per loop when the source length is long. Specially process the 8 bytes length, 4 bytes length and 2 bytes length to get a better performance. Benchmark result: name old time/op new time/op delta BytesCompare/1-8 21.0ns ± 0% 10.5ns ± 0% ~ (p=0.079 n=4+5) BytesCompare/2-8 11.5ns ± 0% 10.5ns ± 0% -8.70% (p=0.008 n=5+5) BytesCompare/4-8 13.5ns ± 0% 10.0ns ± 0% -25.93% (p=0.008 n=5+5) BytesCompare/8-8 28.8ns ± 0% 9.5ns ± 0% ~ (p=0.079 n=4+5) BytesCompare/16-8 40.5ns ± 0% 10.5ns ± 0% -74.07% (p=0.008 n=5+5) BytesCompare/32-8 64.6ns ± 0% 12.5ns ± 0% -80.65% (p=0.008 n=5+5) BytesCompare/64-8 112ns ± 0% 16ns ± 0% -85.27% (p=0.008 n=5+5) BytesCompare/128-8 208ns ± 0% 24ns ± 0% -88.22% (p=0.008 n=5+5) BytesCompare/256-8 400ns ± 0% 50ns ± 0% -87.62% (p=0.008 n=5+5) BytesCompare/512-8 785ns ± 0% 82ns ± 0% -89.61% (p=0.008 n=5+5) BytesCompare/1024-8 1.55µs ± 0% 0.14µs ± 0% ~ (p=0.079 n=4+5) BytesCompare/2048-8 3.09µs ± 0% 0.27µs ± 0% ~ (p=0.079 n=4+5) CompareBytesEqual-8 39.0ns ± 0% 12.0ns ± 0% -69.23% (p=0.008 n=5+5) CompareBytesToNil-8 8.57ns ± 5% 8.23ns ± 2% -3.99% (p=0.016 n=5+5) CompareBytesEmpty-8 7.37ns ± 0% 7.36ns ± 4% ~ (p=0.690 n=5+5) CompareBytesIdentical-8 7.39ns ± 0% 7.46ns ± 2% ~ (p=0.667 n=5+5) CompareBytesSameLength-8 17.0ns ± 0% 10.5ns ± 0% -38.24% (p=0.008 n=5+5) CompareBytesDifferentLength-8 17.0ns ± 0% 10.5ns ± 0% -38.24% (p=0.008 n=5+5) CompareBytesBigUnaligned-8 1.58ms ± 0% 0.19ms ± 0% -88.31% (p=0.016 n=4+5) CompareBytesBig-8 1.59ms ± 0% 0.19ms ± 0% -88.27% (p=0.016 n=5+4) CompareBytesBigIdentical-8 7.01ns ± 0% 6.60ns ± 3% -5.91% (p=0.008 n=5+5) name old speed new speed delta CompareBytesBigUnaligned-8 662MB/s ± 0% 5660MB/s ± 0% +755.15% (p=0.016 n=4+5) CompareBytesBig-8 661MB/s ± 0% 5636MB/s ± 0% +752.57% (p=0.016 n=5+4) CompareBytesBigIdentical-8 150TB/s ± 0% 159TB/s ± 3% +6.27% (p=0.008 n=5+5) Change-Id: I70332de06f873df3bc12c4a5af1028307b670046 Reviewed-on: https://go-review.googlesource.com/90175 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/internal/bytealg')
-rw-r--r--src/internal/bytealg/compare_arm64.s106
1 files changed, 87 insertions, 19 deletions
diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s
index 9b6354715a..2bd38064c3 100644
--- a/src/internal/bytealg/compare_arm64.s
+++ b/src/internal/bytealg/compare_arm64.s
@@ -10,7 +10,7 @@ TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
MOVD a_len+8(FP), R0
MOVD b_base+24(FP), R3
MOVD b_len+32(FP), R1
- ADD $56, RSP, R7
+ MOVD $ret+48(FP), R7
B cmpbody<>(SB)
TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
@@ -18,7 +18,7 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
MOVD a_len+8(FP), R0
MOVD b_base+24(FP), R3
MOVD b_len+32(FP), R1
- ADD $56, RSP, R7
+ MOVD $ret+48(FP), R7
B cmpbody<>(SB)
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
@@ -26,7 +26,7 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD a_len+8(FP), R0
MOVD b_base+16(FP), R3
MOVD b_len+24(FP), R1
- ADD $40, RSP, R7
+ MOVD $ret+32(FP), R7
B cmpbody<>(SB)
// On entry:
@@ -37,30 +37,98 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
// R7 points to return value (-1/0/1 will be written here)
//
// On exit:
-// R4, R5, and R6 are clobbered
+// R4, R5, R6, R8, R9 and R10 are clobbered
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
CMP R2, R3
- BEQ samebytes // same starting pointers; compare lengths
+ BEQ samebytes // same starting pointers; compare lengths
CMP R0, R1
- CSEL LT, R1, R0, R6 // R6 is min(R0, R1)
+ CSEL LT, R1, R0, R6 // R6 is min(R0, R1)
- ADD R2, R6 // R2 is current byte in a, R6 is last byte in a to compare
-loop:
- CMP R2, R6
- BEQ samebytes // all compared bytes were the same; compare lengths
- MOVBU.P 1(R2), R4
- MOVBU.P 1(R3), R5
+ CMP $0, R6
+ BEQ samebytes
+ BIC $0xf, R6, R10
+ CBZ R10, small // length < 16
+ ADD R2, R10 // end of chunk16
+ // length >= 16
+chunk16_loop:
+ LDP.P 16(R2), (R4, R8)
+ LDP.P 16(R3), (R5, R9)
CMP R4, R5
- BEQ loop
- // bytes differed
+ BNE cmp
+ CMP R8, R9
+ BNE cmpnext
+ CMP R10, R2
+ BNE chunk16_loop
+ AND $0xf, R6, R6
+ CBZ R6, samebytes
+ SUBS $8, R6
+ BLT tail
+ // the length of tail > 8 bytes
+ MOVD.P 8(R2), R4
+ MOVD.P 8(R3), R5
+ CMP R4, R5
+ BNE cmp
+ SUB $8, R6
+ // compare last 8 bytes
+tail:
+ MOVD (R2)(R6), R4
+ MOVD (R3)(R6), R5
+ CMP R4, R5
+ BEQ samebytes
+cmp:
+ REV R4, R4
+ REV R5, R5
+ CMP R4, R5
+ret:
MOVD $1, R4
- CSNEG LT, R4, R4, R4
+ CNEG HI, R4, R4
MOVD R4, (R7)
RET
+small:
+ TBZ $3, R6, lt_8
+ MOVD (R2), R4
+ MOVD (R3), R5
+ CMP R4, R5
+ BNE cmp
+ SUBS $8, R6
+ BEQ samebytes
+ ADD $8, R2
+ ADD $8, R3
+ SUB $8, R6
+ B tail
+lt_8:
+ TBZ $2, R6, lt_4
+ MOVWU (R2), R4
+ MOVWU (R3), R5
+ CMPW R4, R5
+ BNE cmp
+ SUBS $4, R6
+ BEQ samebytes
+ ADD $4, R2
+ ADD $4, R3
+lt_4:
+ TBZ $1, R6, lt_2
+ MOVHU (R2), R4
+ MOVHU (R3), R5
+ CMPW R4, R5
+ BNE cmp
+ ADD $2, R2
+ ADD $2, R3
+lt_2:
+ TBZ $0, R6, samebytes
+one:
+ MOVBU (R2), R4
+ MOVBU (R3), R5
+ CMPW R4, R5
+ BNE ret
samebytes:
- MOVD $1, R4
- CMP R0, R1
- CSNEG LT, R4, R4, R4
- CSEL EQ, ZR, R4, R4
+ CMP R1, R0
+ CSET NE, R4
+ CNEG LO, R4, R4
MOVD R4, (R7)
RET
+cmpnext:
+ REV R8, R4
+ REV R9, R5
+ CMP R4, R5
+ B ret