diff options
| author | Lynn Boger <laboger@linux.vnet.ibm.com> | 2018-08-27 17:15:39 -0400 |
|---|---|---|
| committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2018-10-23 19:22:44 +0000 |
| commit | 6994731ec2babb6a4f2bbbb08dbe649767a25942 (patch) | |
| tree | e0237884a1aa11ccbb97671db3e6ccd95accabc1 /src/internal | |
| parent | 5c472132bf88cc04c85ad5f848d8a2f77f21b228 (diff) | |
| download | go-6994731ec2babb6a4f2bbbb08dbe649767a25942.tar.xz | |
internal/bytealg: improve asm for memequal on ppc64x
This includes two changes to the memequal function.
Previously the asm implementation on ppc64x for Equal called the internal
function memequal using a BL, whereas the other asm implementations for
bytes functions on ppc64x used BR. The BR is preferred because the BL
causes the calling function to stack a frame. This changes Equal so it
uses BR and is consistent with the others.
This also uses vsx instructions where possible to improve performance
of the compares for sizes over 32.
Here are results from the sizes affected:
Equal/32 8.40ns ± 0% 7.66ns ± 0% -8.81% (p=0.029 n=4+4)
Equal/4K 193ns ± 0% 144ns ± 0% -25.39% (p=0.029 n=4+4)
Equal/4M 346µs ± 0% 277µs ± 0% -20.08% (p=0.029 n=4+4)
Equal/64M 7.66ms ± 1% 7.27ms ± 0% -5.10% (p=0.029 n=4+4)
Change-Id: Ib6ee2cdc3e5d146e2705e3338858b8e965d25420
Reviewed-on: https://go-review.googlesource.com/c/143060
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
Diffstat (limited to 'src/internal')
| -rw-r--r-- | src/internal/bytealg/equal_ppc64x.s | 64 |
1 files changed, 26 insertions, 38 deletions
diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s index 9c9cf77588..34d2a2574b 100644 --- a/src/internal/bytealg/equal_ppc64x.s +++ b/src/internal/bytealg/equal_ppc64x.s @@ -7,17 +7,15 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·Equal(SB),NOSPLIT,$0-49 +TEXT ·Equal(SB),NOSPLIT|NOFRAME,$0-49 MOVD a_len+8(FP), R4 MOVD b_len+32(FP), R5 CMP R5, R4 // unequal lengths are not equal BNE noteq MOVD a_base+0(FP), R3 MOVD b_base+24(FP), R4 - BL memeqbody<>(SB) - - MOVBZ R9,ret+48(FP) - RET + MOVD $ret+48(FP), R10 + BR memeqbody<>(SB) noteq: MOVBZ $0,ret+48(FP) @@ -28,7 +26,7 @@ equal: MOVBZ R3,ret+48(FP) RET -TEXT bytes·Equal(SB),NOSPLIT,$0-49 +TEXT bytes·Equal(SB),NOSPLIT|NOFRAME,$0-49 FUNCDATA $0, ·Equal·args_stackmap(SB) MOVD a_len+8(FP), R4 MOVD b_len+32(FP), R5 @@ -36,10 +34,8 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49 BNE noteq MOVD a_base+0(FP), R3 MOVD b_base+24(FP), R4 - BL memeqbody<>(SB) - - MOVBZ R9,ret+48(FP) - RET + MOVD $ret+48(FP), R10 + BR memeqbody<>(SB) noteq: MOVBZ $0,ret+48(FP) @@ -51,25 +47,23 @@ equal: RET // memequal(a, b unsafe.Pointer, size uintptr) bool -TEXT runtime·memequal(SB),NOSPLIT,$0-25 +TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 MOVD a+0(FP), R3 MOVD b+8(FP), R4 MOVD size+16(FP), R5 + MOVD $ret+24(FP), R10 - BL memeqbody<>(SB) - MOVB R9, ret+24(FP) - RET + BR memeqbody<>(SB) // memequal_varlen(a, b unsafe.Pointer) bool -TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17 +TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17 MOVD a+0(FP), R3 MOVD b+8(FP), R4 CMP R3, R4 BEQ eq MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure - BL memeqbody<>(SB) - MOVB R9, ret+16(FP) - RET + MOVD $ret+16(FP), R10 + BR memeqbody<>(SB) eq: MOVD $1, R3 MOVB R3, ret+16(FP) @@ -79,7 +73,7 @@ eq: // R3 = s1 // R4 = s2 // R5 = len -// R9 = return value +// R10 = addr of return value (byte) TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R5,CTR CMP R5,$8 // only optimize >=8 @@ -92,26 +86,19 @@ TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 setup32a: // 8 byte aligned, >= 32 bytes SRADCC $5,R5,R6 // number of 32 byte chunks to compare MOVD R6,CTR + MOVD $16,R14 // index for VSX loads and stores loop32a: - MOVD 0(R3),R6 // doublewords to compare - MOVD 0(R4),R7 - MOVD 8(R3),R8 // - MOVD 8(R4),R9 - CMP R6,R7 // bytes batch? - BNE noteq - MOVD 16(R3),R6 - MOVD 16(R4),R7 - CMP R8,R9 // bytes match? - MOVD 24(R3),R8 - MOVD 24(R4),R9 - BNE noteq - CMP R6,R7 // bytes match? - BNE noteq + LXVD2X (R3+R0), VS32 // VS32 = V0 + LXVD2X (R4+R0), VS33 // VS33 = V1 + VCMPEQUBCC V0, V1, V2 // compare, setting CR6 + BGE CR6, noteq + LXVD2X (R3+R14), VS32 + LXVD2X (R4+R14), VS33 + VCMPEQUBCC V0, V1, V2 + BGE CR6, noteq ADD $32,R3 // bump up to next 32 ADD $32,R4 - CMP R8,R9 // bytes match? - BC 8,2,loop32a // br ctr and cr - BNE noteq + BC 16, 0, loop32a // br ctr and cr ANDCC $24,R5,R6 // Any 8 byte chunks? BEQ leftover // and result is 0 setup8a: @@ -145,9 +132,10 @@ simple: BNE noteq BR equal noteq: - MOVD $0, R9 + MOVB $0, (R10) RET equal: - MOVD $1, R9 + MOVD $1, R3 + MOVB R3, (R10) RET |
