aboutsummaryrefslogtreecommitdiff
path: root/src/internal/bytealg
diff options
context:
space:
mode:
authorKeith Randall <khr@golang.org>2018-03-03 14:24:54 -0800
committerKeith Randall <khr@golang.org>2018-03-04 17:49:39 +0000
commitf6332bb84ad87e958290ae23b29a2b13a41ee2a2 (patch)
treef09ef9174bee3ae86920a113318f4a322a1a98ad /src/internal/bytealg
parent45964e4f9c950863adcaeb62fbe49f3fa913f27d (diff)
downloadgo-f6332bb84ad87e958290ae23b29a2b13a41ee2a2.tar.xz
internal/bytealg: move compare functions to bytealg
Move bytes.Compare and runtime·cmpstring to bytealg. Update #19792 Change-Id: I139e6d7c59686bef7a3017e3dec99eba5fd10447 Reviewed-on: https://go-review.googlesource.com/98515 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/internal/bytealg')
-rw-r--r--src/internal/bytealg/compare_386.s151
-rw-r--r--src/internal/bytealg/compare_amd64.s236
-rw-r--r--src/internal/bytealg/compare_amd64p32.s151
-rw-r--r--src/internal/bytealg/compare_arm.s67
-rw-r--r--src/internal/bytealg/compare_arm64.s66
-rw-r--r--src/internal/bytealg/compare_generic.go88
-rw-r--r--src/internal/bytealg/compare_mipsx.s104
-rw-r--r--src/internal/bytealg/compare_native.go10
-rw-r--r--src/internal/bytealg/compare_ppc64x.s307
-rw-r--r--src/internal/bytealg/compare_s390x.s75
10 files changed, 1255 insertions, 0 deletions
diff --git a/src/internal/bytealg/compare_386.s b/src/internal/bytealg/compare_386.s
new file mode 100644
index 0000000000..f2a7fcce24
--- /dev/null
+++ b/src/internal/bytealg/compare_386.s
@@ -0,0 +1,151 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+12(FP), DI
+ MOVL b_len+16(FP), DX
+ LEAL ret+24(FP), AX
+ JMP cmpbody<>(SB)
+
+TEXT bytes·Compare(SB),NOSPLIT,$0-28
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+12(FP), DI
+ MOVL b_len+16(FP), DX
+ LEAL ret+24(FP), AX
+ JMP cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+8(FP), DI
+ MOVL b_len+12(FP), DX
+ LEAL ret+16(FP), AX
+ JMP cmpbody<>(SB)
+
+// input:
+// SI = a
+// DI = b
+// BX = alen
+// DX = blen
+// AX = address of return word (set to 1/0/-1)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+ MOVL DX, BP
+ SUBL BX, DX // DX = blen-alen
+ JLE 2(PC)
+ MOVL BX, BP // BP = min(alen, blen)
+ CMPL SI, DI
+ JEQ allsame
+ CMPL BP, $4
+ JB small
+ CMPB runtime·support_sse2(SB), $1
+ JNE mediumloop
+largeloop:
+ CMPL BP, $16
+ JB mediumloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, BX
+ XORL $0xffff, BX // convert EQ to NE
+ JNE diff16 // branch if at least one byte is not equal
+ ADDL $16, SI
+ ADDL $16, DI
+ SUBL $16, BP
+ JMP largeloop
+
+diff16:
+ BSFL BX, BX // index of first byte that differs
+ XORL DX, DX
+ MOVB (SI)(BX*1), CX
+ CMPB CX, (DI)(BX*1)
+ SETHI DX
+ LEAL -1(DX*2), DX // convert 1/0 to +1/-1
+ MOVL DX, (AX)
+ RET
+
+mediumloop:
+ CMPL BP, $4
+ JBE _0through4
+ MOVL (SI), BX
+ MOVL (DI), CX
+ CMPL BX, CX
+ JNE diff4
+ ADDL $4, SI
+ ADDL $4, DI
+ SUBL $4, BP
+ JMP mediumloop
+
+_0through4:
+ MOVL -4(SI)(BP*1), BX
+ MOVL -4(DI)(BP*1), CX
+ CMPL BX, CX
+ JEQ allsame
+
+diff4:
+ BSWAPL BX // reverse order of bytes
+ BSWAPL CX
+ XORL BX, CX // find bit differences
+ BSRL CX, CX // index of highest bit difference
+ SHRL CX, BX // move a's bit to bottom
+ ANDL $1, BX // mask bit
+ LEAL -1(BX*2), BX // 1/0 => +1/-1
+ MOVL BX, (AX)
+ RET
+
+ // 0-3 bytes in common
+small:
+ LEAL (BP*8), CX
+ NEGL CX
+ JEQ allsame
+
+ // load si
+ CMPB SI, $0xfc
+ JA si_high
+ MOVL (SI), SI
+ JMP si_finish
+si_high:
+ MOVL -4(SI)(BP*1), SI
+ SHRL CX, SI
+si_finish:
+ SHLL CX, SI
+
+ // same for di
+ CMPB DI, $0xfc
+ JA di_high
+ MOVL (DI), DI
+ JMP di_finish
+di_high:
+ MOVL -4(DI)(BP*1), DI
+ SHRL CX, DI
+di_finish:
+ SHLL CX, DI
+
+ BSWAPL SI // reverse order of bytes
+ BSWAPL DI
+ XORL SI, DI // find bit differences
+ JEQ allsame
+ BSRL DI, CX // index of highest bit difference
+ SHRL CX, SI // move a's bit to bottom
+ ANDL $1, SI // mask bit
+ LEAL -1(SI*2), BX // 1/0 => +1/-1
+ MOVL BX, (AX)
+ RET
+
+ // all the bytes in common are the same, so we just need
+ // to compare the lengths.
+allsame:
+ XORL BX, BX
+ XORL CX, CX
+ TESTL DX, DX
+ SETLT BX // 1 if alen > blen
+ SETEQ CX // 1 if alen == blen
+ LEAL -1(CX)(BX*2), BX // 1,0,-1 result
+ MOVL BX, (AX)
+ RET
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
new file mode 100644
index 0000000000..7d950286e3
--- /dev/null
+++ b/src/internal/bytealg/compare_amd64.s
@@ -0,0 +1,236 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), SI
+ MOVQ a_len+8(FP), BX
+ MOVQ b_base+24(FP), DI
+ MOVQ b_len+32(FP), DX
+ LEAQ ret+48(FP), R9
+ JMP cmpbody<>(SB)
+
+TEXT bytes·Compare(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), SI
+ MOVQ a_len+8(FP), BX
+ MOVQ b_base+24(FP), DI
+ MOVQ b_len+32(FP), DX
+ LEAQ ret+48(FP), R9
+ JMP cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+ MOVQ a_base+0(FP), SI
+ MOVQ a_len+8(FP), BX
+ MOVQ b_base+16(FP), DI
+ MOVQ b_len+24(FP), DX
+ LEAQ ret+32(FP), R9
+ JMP cmpbody<>(SB)
+
+// input:
+// SI = a
+// DI = b
+// BX = alen
+// DX = blen
+// R9 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+ CMPQ SI, DI
+ JEQ allsame
+ CMPQ BX, DX
+ MOVQ DX, R8
+ CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
+ CMPQ R8, $8
+ JB small
+
+ CMPQ R8, $63
+ JBE loop
+ CMPB internal∕cpu·X86+const_x86_HasAVX2(SB), $1
+ JEQ big_loop_avx2
+ JMP big_loop
+loop:
+ CMPQ R8, $16
+ JBE _0through16
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX // convert EQ to NE
+ JNE diff16 // branch if at least one byte is not equal
+ ADDQ $16, SI
+ ADDQ $16, DI
+ SUBQ $16, R8
+ JMP loop
+
+diff64:
+ ADDQ $48, SI
+ ADDQ $48, DI
+ JMP diff16
+diff48:
+ ADDQ $32, SI
+ ADDQ $32, DI
+ JMP diff16
+diff32:
+ ADDQ $16, SI
+ ADDQ $16, DI
+ // AX = bit mask of differences
+diff16:
+ BSFQ AX, BX // index of first byte that differs
+ XORQ AX, AX
+ MOVB (SI)(BX*1), CX
+ CMPB CX, (DI)(BX*1)
+ SETHI AX
+ LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
+ MOVQ AX, (R9)
+ RET
+
+ // 0 through 16 bytes left, alen>=8, blen>=8
+_0through16:
+ CMPQ R8, $8
+ JBE _0through8
+ MOVQ (SI), AX
+ MOVQ (DI), CX
+ CMPQ AX, CX
+ JNE diff8
+_0through8:
+ MOVQ -8(SI)(R8*1), AX
+ MOVQ -8(DI)(R8*1), CX
+ CMPQ AX, CX
+ JEQ allsame
+
+ // AX and CX contain parts of a and b that differ.
+diff8:
+ BSWAPQ AX // reverse order of bytes
+ BSWAPQ CX
+ XORQ AX, CX
+ BSRQ CX, CX // index of highest bit difference
+ SHRQ CX, AX // move a's bit to bottom
+ ANDQ $1, AX // mask bit
+ LEAQ -1(AX*2), AX // 1/0 => +1/-1
+ MOVQ AX, (R9)
+ RET
+
+ // 0-7 bytes in common
+small:
+ LEAQ (R8*8), CX // bytes left -> bits left
+ NEGQ CX // - bits lift (== 64 - bits left mod 64)
+ JEQ allsame
+
+ // load bytes of a into high bytes of AX
+ CMPB SI, $0xf8
+ JA si_high
+ MOVQ (SI), SI
+ JMP si_finish
+si_high:
+ MOVQ -8(SI)(R8*1), SI
+ SHRQ CX, SI
+si_finish:
+ SHLQ CX, SI
+
+ // load bytes of b in to high bytes of BX
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ MOVQ -8(DI)(R8*1), DI
+ SHRQ CX, DI
+di_finish:
+ SHLQ CX, DI
+
+ BSWAPQ SI // reverse order of bytes
+ BSWAPQ DI
+ XORQ SI, DI // find bit differences
+ JEQ allsame
+ BSRQ DI, CX // index of highest bit difference
+ SHRQ CX, SI // move a's bit to bottom
+ ANDQ $1, SI // mask bit
+ LEAQ -1(SI*2), AX // 1/0 => +1/-1
+ MOVQ AX, (R9)
+ RET
+
+allsame:
+ XORQ AX, AX
+ XORQ CX, CX
+ CMPQ BX, DX
+ SETGT AX // 1 if alen > blen
+ SETEQ CX // 1 if alen == blen
+ LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
+ MOVQ AX, (R9)
+ RET
+
+ // this works for >= 64 bytes of data.
+big_loop:
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff16
+
+ MOVOU 16(SI), X0
+ MOVOU 16(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff32
+
+ MOVOU 32(SI), X0
+ MOVOU 32(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff48
+
+ MOVOU 48(SI), X0
+ MOVOU 48(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff64
+
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, R8
+ CMPQ R8, $64
+ JBE loop
+ JMP big_loop
+
+ // Compare 64-bytes per loop iteration.
+ // Loop is unrolled and uses AVX2.
+big_loop_avx2:
+ VMOVDQU (SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(SI), Y4
+ VMOVDQU 32(DI), Y5
+ VPCMPEQB Y2, Y3, Y0
+ VPMOVMSKB Y0, AX
+ XORL $0xffffffff, AX
+ JNE diff32_avx2
+ VPCMPEQB Y4, Y5, Y6
+ VPMOVMSKB Y6, AX
+ XORL $0xffffffff, AX
+ JNE diff64_avx2
+
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, R8
+ CMPQ R8, $64
+ JB big_loop_avx2_exit
+ JMP big_loop_avx2
+
+ // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+ VZEROUPPER
+ JMP diff16
+
+ // Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+ VZEROUPPER
+ JMP diff48
+
+ // For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+ VZEROUPPER
+ JMP loop
diff --git a/src/internal/bytealg/compare_amd64p32.s b/src/internal/bytealg/compare_amd64p32.s
new file mode 100644
index 0000000000..0f23147338
--- /dev/null
+++ b/src/internal/bytealg/compare_amd64p32.s
@@ -0,0 +1,151 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+12(FP), DI
+ MOVL b_len+16(FP), DX
+ CALL cmpbody<>(SB)
+ MOVL AX, ret+24(FP)
+ RET
+
+TEXT bytes·Compare(SB),NOSPLIT,$0-28
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+12(FP), DI
+ MOVL b_len+16(FP), DX
+ CALL cmpbody<>(SB)
+ MOVL AX, ret+24(FP)
+ RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+8(FP), DI
+ MOVL b_len+12(FP), DX
+ CALL cmpbody<>(SB)
+ MOVL AX, ret+16(FP)
+ RET
+
+// input:
+// SI = a
+// DI = b
+// BX = alen
+// DX = blen
+// output:
+// AX = 1/0/-1
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+ CMPQ SI, DI
+ JEQ allsame
+ CMPQ BX, DX
+ MOVQ DX, R8
+ CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
+ CMPQ R8, $8
+ JB small
+
+loop:
+ CMPQ R8, $16
+ JBE _0through16
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX // convert EQ to NE
+ JNE diff16 // branch if at least one byte is not equal
+ ADDQ $16, SI
+ ADDQ $16, DI
+ SUBQ $16, R8
+ JMP loop
+
+ // AX = bit mask of differences
+diff16:
+ BSFQ AX, BX // index of first byte that differs
+ XORQ AX, AX
+ ADDQ BX, SI
+ MOVB (SI), CX
+ ADDQ BX, DI
+ CMPB CX, (DI)
+ SETHI AX
+ LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
+ RET
+
+ // 0 through 16 bytes left, alen>=8, blen>=8
+_0through16:
+ CMPQ R8, $8
+ JBE _0through8
+ MOVQ (SI), AX
+ MOVQ (DI), CX
+ CMPQ AX, CX
+ JNE diff8
+_0through8:
+ ADDQ R8, SI
+ ADDQ R8, DI
+ MOVQ -8(SI), AX
+ MOVQ -8(DI), CX
+ CMPQ AX, CX
+ JEQ allsame
+
+ // AX and CX contain parts of a and b that differ.
+diff8:
+ BSWAPQ AX // reverse order of bytes
+ BSWAPQ CX
+ XORQ AX, CX
+ BSRQ CX, CX // index of highest bit difference
+ SHRQ CX, AX // move a's bit to bottom
+ ANDQ $1, AX // mask bit
+ LEAQ -1(AX*2), AX // 1/0 => +1/-1
+ RET
+
+ // 0-7 bytes in common
+small:
+ LEAQ (R8*8), CX // bytes left -> bits left
+ NEGQ CX // - bits lift (== 64 - bits left mod 64)
+ JEQ allsame
+
+ // load bytes of a into high bytes of AX
+ CMPB SI, $0xf8
+ JA si_high
+ MOVQ (SI), SI
+ JMP si_finish
+si_high:
+ ADDQ R8, SI
+ MOVQ -8(SI), SI
+ SHRQ CX, SI
+si_finish:
+ SHLQ CX, SI
+
+ // load bytes of b in to high bytes of BX
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ ADDQ R8, DI
+ MOVQ -8(DI), DI
+ SHRQ CX, DI
+di_finish:
+ SHLQ CX, DI
+
+ BSWAPQ SI // reverse order of bytes
+ BSWAPQ DI
+ XORQ SI, DI // find bit differences
+ JEQ allsame
+ BSRQ DI, CX // index of highest bit difference
+ SHRQ CX, SI // move a's bit to bottom
+ ANDQ $1, SI // mask bit
+ LEAQ -1(SI*2), AX // 1/0 => +1/-1
+ RET
+
+allsame:
+ XORQ AX, AX
+ XORQ CX, CX
+ CMPQ BX, DX
+ SETGT AX // 1 if alen > blen
+ SETEQ CX // 1 if alen == blen
+ LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
+ RET
diff --git a/src/internal/bytealg/compare_arm.s b/src/internal/bytealg/compare_arm.s
new file mode 100644
index 0000000000..72cae3309c
--- /dev/null
+++ b/src/internal/bytealg/compare_arm.s
@@ -0,0 +1,67 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-28
+ MOVW a_base+0(FP), R2
+ MOVW a_len+4(FP), R0
+ MOVW b_base+12(FP), R3
+ MOVW b_len+16(FP), R1
+ ADD $28, R13, R7
+ B cmpbody<>(SB)
+
+TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-28
+ MOVW a_base+0(FP), R2
+ MOVW a_len+4(FP), R0
+ MOVW b_base+12(FP), R3
+ MOVW b_len+16(FP), R1
+ ADD $28, R13, R7
+ B cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20
+ MOVW a_base+0(FP), R2
+ MOVW a_len+4(FP), R0
+ MOVW b_base+8(FP), R3
+ MOVW b_len+12(FP), R1
+ ADD $20, R13, R7
+ B cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, and R6 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMP R2, R3
+ BEQ samebytes
+ CMP R0, R1
+ MOVW R0, R6
+ MOVW.LT R1, R6 // R6 is min(R0, R1)
+
+ ADD R2, R6 // R2 is current byte in a, R6 is last byte in a to compare
+loop:
+ CMP R2, R6
+ BEQ samebytes // all compared bytes were the same; compare lengths
+ MOVBU.P 1(R2), R4
+ MOVBU.P 1(R3), R5
+ CMP R4, R5
+ BEQ loop
+ // bytes differed
+ MOVW.LT $1, R0
+ MOVW.GT $-1, R0
+ MOVW R0, (R7)
+ RET
+samebytes:
+ CMP R0, R1
+ MOVW.LT $1, R0
+ MOVW.GT $-1, R0
+ MOVW.EQ $0, R0
+ MOVW R0, (R7)
+ RET
diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s
new file mode 100644
index 0000000000..9b6354715a
--- /dev/null
+++ b/src/internal/bytealg/compare_arm64.s
@@ -0,0 +1,66 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOVD a_base+0(FP), R2
+ MOVD a_len+8(FP), R0
+ MOVD b_base+24(FP), R3
+ MOVD b_len+32(FP), R1
+ ADD $56, RSP, R7
+ B cmpbody<>(SB)
+
+TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOVD a_base+0(FP), R2
+ MOVD a_len+8(FP), R0
+ MOVD b_base+24(FP), R3
+ MOVD b_len+32(FP), R1
+ ADD $56, RSP, R7
+ B cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD a_base+0(FP), R2
+ MOVD a_len+8(FP), R0
+ MOVD b_base+16(FP), R3
+ MOVD b_len+24(FP), R1
+ ADD $40, RSP, R7
+ B cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, and R6 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMP R2, R3
+ BEQ samebytes // same starting pointers; compare lengths
+ CMP R0, R1
+ CSEL LT, R1, R0, R6 // R6 is min(R0, R1)
+
+ ADD R2, R6 // R2 is current byte in a, R6 is last byte in a to compare
+loop:
+ CMP R2, R6
+ BEQ samebytes // all compared bytes were the same; compare lengths
+ MOVBU.P 1(R2), R4
+ MOVBU.P 1(R3), R5
+ CMP R4, R5
+ BEQ loop
+ // bytes differed
+ MOVD $1, R4
+ CSNEG LT, R4, R4, R4
+ MOVD R4, (R7)
+ RET
+samebytes:
+ MOVD $1, R4
+ CMP R0, R1
+ CSNEG LT, R4, R4, R4
+ CSEL EQ, ZR, R4, R4
+ MOVD R4, (R7)
+ RET
diff --git a/src/internal/bytealg/compare_generic.go b/src/internal/bytealg/compare_generic.go
new file mode 100644
index 0000000000..69610517b2
--- /dev/null
+++ b/src/internal/bytealg/compare_generic.go
@@ -0,0 +1,88 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !386,!amd64,!amd64p32,!s390x,!arm,!arm64,!ppc64,!ppc64le,!mips,!mipsle
+
+package bytealg
+
+import _ "unsafe" // for go:linkname
+
+func Compare(a, b []byte) int {
+ l := len(a)
+ if len(b) < l {
+ l = len(b)
+ }
+ if l == 0 || &a[0] == &b[0] {
+ goto samebytes
+ }
+ for i := 0; i < l; i++ {
+ c1, c2 := a[i], b[i]
+ if c1 < c2 {
+ return -1
+ }
+ if c1 > c2 {
+ return +1
+ }
+ }
+samebytes:
+ if len(a) < len(b) {
+ return -1
+ }
+ if len(a) > len(b) {
+ return +1
+ }
+ return 0
+}
+
+//go:linkname bytes_Compare bytes.Compare
+func bytes_Compare(a, b []byte) int {
+ l := len(a)
+ if len(b) < l {
+ l = len(b)
+ }
+ if l == 0 || &a[0] == &b[0] {
+ goto samebytes
+ }
+ for i := 0; i < l; i++ {
+ c1, c2 := a[i], b[i]
+ if c1 < c2 {
+ return -1
+ }
+ if c1 > c2 {
+ return +1
+ }
+ }
+samebytes:
+ if len(a) < len(b) {
+ return -1
+ }
+ if len(a) > len(b) {
+ return +1
+ }
+ return 0
+}
+
+//go:linkname runtime_cmpstring runtime.cmpstring
+func runtime_cmpstring(a, b string) int {
+ l := len(a)
+ if len(b) < l {
+ l = len(b)
+ }
+ for i := 0; i < l; i++ {
+ c1, c2 := a[i], b[i]
+ if c1 < c2 {
+ return -1
+ }
+ if c1 > c2 {
+ return +1
+ }
+ }
+ if len(a) < len(b) {
+ return -1
+ }
+ if len(a) > len(b) {
+ return +1
+ }
+ return 0
+}
diff --git a/src/internal/bytealg/compare_mipsx.s b/src/internal/bytealg/compare_mipsx.s
new file mode 100644
index 0000000000..b8e225ea85
--- /dev/null
+++ b/src/internal/bytealg/compare_mipsx.s
@@ -0,0 +1,104 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+ MOVW a_base+0(FP), R3
+ MOVW b_base+12(FP), R4
+ MOVW a_len+4(FP), R1
+ MOVW b_len+16(FP), R2
+ BEQ R3, R4, samebytes
+ SGTU R1, R2, R7
+ MOVW R1, R8
+ CMOVN R7, R2, R8 // R8 is min(R1, R2)
+
+ ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare
+loop:
+ BEQ R3, R8, samebytes
+
+ MOVBU (R3), R6
+ ADDU $1, R3
+ MOVBU (R4), R7
+ ADDU $1, R4
+ BEQ R6, R7 , loop
+
+ SGTU R6, R7, R8
+ MOVW $-1, R6
+ CMOVZ R8, R6, R8
+ JMP cmp_ret
+samebytes:
+ SGTU R1, R2, R6
+ SGTU R2, R1, R7
+ SUBU R7, R6, R8
+cmp_ret:
+ MOVW R8, ret+24(FP)
+ RET
+
+TEXT bytes·Compare(SB),NOSPLIT,$0-28
+ MOVW a_base+0(FP), R3
+ MOVW b_base+12(FP), R4
+ MOVW a_len+4(FP), R1
+ MOVW b_len+16(FP), R2
+ BEQ R3, R4, samebytes
+ SGTU R1, R2, R7
+ MOVW R1, R8
+ CMOVN R7, R2, R8 // R8 is min(R1, R2)
+
+ ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare
+loop:
+ BEQ R3, R8, samebytes
+
+ MOVBU (R3), R6
+ ADDU $1, R3
+ MOVBU (R4), R7
+ ADDU $1, R4
+ BEQ R6, R7 , loop
+
+ SGTU R6, R7, R8
+ MOVW $-1, R6
+ CMOVZ R8, R6, R8
+ JMP cmp_ret
+samebytes:
+ SGTU R1, R2, R6
+ SGTU R2, R1, R7
+ SUBU R7, R6, R8
+cmp_ret:
+ MOVW R8, ret+24(FP)
+ RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+ MOVW a_base+0(FP), R3
+ MOVW a_len+4(FP), R1
+ MOVW b_base+8(FP), R4
+ MOVW b_len+12(FP), R2
+ BEQ R3, R4, samebytes
+ SGTU R1, R2, R7
+ MOVW R1, R8
+ CMOVN R7, R2, R8 // R8 is min(R1, R2)
+
+ ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare
+loop:
+ BEQ R3, R8, samebytes // all compared bytes were the same; compare lengths
+
+ MOVBU (R3), R6
+ ADDU $1, R3
+ MOVBU (R4), R7
+ ADDU $1, R4
+ BEQ R6, R7 , loop
+ // bytes differed
+ SGTU R6, R7, R8
+ MOVW $-1, R6
+ CMOVZ R8, R6, R8
+ JMP cmp_ret
+samebytes:
+ SGTU R1, R2, R6
+ SGTU R2, R1, R7
+ SUBU R7, R6, R8
+cmp_ret:
+ MOVW R8, ret+16(FP)
+ RET
diff --git a/src/internal/bytealg/compare_native.go b/src/internal/bytealg/compare_native.go
new file mode 100644
index 0000000000..8ffade55be
--- /dev/null
+++ b/src/internal/bytealg/compare_native.go
@@ -0,0 +1,10 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32 s390x arm arm64 ppc64 ppc64le mips mipsle
+
+package bytealg
+
+//go:noescape
+func Compare(a, b []byte) int
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
new file mode 100644
index 0000000000..9b13c9a14d
--- /dev/null
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -0,0 +1,307 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOVD a_base+0(FP), R5
+ MOVD b_base+24(FP), R6
+ MOVD a_len+8(FP), R3
+ CMP R5,R6,CR7
+ MOVD b_len+32(FP), R4
+ MOVD $ret+48(FP), R7
+ CMP R3,R4,CR6
+ BEQ CR7,equal
+
+#ifdef GOARCH_ppc64le
+ BR cmpbodyLE<>(SB)
+#else
+ BR cmpbodyBE<>(SB)
+#endif
+
+equal:
+ BEQ CR6,done
+ MOVD $1, R8
+ BGT CR6,greater
+ NEG R8
+
+greater:
+ MOVD R8, (R7)
+ RET
+
+done:
+ MOVD $0, (R7)
+ RET
+
+TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOVD a_base+0(FP), R5
+ MOVD b_base+24(FP), R6
+ MOVD a_len+8(FP), R3
+ CMP R5,R6,CR7
+ MOVD b_len+32(FP), R4
+ MOVD $ret+48(FP), R7
+ CMP R3,R4,CR6
+ BEQ CR7,equal
+
+#ifdef GOARCH_ppc64le
+ BR cmpbodyLE<>(SB)
+#else
+ BR cmpbodyBE<>(SB)
+#endif
+
+equal:
+ BEQ CR6,done
+ MOVD $1, R8
+ BGT CR6,greater
+ NEG R8
+
+greater:
+ MOVD R8, (R7)
+ RET
+
+done:
+ MOVD $0, (R7)
+ RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD a_base+0(FP), R5
+ MOVD b_base+16(FP), R6
+ MOVD a_len+8(FP), R3
+ CMP R5,R6,CR7
+ MOVD b_len+24(FP), R4
+ MOVD $ret+32(FP), R7
+ CMP R3,R4,CR6
+ BEQ CR7,equal
+
+#ifdef GOARCH_ppc64le
+ BR cmpbodyLE<>(SB)
+#else
+ BR cmpbodyBE<>(SB)
+#endif
+
+equal:
+ BEQ CR6,done
+ MOVD $1, R8
+ BGT CR6,greater
+ NEG R8
+
+greater:
+ MOVD R8, (R7)
+ RET
+
+done:
+ MOVD $0, (R7)
+ RET
+
+// Do an efficient memcmp for ppc64le
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// R7 = addr of return value
+TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+ MOVD R3,R8 // set up length
+ CMP R3,R4,CR2 // unequal?
+ BC 12,8,setuplen // BLT CR2
+ MOVD R4,R8 // use R4 for comparison len
+setuplen:
+ MOVD R8,CTR // set up loop counter
+ CMP R8,$8 // only optimize >=8
+ BLT simplecheck
+ DCBT (R5) // cache hint
+ DCBT (R6)
+ CMP R8,$32 // optimize >= 32
+ MOVD R8,R9
+ BLT setup8a // 8 byte moves only
+setup32a:
+ SRADCC $5,R8,R9 // number of 32 byte chunks
+ MOVD R9,CTR
+
+ // Special processing for 32 bytes or longer.
+ // Loading this way is faster and correct as long as the
+ // doublewords being compared are equal. Once they
+ // are found unequal, reload them in proper byte order
+ // to determine greater or less than.
+loop32a:
+ MOVD 0(R5),R9 // doublewords to compare
+ MOVD 0(R6),R10 // get 4 doublewords
+ MOVD 8(R5),R14
+ MOVD 8(R6),R15
+ CMPU R9,R10 // bytes equal?
+ MOVD $0,R16 // set up for cmpne
+ BNE cmpne // further compare for LT or GT
+ MOVD 16(R5),R9 // get next pair of doublewords
+ MOVD 16(R6),R10
+ CMPU R14,R15 // bytes match?
+ MOVD $8,R16 // set up for cmpne
+ BNE cmpne // further compare for LT or GT
+ MOVD 24(R5),R14 // get next pair of doublewords
+ MOVD 24(R6),R15
+ CMPU R9,R10 // bytes match?
+ MOVD $16,R16 // set up for cmpne
+ BNE cmpne // further compare for LT or GT
+ MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32
+ ADD $32,R5 // bump up to next 32
+ ADD $32,R6
+ CMPU R14,R15 // bytes match?
+ BC 8,2,loop32a // br ctr and cr
+ BNE cmpne
+ ANDCC $24,R8,R9 // Any 8 byte chunks?
+ BEQ leftover // and result is 0
+setup8a:
+ SRADCC $3,R9,R9 // get the 8 byte count
+ BEQ leftover // shifted value is 0
+ MOVD R9,CTR // loop count for doublewords
+loop8:
+ MOVDBR (R5+R0),R9 // doublewords to compare
+ MOVDBR (R6+R0),R10 // LE compare order
+ ADD $8,R5
+ ADD $8,R6
+ CMPU R9,R10 // match?
+ BC 8,2,loop8 // bt ctr <> 0 && cr
+ BGT greater
+ BLT less
+leftover:
+ ANDCC $7,R8,R9 // check for leftover bytes
+ MOVD R9,CTR // save the ctr
+ BNE simple // leftover bytes
+ BC 12,10,equal // test CR2 for length comparison
+ BC 12,8,less
+ BR greater
+simplecheck:
+ CMP R8,$0 // remaining compare length 0
+ BNE simple // do simple compare
+ BC 12,10,equal // test CR2 for length comparison
+ BC 12,8,less // 1st len < 2nd len, result less
+ BR greater // 1st len > 2nd len must be greater
+simple:
+ MOVBZ 0(R5), R9 // get byte from 1st operand
+ ADD $1,R5
+ MOVBZ 0(R6), R10 // get byte from 2nd operand
+ ADD $1,R6
+ CMPU R9, R10
+ BC 8,2,simple // bc ctr <> 0 && cr
+ BGT greater // 1st > 2nd
+ BLT less // 1st < 2nd
+ BC 12,10,equal // test CR2 for length comparison
+ BC 12,9,greater // 2nd len > 1st len
+ BR less // must be less
+cmpne: // only here is not equal
+ MOVDBR (R5+R16),R8 // reload in reverse order
+ MOVDBR (R6+R16),R9
+ CMPU R8,R9 // compare correct endianness
+ BGT greater // here only if NE
+less:
+ MOVD $-1,R3
+ MOVD R3,(R7) // return value if A < B
+ RET
+equal:
+ MOVD $0,(R7) // return value if A == B
+ RET
+greater:
+ MOVD $1,R3
+ MOVD R3,(R7) // return value if A > B
+ RET
+
+// Do an efficient memcmp for ppc64 (BE)
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// R7 = addr of return value
+TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+ MOVD R3,R8 // set up length
+ CMP R3,R4,CR2 // unequal?
+ BC 12,8,setuplen // BLT CR2
+ MOVD R4,R8 // use R4 for comparison len
+setuplen:
+ MOVD R8,CTR // set up loop counter
+ CMP R8,$8 // only optimize >=8
+ BLT simplecheck
+ DCBT (R5) // cache hint
+ DCBT (R6)
+ CMP R8,$32 // optimize >= 32
+ MOVD R8,R9
+ BLT setup8a // 8 byte moves only
+
+setup32a:
+ SRADCC $5,R8,R9 // number of 32 byte chunks
+ MOVD R9,CTR
+loop32a:
+ MOVD 0(R5),R9 // doublewords to compare
+ MOVD 0(R6),R10 // get 4 doublewords
+ MOVD 8(R5),R14
+ MOVD 8(R6),R15
+ CMPU R9,R10 // bytes equal?
+ BLT less // found to be less
+ BGT greater // found to be greater
+ MOVD 16(R5),R9 // get next pair of doublewords
+ MOVD 16(R6),R10
+ CMPU R14,R15 // bytes match?
+ BLT less // found less
+ BGT greater // found greater
+ MOVD 24(R5),R14 // get next pair of doublewords
+ MOVD 24(R6),R15
+ CMPU R9,R10 // bytes match?
+ BLT less // found to be less
+ BGT greater // found to be greater
+ ADD $32,R5 // bump up to next 32
+ ADD $32,R6
+ CMPU R14,R15 // bytes match?
+ BC 8,2,loop32a // br ctr and cr
+ BLT less // with BE, byte ordering is
+ BGT greater // good for compare
+ ANDCC $24,R8,R9 // Any 8 byte chunks?
+ BEQ leftover // and result is 0
+setup8a:
+ SRADCC $3,R9,R9 // get the 8 byte count
+ BEQ leftover // shifted value is 0
+ MOVD R9,CTR // loop count for doublewords
+loop8:
+ MOVD (R5),R9
+ MOVD (R6),R10
+ ADD $8,R5
+ ADD $8,R6
+ CMPU R9,R10 // match?
+ BC 8,2,loop8 // bt ctr <> 0 && cr
+ BGT greater
+ BLT less
+leftover:
+ ANDCC $7,R8,R9 // check for leftover bytes
+ MOVD R9,CTR // save the ctr
+ BNE simple // leftover bytes
+ BC 12,10,equal // test CR2 for length comparison
+ BC 12,8,less
+ BR greater
+simplecheck:
+ CMP R8,$0 // remaining compare length 0
+ BNE simple // do simple compare
+ BC 12,10,equal // test CR2 for length comparison
+ BC 12,8,less // 1st len < 2nd len, result less
+ BR greater // same len, must be equal
+simple:
+ MOVBZ 0(R5),R9 // get byte from 1st operand
+ ADD $1,R5
+ MOVBZ 0(R6),R10 // get byte from 2nd operand
+ ADD $1,R6
+ CMPU R9,R10
+ BC 8,2,simple // bc ctr <> 0 && cr
+ BGT greater // 1st > 2nd
+ BLT less // 1st < 2nd
+ BC 12,10,equal // test CR2 for length comparison
+ BC 12,9,greater // 2nd len > 1st len
+less:
+ MOVD $-1,R3
+ MOVD R3,(R7) // return value if A < B
+ RET
+equal:
+ MOVD $0,(R7) // return value if A == B
+ RET
+greater:
+ MOVD $1,R3
+ MOVD R3,(R7) // return value if A > B
+ RET
diff --git a/src/internal/bytealg/compare_s390x.s b/src/internal/bytealg/compare_s390x.s
new file mode 100644
index 0000000000..7f27b08c0e
--- /dev/null
+++ b/src/internal/bytealg/compare_s390x.s
@@ -0,0 +1,75 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOVD a_base+0(FP), R3
+ MOVD a_len+8(FP), R4
+ MOVD b_base+24(FP), R5
+ MOVD b_len+32(FP), R6
+ LA ret+48(FP), R7
+ BR cmpbody<>(SB)
+
+TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOVD a_base+0(FP), R3
+ MOVD a_len+8(FP), R4
+ MOVD b_base+24(FP), R5
+ MOVD b_len+32(FP), R6
+ LA ret+48(FP), R7
+ BR cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD a_base+0(FP), R3
+ MOVD a_len+8(FP), R4
+ MOVD b_base+16(FP), R5
+ MOVD b_len+24(FP), R6
+ LA ret+32(FP), R7
+ BR cmpbody<>(SB)
+
+// input:
+// R3 = a
+// R4 = alen
+// R5 = b
+// R6 = blen
+// R7 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMPBEQ R3, R5, cmplengths
+ MOVD R4, R8
+ CMPBLE R4, R6, amin
+ MOVD R6, R8
+amin:
+ CMPBEQ R8, $0, cmplengths
+ CMP R8, $256
+ BLE tail
+loop:
+ CLC $256, 0(R3), 0(R5)
+ BGT gt
+ BLT lt
+ SUB $256, R8
+ CMP R8, $256
+ BGT loop
+tail:
+ SUB $1, R8
+ EXRL $cmpbodyclc<>(SB), R8
+ BGT gt
+ BLT lt
+cmplengths:
+ CMP R4, R6
+ BEQ eq
+ BLT lt
+gt:
+ MOVD $1, 0(R7)
+ RET
+lt:
+ MOVD $-1, 0(R7)
+ RET
+eq:
+ MOVD $0, 0(R7)
+ RET
+
+TEXT cmpbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+ CLC $1, 0(R3), 0(R5)
+ RET