aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/asm_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime/asm_amd64.s')
-rw-r--r-- src/runtime/asm_amd64.s 268
1 files changed, 0 insertions, 268 deletions
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index f91a01da72..ab5407bbcd 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1358,274 +1358,6 @@ DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
-TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 // loads the frame arguments into the registers runtime·indexShortStr expects, then jumps to it
- MOVQ s+0(FP), DI // DI = pointer to the string in which we are searching
- // We want len in DX and AX, because PCMPESTRI implicitly consumes them
- MOVQ s_len+8(FP), DX // DX = length of the string in which we are searching
- MOVQ c+16(FP), BP // BP = pointer to the string we are searching for
- MOVQ c_len+24(FP), AX // AX = length of the string we are searching for
- MOVQ DI, R10 // R10 = original start of s; the shared body subtracts it from the match address
- LEAQ ret+32(FP), R11 // R11 = address of the result slot in this frame
- JMP runtime·indexShortStr(SB) // jump, not call: the shared body stores through R11 and executes RET itself
-
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56 // loads the frame arguments into the registers runtime·indexShortStr expects, then jumps to it
- MOVQ s+0(FP), DI // DI = pointer to the data in which we are searching
- MOVQ s_len+8(FP), DX // DX = length of the data in which we are searching
- MOVQ c+24(FP), BP // BP = pointer to the data we are searching for (one word is skipped after s_len — presumably the slice capacity; confirm against the Go declaration)
- MOVQ c_len+32(FP), AX // AX = length of the data we are searching for
- MOVQ DI, R10 // R10 = original start of s; the shared body subtracts it from the match address
- LEAQ ret+48(FP), R11 // R11 = address of the result slot in this frame
- JMP runtime·indexShortStr(SB) // jump, not call: the shared body stores through R11 and executes RET itself
-
-// AX: length of the string that we are searching for
-// DX: length of the string in which we are searching
-// DI: pointer to the string in which we are searching
-// BP: pointer to the string that we are searching for
-// R11: address where the return value is stored (R10 must hold the original DI; it is subtracted from the match address)
-TEXT runtime·indexShortStr(SB),NOSPLIT,$0 // stores the index of the first match through R11, or -1 when there is none
- CMPQ AX, DX // a separator longer than the searched string cannot match
- JA fail
- CMPQ DX, $16 // searched strings of 16+ bytes are candidates for the PCMPESTRI path
- JAE sse42
-no_sse42: // dispatch on the separator length (AX) to a size-specialized loop
- CMPQ AX, $2
- JA _3_or_more
- MOVW (BP), BP // BP = the 2 separator bytes
- LEAQ -1(DI)(DX*1), DX // DX = DI + len(s) - 1: loop bound
-loop2:
- MOVW (DI), SI // SI = 2 bytes at the current position
- CMPW SI,BP
- JZ success
- ADDQ $1,DI // no match here: slide forward one byte
- CMPQ DI,DX
- JB loop2
- JMP fail
-_3_or_more:
- CMPQ AX, $3
- JA _4_or_more
- MOVW 1(BP), BX // BX = separator bytes 1-2
- MOVW (BP), BP // BP = separator bytes 0-1
- LEAQ -2(DI)(DX*1), DX // DX = DI + len(s) - 2: loop bound
-loop3:
- MOVW (DI), SI
- CMPW SI,BP
- JZ partial_success3
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop3
- JMP fail
-partial_success3: // bytes 0-1 matched; compare bytes 1-2 so that the third byte is covered
- MOVW 1(DI), SI
- CMPW SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop3
- JMP fail
-_4_or_more:
- CMPQ AX, $4
- JA _5_or_more
- MOVL (BP), BP // BP = the 4 separator bytes
- LEAQ -3(DI)(DX*1), DX // DX = DI + len(s) - 3: loop bound
-loop4:
- MOVL (DI), SI
- CMPL SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop4
- JMP fail
-_5_or_more:
- CMPQ AX, $7
- JA _8_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX // DX = DI + len(s) + 1 - len(sep): loop bound
- MOVL -4(BP)(AX*1), BX // BX = last 4 bytes of the separator
- MOVL (BP), BP // BP = first 4 bytes of the separator
-loop5to7:
- MOVL (DI), SI
- CMPL SI,BP
- JZ partial_success5to7
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop5to7
- JMP fail
-partial_success5to7: // first 4 bytes matched; the overlapping last-4-byte window covers the rest
- MOVL -4(AX)(DI*1), SI // SI = the 4 bytes ending at DI + len(sep)
- CMPL SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop5to7
- JMP fail
-_8_or_more:
- CMPQ AX, $8
- JA _9_or_more
- MOVQ (BP), BP // BP = the 8 separator bytes
- LEAQ -7(DI)(DX*1), DX // DX = DI + len(s) - 7: loop bound
-loop8:
- MOVQ (DI), SI
- CMPQ SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop8
- JMP fail
-_9_or_more:
- CMPQ AX, $15
- JA _16_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX // DX = DI + len(s) + 1 - len(sep): loop bound
- MOVQ -8(BP)(AX*1), BX // BX = last 8 bytes of the separator
- MOVQ (BP), BP // BP = first 8 bytes of the separator
-loop9to15:
- MOVQ (DI), SI
- CMPQ SI,BP
- JZ partial_success9to15
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop9to15
- JMP fail
-partial_success9to15: // first 8 bytes matched; the overlapping last-8-byte window covers the rest
- MOVQ -8(AX)(DI*1), SI // SI = the 8 bytes ending at DI + len(sep)
- CMPQ SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop9to15
- JMP fail
-_16_or_more:
- CMPQ AX, $16
- JA _17_or_more
- MOVOU (BP), X1 // X1 = the 16 separator bytes
- LEAQ -15(DI)(DX*1), DX // DX = DI + len(s) - 15: loop bound
-loop16:
- MOVOU (DI), X2
- PCMPEQB X1, X2 // per-byte equality against the separator
- PMOVMSKB X2, SI // SI = one bit per compared byte
- CMPQ SI, $0xffff // all 16 bits set means all 16 bytes are equal
- JE success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop16
- JMP fail
-_17_or_more:
- CMPQ AX, $31
- JA _32_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX // DX = DI + len(s) + 1 - len(sep): loop bound
- MOVOU -16(BP)(AX*1), X0 // X0 = last 16 bytes of the separator
- MOVOU (BP), X1 // X1 = first 16 bytes of the separator
-loop17to31:
- MOVOU (DI), X2
- PCMPEQB X1,X2
- PMOVMSKB X2, SI
- CMPQ SI, $0xffff
- JE partial_success17to31
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop17to31
- JMP fail
-partial_success17to31: // first 16 bytes matched; the overlapping last-16-byte window covers the rest
- MOVOU -16(AX)(DI*1), X3 // X3 = the 16 bytes ending at DI + len(sep)
- PCMPEQB X0, X3
- PMOVMSKB X3, SI
- CMPQ SI, $0xffff
- JE success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop17to31
- JMP fail
-// We can get here only when AVX2 is enabled and the cutoff for indexShortStr is set to 63,
-// so there is no need to check cpuid
-_32_or_more:
- CMPQ AX, $32
- JA _33_to_63
- VMOVDQU (BP), Y1 // Y1 = the 32 separator bytes
- LEAQ -31(DI)(DX*1), DX // DX = DI + len(s) - 31: loop bound
-loop32:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, SI
- CMPL SI, $0xffffffff // all 32 bits set means all 32 bytes are equal
- JE success_avx2
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop32
- JMP fail_avx2
-_33_to_63:
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX // DX = DI + len(s) + 1 - len(sep): loop bound
- VMOVDQU -32(BP)(AX*1), Y0 // Y0 = last 32 bytes of the separator
- VMOVDQU (BP), Y1 // Y1 = first 32 bytes of the separator
-loop33to63:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, SI
- CMPL SI, $0xffffffff
- JE partial_success33to63
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop33to63
- JMP fail_avx2
-partial_success33to63: // first 32 bytes matched; the overlapping last-32-byte window covers the rest
- VMOVDQU -32(AX)(DI*1), Y3 // Y3 = the 32 bytes ending at DI + len(sep)
- VPCMPEQB Y0, Y3, Y4
- VPMOVMSKB Y4, SI
- CMPL SI, $0xffffffff
- JE success_avx2
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop33to63
-fail_avx2: // reached by fall-through when loop33to63 is exhausted, and by JMP from the AVX2 loops
- VZEROUPPER // executed on every exit from the Y-register paths, before the shared tail
-fail:
- MOVQ $-1, (R11) // result = -1: not found
- RET
-success_avx2:
- VZEROUPPER
- JMP success
-sse42:
- CMPB runtime·support_sse42(SB), $1 // without SSE4.2 support, use the size-specialized loops
- JNE no_sse42
- CMPQ AX, $12
- // PCMPESTRI is slower than a normal compare,
- // so using it makes sense only if we advance 4+ bytes per compare.
- // This value was determined experimentally and is the ~same
- // on Nehalem (first with SSE42) and Haswell.
- JAE _9_or_more
- LEAQ 16(BP), SI
- TESTW $0xff0, SI // NOTE(review): presumably guards the 16-byte MOVOU load of sep below against crossing into the next page — confirm
- JEQ no_sse42
- MOVOU (BP), X1 // X1 = 16 bytes starting at sep (sep itself is shorter than 12 bytes here)
- LEAQ -15(DI)(DX*1), SI // SI = DI + len(s) - 15: loop bound for 16-byte windows
- MOVQ $16, R9
- SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
-loop_sse42:
- // 0x0c means: unsigned byte compare (bits 0,1 are 00)
- // for equality (bits 2,3 are 11)
- // result is not masked or inverted (bits 4,5 are 00)
- // and corresponds to first matching byte (bit 6 is 0)
- PCMPESTRI $0x0c, (DI), X1
- // CX == 16 means no match,
- // CX > R9 means partial match at the end of the 16-byte window,
- // otherwise sep starts at offset CX from DI (added at sse42_success)
- CMPQ CX, R9
- JBE sse42_success
- ADDQ R9, DI
- CMPQ DI, SI
- JB loop_sse42
- PCMPESTRI $0x0c, -1(SI), X1 // final check on the last 16 bytes of the searched string (-1(SI) = start + len(s) - 16)
- CMPQ CX, R9
- JA fail
- LEAQ -1(SI), DI // DI = base address of that final window
-sse42_success:
- ADDQ CX, DI // DI = address of the match
-success:
- SUBQ R10, DI // convert the match address into an index relative to the original start
- MOVQ DI, (R11)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0 // sets AX to 0 and returns
MOVL $0, AX // 32-bit move of the constant 0 into AX
RET