diff options
Diffstat (limited to 'src/runtime')
| -rw-r--r-- | src/runtime/asm_amd64.s | 189 | ||||
| -rw-r--r-- | src/runtime/runtime2.go | 1 |
2 files changed, 1 insertions, 189 deletions
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 5835443ff6..386307afa5 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1848,195 +1848,6 @@ success: MOVQ DI, (R11) RET -TEXT bytes·countByte(SB),NOSPLIT,$0-40 - MOVQ s+0(FP), SI - MOVQ s_len+8(FP), BX - MOVB c+24(FP), AL - LEAQ ret+32(FP), R8 - JMP runtime·countByte(SB) - -TEXT strings·countByte(SB),NOSPLIT,$0-32 - MOVQ s+0(FP), SI - MOVQ s_len+8(FP), BX - MOVB c+16(FP), AL - LEAQ ret+24(FP), R8 - JMP runtime·countByte(SB) - -// input: -// SI: data -// BX: data len -// AL: byte sought -// R8: address to put result -// This requires the POPCNT instruction -TEXT runtime·countByte(SB),NOSPLIT,$0 - // Shuffle X0 around so that each byte contains - // the character we're looking for. - MOVD AX, X0 - PUNPCKLBW X0, X0 - PUNPCKLBW X0, X0 - PSHUFL $0, X0, X0 - - CMPQ BX, $16 - JLT small - - MOVQ $0, R12 // Accumulator - - MOVQ SI, DI - - CMPQ BX, $32 - JA avx2 -sse: - LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes - JMP sseloopentry - -sseloop: - // Move the next 16-byte chunk of the data into X1. - MOVOU (DI), X1 - // Compare bytes in X0 to X1. - PCMPEQB X0, X1 - // Take the top bit of each byte in X1 and put the result in DX. - PMOVMSKB X1, DX - // Count number of matching bytes - POPCNTL DX, DX - // Accumulate into R12 - ADDQ DX, R12 - // Advance to next block. - ADDQ $16, DI -sseloopentry: - CMPQ DI, AX - JBE sseloop - - // Get the number of bytes to consider in the last 16 bytes - ANDQ $15, BX - JZ end - - // Create mask to ignore overlap between previous 16 byte block - // and the next. - MOVQ $16,CX - SUBQ BX, CX - MOVQ $0xFFFF, R10 - SARQ CL, R10 - SALQ CL, R10 - - // Process the last 16-byte chunk. This chunk may overlap with the - // chunks we've already searched so we need to mask part of it. - MOVOU (AX), X1 - PCMPEQB X0, X1 - PMOVMSKB X1, DX - // Apply mask - ANDQ R10, DX - POPCNTL DX, DX - ADDQ DX, R12 -end: - MOVQ R12, (R8) - RET - -// handle for lengths < 16 -small: - TESTQ BX, BX - JEQ endzero - - // Check if we'll load across a page boundary. - LEAQ 16(SI), AX - TESTW $0xff0, AX - JEQ endofpage - - // We must ignore high bytes as they aren't part of our slice. - // Create mask. - MOVB BX, CX - MOVQ $1, R10 - SALQ CL, R10 - SUBQ $1, R10 - - // Load data - MOVOU (SI), X1 - // Compare target byte with each byte in data. - PCMPEQB X0, X1 - // Move result bits to integer register. - PMOVMSKB X1, DX - // Apply mask - ANDQ R10, DX - POPCNTL DX, DX - // Directly return DX, we don't need to accumulate - // since we have <16 bytes. - MOVQ DX, (R8) - RET -endzero: - MOVQ $0, (R8) - RET - -endofpage: - // We must ignore low bytes as they aren't part of our slice. - MOVQ $16,CX - SUBQ BX, CX - MOVQ $0xFFFF, R10 - SARQ CL, R10 - SALQ CL, R10 - - // Load data into the high end of X1. - MOVOU -16(SI)(BX*1), X1 - // Compare target byte with each byte in data. - PCMPEQB X0, X1 - // Move result bits to integer register. - PMOVMSKB X1, DX - // Apply mask - ANDQ R10, DX - // Directly return DX, we don't need to accumulate - // since we have <16 bytes. - POPCNTL DX, DX - MOVQ DX, (R8) - RET - -avx2: - CMPB runtime·support_avx2(SB), $1 - JNE sse - MOVD AX, X0 - LEAQ -32(SI)(BX*1), R11 - VPBROADCASTB X0, Y1 -avx2_loop: - VMOVDQU (DI), Y2 - VPCMPEQB Y1, Y2, Y3 - VPMOVMSKB Y3, DX - POPCNTL DX, DX - ADDQ DX, R12 - ADDQ $32, DI - CMPQ DI, R11 - JLE avx2_loop - - // If last block is already processed, - // skip to the end. - CMPQ DI, R11 - JEQ endavx - - // Load address of the last 32 bytes. - // There is an overlap with the previous block. - MOVQ R11, DI - VMOVDQU (DI), Y2 - VPCMPEQB Y1, Y2, Y3 - VPMOVMSKB Y3, DX - // Exit AVX mode. - VZEROUPPER - - // Create mask to ignore overlap between previous 32 byte block - // and the next. - ANDQ $31, BX - MOVQ $32,CX - SUBQ BX, CX - MOVQ $0xFFFFFFFF, R10 - SARQ CL, R10 - SALQ CL, R10 - // Apply mask - ANDQ R10, DX - POPCNTL DX, DX - ADDQ DX, R12 - MOVQ R12, (R8) - RET -endavx: - // Exit AVX mode. - VZEROUPPER - MOVQ R12, (R8) - RET - TEXT runtime·return0(SB), NOSPLIT, $0 MOVL $0, AX RET diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 556f13d1c1..77982c3e80 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -754,6 +754,7 @@ var ( // Set on startup in asm_{386,amd64,amd64p32}.s. // Packages outside the runtime should not use these // as they are not an external api. + // TODO: deprecate these; use internal/cpu directly. processorVersionInfo uint32 isIntel bool lfenceBeforeRdtsc bool |
