about summary refs log tree commit diff
path: root/src/runtime
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime')
-rw-r--r-- src/runtime/asm_386.s 28
-rw-r--r-- src/runtime/asm_amd64.s 142
-rw-r--r-- src/runtime/asm_amd64p32.s 107
-rw-r--r-- src/runtime/asm_arm.s 48
-rw-r--r-- src/runtime/asm_arm64.s 120
-rw-r--r-- src/runtime/asm_mips64x.s 46
-rw-r--r-- src/runtime/asm_mipsx.s 44
-rw-r--r-- src/runtime/asm_ppc64x.s 302
-rw-r--r-- src/runtime/asm_s390x.s 102
-rw-r--r-- src/runtime/error.go 11
-rw-r--r-- src/runtime/os_linux_s390x.go 3
11 files changed, 6 insertions, 947 deletions
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index a8de5976ac..5533681cab 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -1495,34 +1495,6 @@ TEXT bytes·Compare(SB),NOSPLIT,$0-28
LEAL ret+24(FP), AX
JMP runtime·cmpbody(SB)
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
- MOVL s+0(FP), SI
- MOVL s_len+4(FP), CX
- MOVB c+12(FP), AL
- MOVL SI, DI
- CLD; REPN; SCASB
- JZ 3(PC)
- MOVL $-1, ret+16(FP)
- RET
- SUBL SI, DI
- SUBL $1, DI
- MOVL DI, ret+16(FP)
- RET
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-16
- MOVL s+0(FP), SI
- MOVL s_len+4(FP), CX
- MOVB c+8(FP), AL
- MOVL SI, DI
- CLD; REPN; SCASB
- JZ 3(PC)
- MOVL $-1, ret+12(FP)
- RET
- SUBL SI, DI
- SUBL $1, DI
- MOVL DI, ret+12(FP)
- RET
-
// input:
// SI = a
// DI = b
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 2376fe0aae..07e3b0b6e9 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1995,148 +1995,6 @@ success:
MOVQ DI, (R11)
RET
-
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
- MOVQ s+0(FP), SI
- MOVQ s_len+8(FP), BX
- MOVB c+24(FP), AL
- LEAQ ret+32(FP), R8
- JMP runtime·indexbytebody(SB)
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-32
- MOVQ s+0(FP), SI
- MOVQ s_len+8(FP), BX
- MOVB c+16(FP), AL
- LEAQ ret+24(FP), R8
- JMP runtime·indexbytebody(SB)
-
-// input:
-// SI: data
-// BX: data len
-// AL: byte sought
-// R8: address to put result
-TEXT runtime·indexbytebody(SB),NOSPLIT,$0
- // Shuffle X0 around so that each byte contains
- // the character we're looking for.
- MOVD AX, X0
- PUNPCKLBW X0, X0
- PUNPCKLBW X0, X0
- PSHUFL $0, X0, X0
-
- CMPQ BX, $16
- JLT small
-
- MOVQ SI, DI
-
- CMPQ BX, $32
- JA avx2
-sse:
- LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
- JMP sseloopentry
-
-sseloop:
- // Move the next 16-byte chunk of the data into X1.
- MOVOU (DI), X1
- // Compare bytes in X0 to X1.
- PCMPEQB X0, X1
- // Take the top bit of each byte in X1 and put the result in DX.
- PMOVMSKB X1, DX
- // Find first set bit, if any.
- BSFL DX, DX
- JNZ ssesuccess
- // Advance to next block.
- ADDQ $16, DI
-sseloopentry:
- CMPQ DI, AX
- JB sseloop
-
- // Search the last 16-byte chunk. This chunk may overlap with the
- // chunks we've already searched, but that's ok.
- MOVQ AX, DI
- MOVOU (AX), X1
- PCMPEQB X0, X1
- PMOVMSKB X1, DX
- BSFL DX, DX
- JNZ ssesuccess
-
-failure:
- MOVQ $-1, (R8)
- RET
-
-// We've found a chunk containing the byte.
-// The chunk was loaded from DI.
-// The index of the matching byte in the chunk is DX.
-// The start of the data is SI.
-ssesuccess:
- SUBQ SI, DI // Compute offset of chunk within data.
- ADDQ DX, DI // Add offset of byte within chunk.
- MOVQ DI, (R8)
- RET
-
-// handle for lengths < 16
-small:
- TESTQ BX, BX
- JEQ failure
-
- // Check if we'll load across a page boundary.
- LEAQ 16(SI), AX
- TESTW $0xff0, AX
- JEQ endofpage
-
- MOVOU (SI), X1 // Load data
- PCMPEQB X0, X1 // Compare target byte with each byte in data.
- PMOVMSKB X1, DX // Move result bits to integer register.
- BSFL DX, DX // Find first set bit.
- JZ failure // No set bit, failure.
- CMPL DX, BX
- JAE failure // Match is past end of data.
- MOVQ DX, (R8)
- RET
-
-endofpage:
- MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
- PCMPEQB X0, X1 // Compare target byte with each byte in data.
- PMOVMSKB X1, DX // Move result bits to integer register.
- MOVL BX, CX
- SHLL CX, DX
- SHRL $16, DX // Shift desired bits down to bottom of register.
- BSFL DX, DX // Find first set bit.
- JZ failure // No set bit, failure.
- MOVQ DX, (R8)
- RET
-
-avx2:
- CMPB runtime·support_avx2(SB), $1
- JNE sse
- MOVD AX, X0
- LEAQ -32(SI)(BX*1), R11
- VPBROADCASTB X0, Y1
-avx2_loop:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPTEST Y3, Y3
- JNZ avx2success
- ADDQ $32, DI
- CMPQ DI, R11
- JLT avx2_loop
- MOVQ R11, DI
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPTEST Y3, Y3
- JNZ avx2success
- VZEROUPPER
- MOVQ $-1, (R8)
- RET
-
-avx2success:
- VPMOVMSKB Y3, DX
- BSFL DX, DX
- SUBQ SI, DI
- ADDQ DI, DX
- MOVQ DX, (R8)
- VZEROUPPER
- RET
-
TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVQ a_len+8(FP), BX
MOVQ b_len+32(FP), CX
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index dc4c57de13..3c3adc3990 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -837,113 +837,6 @@ allsame:
LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
RET
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
- MOVL s+0(FP), SI
- MOVL s_len+4(FP), BX
- MOVB c+12(FP), AL
- CALL runtime·indexbytebody(SB)
- MOVL AX, ret+16(FP)
- RET
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-20
- MOVL s+0(FP), SI
- MOVL s_len+4(FP), BX
- MOVB c+8(FP), AL
- CALL runtime·indexbytebody(SB)
- MOVL AX, ret+16(FP)
- RET
-
-// input:
-// SI: data
-// BX: data len
-// AL: byte sought
-// output:
-// AX
-TEXT runtime·indexbytebody(SB),NOSPLIT,$0
- MOVL SI, DI
-
- CMPL BX, $16
- JLT small
-
- // round up to first 16-byte boundary
- TESTL $15, SI
- JZ aligned
- MOVL SI, CX
- ANDL $~15, CX
- ADDL $16, CX
-
- // search the beginning
- SUBL SI, CX
- REPN; SCASB
- JZ success
-
-// DI is 16-byte aligned; get ready to search using SSE instructions
-aligned:
- // round down to last 16-byte boundary
- MOVL BX, R11
- ADDL SI, R11
- ANDL $~15, R11
-
- // shuffle X0 around so that each byte contains c
- MOVD AX, X0
- PUNPCKLBW X0, X0
- PUNPCKLBW X0, X0
- PSHUFL $0, X0, X0
- JMP condition
-
-sse:
- // move the next 16-byte chunk of the buffer into X1
- MOVO (DI), X1
- // compare bytes in X0 to X1
- PCMPEQB X0, X1
- // take the top bit of each byte in X1 and put the result in DX
- PMOVMSKB X1, DX
- TESTL DX, DX
- JNZ ssesuccess
- ADDL $16, DI
-
-condition:
- CMPL DI, R11
- JNE sse
-
- // search the end
- MOVL SI, CX
- ADDL BX, CX
- SUBL R11, CX
- // if CX == 0, the zero flag will be set and we'll end up
- // returning a false success
- JZ failure
- REPN; SCASB
- JZ success
-
-failure:
- MOVL $-1, AX
- RET
-
-// handle for lengths < 16
-small:
- MOVL BX, CX
- REPN; SCASB
- JZ success
- MOVL $-1, AX
- RET
-
-// we've found the chunk containing the byte
-// now just figure out which specific byte it is
-ssesuccess:
- // get the index of the least significant set bit
- BSFW DX, DX
- SUBL SI, DI
- ADDL DI, DX
- MOVL DX, AX
- RET
-
-success:
- SUBL SI, DI
- SUBL $1, DI
- MOVL DI, AX
- RET
-
TEXT bytes·Equal(SB),NOSPLIT,$0-25
MOVL a_len+4(FP), BX
MOVL b_len+16(FP), CX
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 0b429705e8..d672bc26a2 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -925,54 +925,6 @@ equal:
MOVBU R0, ret+24(FP)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
- MOVW s+0(FP), R0
- MOVW s_len+4(FP), R1
- MOVBU c+12(FP), R2 // byte to find
- MOVW R0, R4 // store base for later
- ADD R0, R1 // end
-
-_loop:
- CMP R0, R1
- B.EQ _notfound
- MOVBU.P 1(R0), R3
- CMP R2, R3
- B.NE _loop
-
- SUB $1, R0 // R0 will be one beyond the position we want
- SUB R4, R0 // remove base
- MOVW R0, ret+16(FP)
- RET
-
-_notfound:
- MOVW $-1, R0
- MOVW R0, ret+16(FP)
- RET
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-16
- MOVW s+0(FP), R0
- MOVW s_len+4(FP), R1
- MOVBU c+8(FP), R2 // byte to find
- MOVW R0, R4 // store base for later
- ADD R0, R1 // end
-
-_sib_loop:
- CMP R0, R1
- B.EQ _sib_notfound
- MOVBU.P 1(R0), R3
- CMP R2, R3
- B.NE _sib_loop
-
- SUB $1, R0 // R0 will be one beyond the position we want
- SUB R4, R0 // remove base
- MOVW R0, ret+12(FP)
- RET
-
-_sib_notfound:
- MOVW $-1, R0
- MOVW R0, ret+12(FP)
- RET
-
TEXT runtime·return0(SB),NOSPLIT,$0
MOVW $0, R0
RET
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 2e08013097..6abb9945e2 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -800,126 +800,6 @@ samebytes:
//
// functions for other packages
//
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
- MOVD b+0(FP), R0
- MOVD b_len+8(FP), R2
- MOVBU c+24(FP), R1
- MOVD $ret+32(FP), R8
- B runtime·indexbytebody<>(SB)
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-32
- MOVD s+0(FP), R0
- MOVD s_len+8(FP), R2
- MOVBU c+16(FP), R1
- MOVD $ret+24(FP), R8
- B runtime·indexbytebody<>(SB)
-
-// input:
-// R0: data
-// R1: byte to search
-// R2: data len
-// R8: address to put result
-TEXT runtime·indexbytebody<>(SB),NOSPLIT,$0
- // Core algorithm:
- // For each 32-byte chunk we calculate a 64-bit syndrome value,
- // with two bits per byte. For each tuple, bit 0 is set if the
- // relevant byte matched the requested character and bit 1 is
- // not used (faster than using a 32bit syndrome). Since the bits
- // in the syndrome reflect exactly the order in which things occur
- // in the original string, counting trailing zeros allows us to
- // identify exactly which byte has matched.
-
- CBZ R2, fail
- MOVD R0, R11
- // Magic constant 0x40100401 allows us to identify
- // which lane matches the requested byte.
- // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
- // Different bytes have different bit masks (i.e: 1, 4, 16, 64)
- MOVD $0x40100401, R5
- VMOV R1, V0.B16
- // Work with aligned 32-byte chunks
- BIC $0x1f, R0, R3
- VMOV R5, V5.S4
- ANDS $0x1f, R0, R9
- AND $0x1f, R2, R10
- BEQ loop
-
- // Input string is not 32-byte aligned. We calculate the
- // syndrome value for the aligned 32 bytes block containing
- // the first bytes and mask off the irrelevant part.
- VLD1.P (R3), [V1.B16, V2.B16]
- SUB $0x20, R9, R4
- ADDS R4, R2, R2
- VCMEQ V0.B16, V1.B16, V3.B16
- VCMEQ V0.B16, V2.B16, V4.B16
- VAND V5.B16, V3.B16, V3.B16
- VAND V5.B16, V4.B16, V4.B16
- VADDP V4.B16, V3.B16, V6.B16 // 256->128
- VADDP V6.B16, V6.B16, V6.B16 // 128->64
- VMOV V6.D[0], R6
- // Clear the irrelevant lower bits
- LSL $1, R9, R4
- LSR R4, R6, R6
- LSL R4, R6, R6
- // The first block can also be the last
- BLS masklast
- // Have we found something already?
- CBNZ R6, tail
-
-loop:
- VLD1.P (R3), [V1.B16, V2.B16]
- SUBS $0x20, R2, R2
- VCMEQ V0.B16, V1.B16, V3.B16
- VCMEQ V0.B16, V2.B16, V4.B16
- // If we're out of data we finish regardless of the result
- BLS end
- // Use a fast check for the termination condition
- VORR V4.B16, V3.B16, V6.B16
- VADDP V6.D2, V6.D2, V6.D2
- VMOV V6.D[0], R6
- // We're not out of data, loop if we haven't found the character
- CBZ R6, loop
-
-end:
- // Termination condition found, let's calculate the syndrome value
- VAND V5.B16, V3.B16, V3.B16
- VAND V5.B16, V4.B16, V4.B16
- VADDP V4.B16, V3.B16, V6.B16
- VADDP V6.B16, V6.B16, V6.B16
- VMOV V6.D[0], R6
- // Only do the clear for the last possible block with less than 32 bytes
- // Condition flags come from SUBS in the loop
- BHS tail
-
-masklast:
- // Clear the irrelevant upper bits
- ADD R9, R10, R4
- AND $0x1f, R4, R4
- SUB $0x20, R4, R4
- NEG R4<<1, R4
- LSL R4, R6, R6
- LSR R4, R6, R6
-
-tail:
- // Check that we have found a character
- CBZ R6, fail
- // Count the trailing zeros using bit reversing
- RBIT R6, R6
- // Compensate the last post-increment
- SUB $0x20, R3, R3
- // And count the leading zeros
- CLZ R6, R6
- // R6 is twice the offset into the fragment
- ADD R6>>1, R3, R0
- // Compute the offset result
- SUB R11, R0, R0
- MOVD R0, (R8)
- RET
-
-fail:
- MOVD $-1, R0
- MOVD R0, (R8)
- RET
// Equal(a, b []byte) bool
TEXT bytes·Equal(SB),NOSPLIT,$0-49
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
index f59421fbf6..ca47824ab8 100644
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -697,52 +697,6 @@ equal:
MOVB R1, ret+48(FP)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
- MOVV s+0(FP), R1
- MOVV s_len+8(FP), R2
- MOVBU c+24(FP), R3 // byte to find
- MOVV R1, R4 // store base for later
- ADDV R1, R2 // end
- ADDV $-1, R1
-
-loop:
- ADDV $1, R1
- BEQ R1, R2, notfound
- MOVBU (R1), R5
- BNE R3, R5, loop
-
- SUBV R4, R1 // remove base
- MOVV R1, ret+32(FP)
- RET
-
-notfound:
- MOVV $-1, R1
- MOVV R1, ret+32(FP)
- RET
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-32
- MOVV p+0(FP), R1
- MOVV b_len+8(FP), R2
- MOVBU c+16(FP), R3 // byte to find
- MOVV R1, R4 // store base for later
- ADDV R1, R2 // end
- ADDV $-1, R1
-
-loop:
- ADDV $1, R1
- BEQ R1, R2, notfound
- MOVBU (R1), R5
- BNE R3, R5, loop
-
- SUBV R4, R1 // remove base
- MOVV R1, ret+24(FP)
- RET
-
-notfound:
- MOVV $-1, R1
- MOVV R1, ret+24(FP)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVW $0, R1
RET
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s
index 47367f1703..ba80361a80 100644
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -712,50 +712,6 @@ equal:
MOVB R1, ret+24(FP)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
- MOVW s+0(FP), R1
- MOVW s_len+4(FP), R2
- MOVBU c+12(FP), R3 // byte to find
- ADDU $1, R1, R4 // store base+1 for later
- ADDU R1, R2 // end
-
-loop:
- BEQ R1, R2, notfound
- MOVBU (R1), R5
- ADDU $1, R1
- BNE R3, R5, loop
-
- SUBU R4, R1 // R1 will be one beyond the position we want so remove (base+1)
- MOVW R1, ret+16(FP)
- RET
-
-notfound:
- MOVW $-1, R1
- MOVW R1, ret+16(FP)
- RET
-
-TEXT strings·IndexByte(SB),NOSPLIT,$0-16
- MOVW s_base+0(FP), R1
- MOVW s_len+4(FP), R2
- MOVBU c+8(FP), R3 // byte to find
- ADDU $1, R1, R4 // store base+1 for later
- ADDU R1, R2 // end
-
-loop:
- BEQ R1, R2, notfound
- MOVBU (R1), R5
- ADDU $1, R1
- BNE R3, R5, loop
-
- SUBU R4, R1 // remove (base+1)
- MOVW R1, ret+12(FP)
- RET
-
-notfound:
- MOVW $-1, R1
- MOVW R1, ret+12(FP)
- RET
-
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
MOVW s1_base+0(FP), R3
MOVW s1_len+4(FP), R1
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index c0e872f7a9..0440751724 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1068,308 +1068,6 @@ equal:
MOVBZ R3,ret+48(FP)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
- MOVD s+0(FP), R3 // R3 = byte array pointer
- MOVD s_len+8(FP), R4 // R4 = length
- MOVBZ c+24(FP), R5 // R5 = byte
- MOVD $ret+32(FP), R14 // R14 = &ret
- BR runtime·indexbytebody<>(SB)
-
-TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
- MOVD s+0(FP), R3 // R3 = string
- MOVD s_len+8(FP), R4 // R4 = length
- MOVBZ c+16(FP), R5 // R5 = byte
- MOVD $ret+24(FP), R14 // R14 = &ret
- BR runtime·indexbytebody<>(SB)
-
-TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
- DCBT (R3) // Prepare cache line.
- MOVD R3,R17 // Save base address for calculating the index later.
- RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
- RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
- ADD R4,R3,R7 // Last acceptable address in R7.
-
- RLDIMI $16,R5,$32,R5
- CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently.
- MOVD $-1,R9
- WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
- RLDIMI $32,R5,$0,R5
- MOVD R7,R10 // Save last acceptable address in R10 for later.
- ADD $-1,R7,R7
-#ifdef GOARCH_ppc64le
- SLD R6,R9,R9 // Prepare mask for Little Endian
-#else
- SRD R6,R9,R9 // Same for Big Endian
-#endif
- BLE small_string // Jump to the small string case if it's <32 bytes.
-
- // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
- // in V0, V1 and V10, then branch to the preloop.
- ANDCC $63,R3,R11
- BEQ CR0,qw_align
- RLDICL $0,R3,$61,R11
-
- MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
- CMPB R12,R5,R3 // Check for a match.
- AND R9,R3,R3 // Mask bytes below s_base
- RLDICL $0,R7,$61,R6 // length-1
- RLDICR $0,R7,$60,R7 // Last doubleword in R7
- CMPU R3,$0,CR7 // If we have a match, jump to the final computation
- BNE CR7,done
- ADD $8,R8,R8
- ADD $-8,R4,R4
- ADD R4,R11,R4
-
- // Check for quadword alignment
- ANDCC $15,R8,R11
- BEQ CR0,qw_align
-
- // Not aligned, so handle the next doubleword
- MOVD 0(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR7
- BNE CR7,done
- ADD $8,R8,R8
- ADD $-8,R4,R4
-
- // Either quadword aligned or 64-byte at this point. We can use LVX.
-qw_align:
-
- // Set up auxiliary data for the vectorized algorithm.
- VSPLTISB $0,V0 // Replicate 0 across V0
- VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
- MTVRD R5,V1
- LVSL (R0+R0),V11
- VSLB V11,V10,V10
- VSPLTB $7,V1,V1 // Replicate byte across V1
- CMPU R4, $64 // If len <= 64, don't use the vectorized loop
- BLE tail
-
- // We will load 4 quadwords per iteration in the loop, so check for
- // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
- ANDCC $63,R8,R11
- BEQ CR0,preloop
-
- // Not 64-byte aligned. Load one quadword at a time until aligned.
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- ADD $-16,R4,R4
-
- ANDCC $63,R8,R11
- BEQ CR0,preloop
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- ADD $-16,R4,R4
-
- ANDCC $63,R8,R11
- BEQ CR0,preloop
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $-16,R4,R4
- ADD $16,R8,R8
-
- // 64-byte aligned. Prepare for the main loop.
-preloop:
- CMPU R4,$64
- BLE tail // If len <= 64, don't use the vectorized loop
-
- // We are now aligned to a 64-byte boundary. We will load 4 quadwords
- // per loop iteration. The last doubleword is in R10, so our loop counter
- // starts at (R10-R8)/64.
- SUB R8,R10,R6
- SRD $6,R6,R9 // Loop counter in R9
- MOVD R9,CTR
-
- MOVD $16,R11 // Load offsets for the vector loads
- MOVD $32,R9
- MOVD $48,R7
-
- // Main loop we will load 64 bytes per iteration
-loop:
- LVX (R8+R0),V2 // Load 4 16-byte vectors
- LVX (R11+R8),V3
- LVX (R9+R8),V4
- LVX (R7+R8),V5
- VCMPEQUB V1,V2,V6 // Look for byte in each vector
- VCMPEQUB V1,V3,V7
- VCMPEQUB V1,V4,V8
- VCMPEQUB V1,V5,V9
- VOR V6,V7,V11 // Compress the result in a single vector
- VOR V8,V9,V12
- VOR V11,V12,V11
- VCMPEQUBCC V0,V11,V11 // Check for byte
- BGE CR6,found
- ADD $64,R8,R8
- BC 16,0,loop // bdnz loop
-
- // Handle the trailing bytes or R4 <= 64
- RLDICL $0,R6,$58,R4
-tail:
- CMPU R4,$0
- BEQ notfound
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
-
-notfound:
- MOVD $-1,R3
- MOVD R3,(R14)
- RET
-
-found:
- // We will now compress the results into a single doubleword,
- // so it can be moved to a GPR for the final index calculation.
-
- // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
- // first bit of each byte into bits 48-63.
- VBPERMQ V6,V10,V6
- VBPERMQ V7,V10,V7
- VBPERMQ V8,V10,V8
- VBPERMQ V9,V10,V9
-
- // Shift each 16-bit component into its correct position for
- // merging into a single doubleword.
-#ifdef GOARCH_ppc64le
- VSLDOI $2,V7,V7,V7
- VSLDOI $4,V8,V8,V8
- VSLDOI $6,V9,V9,V9
-#else
- VSLDOI $6,V6,V6,V6
- VSLDOI $4,V7,V7,V7
- VSLDOI $2,V8,V8,V8
-#endif
-
- // Merge V6-V9 into a single doubleword and move to a GPR.
- VOR V6,V7,V11
- VOR V8,V9,V4
- VOR V4,V11,V4
- MFVRD V4,R3
-
-#ifdef GOARCH_ppc64le
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11 // Count trailing zeros (Little Endian).
-#else
- CNTLZD R3,R11 // Count leading zeros (Big Endian).
-#endif
- ADD R8,R11,R3 // Calculate byte address
-
-return:
- SUB R17,R3
- MOVD R3,(R14)
- RET
-
-found_qw_align:
- // Use the same algorithm as above. Compress the result into
- // a single doubleword and move it to a GPR for the final
- // calculation.
- VBPERMQ V6,V10,V6
-
-#ifdef GOARCH_ppc64le
- MFVRD V6,R3
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11
-#else
- VSLDOI $6,V6,V6,V6
- MFVRD V6,R3
- CNTLZD R3,R11
-#endif
- ADD R8,R11,R3
- CMPU R11,R4
- BLT return
- BR notfound
-
-done:
- // At this point, R3 has 0xFF in the same position as the byte we are
- // looking for in the doubleword. Use that to calculate the exact index
- // of the byte.
-#ifdef GOARCH_ppc64le
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11 // Count trailing zeros (Little Endian).
-#else
- CNTLZD R3,R11 // Count leading zeros (Big Endian).
-#endif
- CMPU R8,R7 // Check if we are at the last doubleword.
- SRD $3,R11 // Convert trailing zeros to bytes.
- ADD R11,R8,R3
- CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
- BNE return
- BLE CR7,return
- BR notfound
-
-small_string:
- // We unroll this loop for better performance.
- CMPU R4,$0 // Check for length=0
- BEQ notfound
-
- MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
- CMPB R12,R5,R3 // Check for a match.
- AND R9,R3,R3 // Mask bytes below s_base.
- CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
- RLDICL $0,R7,$61,R6 // length-1
- RLDICR $0,R7,$60,R7 // Last doubleword in R7.
- CMPU R8,R7
- BNE CR7,done
- BEQ notfound // Hit length.
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- BNE CR6,done
- BR notfound
-
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD s1_base+0(FP), R5
MOVD s2_base+16(FP), R6
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index 766a408c3c..19262a332a 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -854,108 +854,6 @@ TEXT runtime·memeqbodyclc(SB),NOSPLIT|NOFRAME,$0-0
CLC $1, 0(R3), 0(R5)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
- MOVD s+0(FP), R3 // s => R3
- MOVD s_len+8(FP), R4 // s_len => R4
- MOVBZ c+24(FP), R5 // c => R5
- MOVD $ret+32(FP), R2 // &ret => R2
- BR runtime·indexbytebody(SB)
-
-TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
- MOVD s+0(FP), R3 // s => R3
- MOVD s_len+8(FP), R4 // s_len => R4
- MOVBZ c+16(FP), R5 // c => R5
- MOVD $ret+24(FP), R2 // &ret => R2
- BR runtime·indexbytebody(SB)
-
-// input:
-// R3: s
-// R4: s_len
-// R5: c -- byte sought
-// R2: &ret -- address to put index into
-TEXT runtime·indexbytebody(SB),NOSPLIT|NOFRAME,$0
- CMPBEQ R4, $0, notfound
- MOVD R3, R6 // store base for later
- ADD R3, R4, R8 // the address after the end of the string
- //if the length is small, use loop; otherwise, use vector or srst search
- CMPBGE R4, $16, large
-
-residual:
- CMPBEQ R3, R8, notfound
- MOVBZ 0(R3), R7
- LA 1(R3), R3
- CMPBNE R7, R5, residual
-
-found:
- SUB R6, R3
- SUB $1, R3
- MOVD R3, 0(R2)
- RET
-
-notfound:
- MOVD $-1, 0(R2)
- RET
-
-large:
- MOVBZ ·cpu+facilities_hasVX(SB), R1
- CMPBNE R1, $0, vectorimpl
-
-srstimpl: // no vector facility
- MOVBZ R5, R0 // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
-srstloop:
- WORD $0xB25E0083 // srst %r8, %r3 (search the range [R3, R8))
- BVS srstloop // interrupted - continue
- BGT notfoundr0
-foundr0:
- XOR R0, R0 // reset R0
- SUB R6, R8 // remove base
- MOVD R8, 0(R2)
- RET
-notfoundr0:
- XOR R0, R0 // reset R0
- MOVD $-1, 0(R2)
- RET
-
-vectorimpl:
- //if the address is not 16byte aligned, use loop for the header
- MOVD R3, R8
- AND $15, R8
- CMPBGT R8, $0, notaligned
-
-aligned:
- ADD R6, R4, R8
- MOVD R8, R7
- AND $-16, R7
- // replicate c across V17
- VLVGB $0, R5, V19
- VREPB $0, V19, V17
-
-vectorloop:
- CMPBGE R3, R7, residual
- VL 0(R3), V16 // load string to be searched into V16
- ADD $16, R3
- VFEEBS V16, V17, V18 // search V17 in V16 and set conditional code accordingly
- BVS vectorloop
-
- // when vector search found c in the string
- VLGVB $7, V18, R7 // load 7th element of V18 containing index into R7
- SUB $16, R3
- SUB R6, R3
- ADD R3, R7
- MOVD R7, 0(R2)
- RET
-
-notaligned:
- MOVD R3, R8
- AND $-16, R8
- ADD $16, R8
-notalignedloop:
- CMPBEQ R3, R8, aligned
- MOVBZ 0(R3), R7
- LA 1(R3), R3
- CMPBNE R7, R5, notalignedloop
- BR found
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVW $0, R3
RET
diff --git a/src/runtime/error.go b/src/runtime/error.go
index e1291e1543..4b6fb32b78 100644
--- a/src/runtime/error.go
+++ b/src/runtime/error.go
@@ -4,7 +4,7 @@
package runtime
-import _ "unsafe" // for go:linkname
+import "internal/bytealg"
// The Error interface identifies a run time error.
type Error interface {
@@ -118,11 +118,6 @@ func printany(i interface{}) {
}
}
-// strings.IndexByte is implemented in runtime/asm_$goarch.s
-// but amusingly we need go:linkname to get access to it here in the runtime.
-//go:linkname stringsIndexByte strings.IndexByte
-func stringsIndexByte(s string, c byte) int
-
// panicwrap generates a panic for a call to a wrapped value method
// with a nil pointer receiver.
//
@@ -133,7 +128,7 @@ func panicwrap() {
// name is something like "main.(*T).F".
// We want to extract pkg ("main"), typ ("T"), and meth ("F").
// Do it by finding the parens.
- i := stringsIndexByte(name, '(')
+ i := bytealg.IndexByteString(name, '(')
if i < 0 {
throw("panicwrap: no ( in " + name)
}
@@ -142,7 +137,7 @@ func panicwrap() {
throw("panicwrap: unexpected string after package name: " + name)
}
name = name[i+2:]
- i = stringsIndexByte(name, ')')
+ i = bytealg.IndexByteString(name, ')')
if i < 0 {
throw("panicwrap: no ) in " + name)
}
diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go
index 3ca6d4c8c8..2129052836 100644
--- a/src/runtime/os_linux_s390x.go
+++ b/src/runtime/os_linux_s390x.go
@@ -5,6 +5,7 @@
package runtime
import (
+ internalcpu "internal/cpu"
"runtime/internal/sys"
)
@@ -22,11 +23,13 @@ type facilities struct {
// cpu indicates the availability of s390x facilities that can be used in
// Go assembly but are optional on models supported by Go.
+// TODO: remove this once we're only using internal/cpu.
var cpu facilities
func archauxv(tag, val uintptr) {
switch tag {
case _AT_HWCAP: // CPU capability bit flags
+ internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
cpu.hasVX = val&_HWCAP_S390_VX != 0
}
}