internal/bytealg: optimize indexbyte function for ppc64le/power9

Added specific code For POWER9 that does not need prealignment prior to load vector. Optimized vector loop to jump out as soon as there is a match instead of accumulating matches for 4 indices and then processing the same. For small input size 10, the caller function dominates performance. name old time/op new time/op delta IndexByte/10 9.20ns ± 0% 10.40ns ± 0% +13.08% IndexByte/32 9.77ns ± 0% 9.20ns ± 0% -5.84% IndexByte/4K 171ns ± 0% 136ns ± 0% -20.51% IndexByte/4M 154µs ± 0% 126µs ± 0% -17.92% IndexByte/64M 2.48ms ± 0% 2.03ms ± 0% -18.27% IndexAnyASCII/1:32 10.2ns ± 1% 9.2ns ± 0% -9.19% IndexAnyASCII/1:64 11.3ns ± 0% 10.1ns ± 0% -11.29% IndexAnyUTF8/1:64 11.4ns ± 0% 9.8ns ± 0% -13.73% IndexAnyUTF8/16:64 156ns ± 1% 131ns ± 0% -16.23% IndexAnyUTF8/256:64 2.27µs ± 0% 1.86µs ± 0% -18.03% LastIndexAnyUTF8/1:64 11.8ns ± 0% 10.5ns ± 0% -10.81% LastIndexAnyUTF8/16:64 165ns ±11% 132ns ± 0% -19.75% LastIndexAnyUTF8/256:2 1.68µs ± 0% 1.44µs ± 0% -14.33% LastIndexAnyUTF8/256:4 1.68µs ± 0% 1.49µs ± 0% -11.10% LastIndexAnyUTF8/256:8 1.68µs ± 0% 1.50µs ± 0% -11.05% LastIndexAnyUTF8/256:64 2.30µs ± 0% 1.90µs ± 0% -17.56% Change-Id: I3d2550bdfdea38fece2da9960bbe62fe6cb1840c Reviewed-on: https://go-review.googlesource.com/c/go/+/397614 Reviewed-by: Paul Murphy <murp@ibm.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Run-TryBot: Archana Ravindar <aravind5@in.ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Russ Cox <rsc@golang.org>
author: Archana R <aravind5@in.ibm.com> 2022-04-01 12:05:07 -0500
committer: Lynn Boger <laboger@linux.vnet.ibm.com> 2022-04-15 16:05:09 +0000
commit: 5c707f5f3ace728f08997960ec67d9f55cdbf1a3 (patch)
tree: 9484c80c8708b64d185997eb2ec90203ed8e2f58 /src/internal/bytealg
parent: 2c73f5f32fceb31b5da7f9a820c0c637f57a9ab5 (diff)
download: go-5c707f5f3ace728f08997960ec67d9f55cdbf1a3.tar.xz
1 files changed, 80 insertions, 5 deletions
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
index 4cc2b44087..1a6e852d67 100644
--- a/src/internal/bytealg/indexbyte_ppc64x.s
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -11,17 +11,20 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
 	// R3 = byte array pointer
 	// R4 = length
 	MOVD	R6, R5		// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)
 
 TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
 	// R3 = string
 	// R4 = length
 	// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)
 
 // R3 = addr of string
 // R4 = len of string
 // R5 = byte to find
+// R16 = 1 if running on a POWER9 system, 0 otherwise
 // On exit:
 // R3 = return value
 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
@@ -29,12 +32,11 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
 	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
 	ADD	R4,R3,R7	// Last acceptable address in R7.
-	DCBT	(R8)		// Prepare cache line.
 
 	RLDIMI	$16,R5,$32,R5
 	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
 	MOVD	$-1,R9
-	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
 	RLDIMI	$32,R5,$0,R5
 	MOVD	R7,R10		// Save last acceptable address in R10 for later.
 	ADD	$-1,R7,R7
@@ -43,8 +45,77 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 #else
 	SRD	R6,R9,R9	// Same for Big Endian
 #endif
-	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.
+	BLT	small_string	// Jump to the small string case if it's <32 bytes.
+	CMP	R16,$1		// optimize for power8 v power9
+	BNE	power8
+	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
+	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMP	R4,$64
+	MOVD	$16,R11
+	MOVD	R3,R8
+	BLT	cmp32
+	MOVD	$32,R12
+	MOVD	$48,R6
 
+loop64:
+	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8, jump out
+
+	LXVB16X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out
+
+	LXVB16X	(R8)(R12),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out
+
+	LXVB16X	(R8)(R6),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
+	ADD	$64,R8
+	ADD	$-64,R4
+	CMP	R4,$64		// >=64 bytes left to scan?
+	BGE	loop64
+	CMP	R4,$32
+	BLT	rem		// jump to rem if there are < 32 bytes left
+cmp32:
+	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8
+
+	LXVB16X	(R11)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16
+
+	ADD	$32,R8
+	ADD	$-32,R4
+rem:
+	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
+	BR	small_string
+
+foundat3:
+	ADD	$16,R8
+foundat2:
+	ADD	$16,R8
+foundat1:
+	ADD	$16,R8
+foundat0:
+	// Compress the result into a single doubleword and
+	// move it to a GPR for the final calculation.
+	VBPERMQ	V6,V10,V6
+	MFVRD	V6,R3
+	// count leading zeroes upto the match that ends up in low 16 bits
+	// in both endian modes, compute index by subtracting the number by 16
+	CNTLZW	R3,R11
+	ADD	$-16,R11
+	ADD	R8,R11,R3	// Calculate byte address
+	SUB	R17,R3
+	RET
+power8:
 	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
 	// in V0, V1 and V10, then branch to the preloop.
 	ANDCC	$63,R3,R11
@@ -54,7 +125,6 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base
-	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
 	BNE	CR7,done
@@ -252,8 +322,13 @@ found_qw_align:
 	CMPU	  R11,R4
 	BLT	  return
 	BR	  notfound
+	PCALIGN	  $16
 
 done:
+	ADD	$-1,R10,R6
+	// Offset of last index for the final
+	// doubleword comparison
+	RLDICL	$0,R6,$61,R6
 	// At this point, R3 has 0xFF in the same position as the byte we are
 	// looking for in the doubleword. Use that to calculate the exact index
 	// of the byte.
@@ -273,6 +348,7 @@ done:
 	BR	notfound
 
 small_string:
+	// process string of length < 32 bytes
 	// We unroll this loop for better performance.
 	CMPU	R4,$0		// Check for length=0
 	BEQ	notfound
@@ -281,7 +357,6 @@ small_string:
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base.
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
-	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
 	CMPU	R8,R7
 	BNE	CR7,done
author	Archana R <aravind5@in.ibm.com>	2022-04-01 12:05:07 -0500
committer	Lynn Boger <laboger@linux.vnet.ibm.com>	2022-04-15 16:05:09 +0000
commit	5c707f5f3ace728f08997960ec67d9f55cdbf1a3 (patch)
tree	9484c80c8708b64d185997eb2ec90203ed8e2f58 /src/internal/bytealg
parent	2c73f5f32fceb31b5da7f9a820c0c637f57a9ab5 (diff)
download	go-5c707f5f3ace728f08997960ec67d9f55cdbf1a3.tar.xz