aboutsummaryrefslogtreecommitdiff
path: root/src/internal/bytealg
diff options
context:
space:
mode:
authorArchana R <aravind5@in.ibm.com>2023-03-23 12:35:39 -0500
committerLynn Boger <laboger@linux.vnet.ibm.com>2023-04-18 15:42:18 +0000
commit56c4422770c076b86c6c66d8d96bed31b66aaa28 (patch)
tree1730ce46504d47c5b16acc1f3c03cbb2a1e11580 /src/internal/bytealg
parente875cbd9aaec22264da01b76958e565b3262e970 (diff)
downloadgo-56c4422770c076b86c6c66d8d96bed31b66aaa28.tar.xz
runtime: improve index on ppc64x/power10
Rewrite index asm function to use the new power10 instruction lxvl, stxvl or the load, store vector with length which can specify the number of bytes to be stored in a register. This avoids the need to create a separator mask and extra AND instructions. It also allows us to process the tail end of the string using a lot fewer instructions as we can load bytes of separator length directly rather than loading 16 bytes and masking out bytes that are greater than separator length On power9 and power8 the code remains unchanged. The performance for smaller sizes improve the most, on larger sizes we see minimal improvement. name old time/op new time/op delta Index/10 10.6ns ± 3% 9.8ns ± 2% -7.20% Index/11 11.2ns ± 4% 10.6ns ± 0% -5.99% Index/12 12.7ns ± 3% 11.3ns ± 0% -11.21% Index/13 13.5ns ± 2% 11.7ns ± 0% -13.11% Index/14 14.1ns ± 1% 12.0ns ± 0% -14.43% Index/15 14.3ns ± 2% 12.4ns ± 0% -13.39% Index/16 14.5ns ± 1% 12.7ns ± 0% -12.57% Index/17 26.7ns ± 0% 25.9ns ± 0% -2.99% Index/18 27.3ns ± 0% 26.4ns ± 1% -3.35% Index/19 35.7ns ±16% 26.1ns ± 1% -26.87% Index/20 29.4ns ± 0% 27.3ns ± 1% -7.06% Index/21 29.3ns ± 0% 26.9ns ± 1% -8.37% Index/22 30.0ns ± 0% 27.4ns ± 0% -8.68% Index/23 29.9ns ± 0% 27.7ns ± 0% -7.15% Index/24 31.0ns ± 0% 28.0ns ± 0% -9.92% Index/25 31.7ns ± 0% 28.4ns ± 0% -10.54% Index/26 30.6ns ± 0% 28.9ns ± 1% -5.67% Index/27 31.4ns ± 0% 29.3ns ± 0% -6.71% Index/28 32.7ns ± 0% 29.6ns ± 1% -9.36% Index/29 33.3ns ± 0% 30.1ns ± 1% -9.70% Index/30 32.4ns ± 0% 30.7ns ± 0% -5.23% Index/31 33.2ns ± 0% 30.6ns ± 1% -7.83% Index/32 34.3ns ± 0% 30.9ns ± 0% -9.94% Index/64 46.8ns ± 0% 44.2ns ± 0% -5.66% Index/128 71.2ns ± 0% 67.3ns ± 0% -5.43% Index/256 129ns ± 0% 127ns ± 0% -1.67% Index/2K 838ns ± 0% 804ns ± 0% -4.03% Index/4K 1.65µs ± 0% 1.58µs ± 0% -4.25% Index/2M 829µs ± 0% 793µs ± 0% -4.42% Index/4M 1.65ms ± 0% 1.59ms ± 0% -4.19% Index/64M 26.5ms ± 0% 25.4ms ± 0% -4.18% IndexHard2 412µs ± 0% 396µs ± 0% -3.76% IndexEasy/10 10.0ns ± 0% 9.3ns ± 1% -7.20% IndexEasy/11 10.8ns ± 1% 11.0ns ± 1% +2.22% IndexEasy/12 12.3ns ± 2% 11.5ns ± 1% -6.37% IndexEasy/13 13.1ns ± 0% 11.7ns ± 2% -10.83% IndexEasy/14 13.8ns ± 2% 11.9ns ± 1% -13.52% IndexEasy/15 14.0ns ± 2% 12.4ns ± 2% -11.46% IndexEasy/16 14.3ns ± 1% 12.5ns ± 0% -12.40% CountHard2 415µs ± 0% 396µs ± 0% -4.48% Change-Id: Id3efa5ed9c662a29f58125c7f866a09f29a59b6c Reviewed-on: https://go-review.googlesource.com/c/go/+/478918 Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Paul Murphy <murp@ibm.com> Run-TryBot: Archana Ravindar <aravind5@in.ibm.com> Reviewed-by: David Chase <drchase@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Diffstat (limited to 'src/internal/bytealg')
-rw-r--r--src/internal/bytealg/index_ppc64x.s63
1 files changed, 51 insertions, 12 deletions
diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s
index 26205cebaf..e98f96b715 100644
--- a/src/internal/bytealg/index_ppc64x.s
+++ b/src/internal/bytealg/index_ppc64x.s
@@ -434,41 +434,49 @@ TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
CMP R6, $0 // Check sep len
BEQ notfound // sep len 0 -- not found
MOVD R3, R7 // Copy of string addr
+#ifndef GOPPC64_power10
MOVD $16, R16 // Index value 16
MOVD $17, R17 // Index value 17
MOVD $18, R18 // Index value 18
- MOVD $1, R19 // Index value 1
VSPLTISB $0xFF, ONES // splat all 1s
-
- CMP R6, $16, CR4 // CR4 for len(sep) >= 16
VOR ONES, ONES, SEPMASK // Set up full SEPMASK
+#else
+ SLD $56, R6, R14 // Set up separator length for LXVLL
+#endif
+ MOVD $1, R19 // Index value 1
+ CMP R6, $16, CR4 // CR4 for len(sep) >= 16
BGE CR4, loadge16 // Load for len(sep) >= 16
+#ifndef GOPPC64_power10
SUB R6, R16, R9 // 16-len of sep
SLD $3, R9 // Set up for VSLO
MTVSRD R9, V9 // Set up for VSLO
VSLDOI $8, V9, V9, V9 // Set up for VSLO
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
-
+#endif
loadge16:
ANDCC $15, R5, R9 // Find byte offset of sep
ADD R9, R6, R10 // Add sep len
CMP R10, $16 // Check if sep len+offset > 16
BGT sepcross16 // Sep crosses 16 byte boundary
-
+#ifdef GOPPC64_power10
+ LXVLL R5, R14, V0 // Load separator
+#else
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
SLD $3, R9 // Set up shift count for VSLO
MTVSRD R9, V8 // Set up shift count for VSLO
VSLDOI $8, V8, V8, V8
VSLO V0, V8, V0 // Shift by start byte
-
VAND V0, SEPMASK, V0 // Mask separator (< 16)
- BR index2plus
-
+#endif
+ BR index2plus
sepcross16:
- LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
-
+#ifdef GOPPC64_power10
+ LXVLL R5, R14, V0 // Load separator
+#else
+ LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0\
VAND V0, SEPMASK, V0 // mask out separator
+#endif
BLE CR4, index2to16
BR index17plus // Handle sep > 16
@@ -659,9 +667,23 @@ index2to16:
VSPLTISB $0, V10 // Clear
BGT index2to16tail
- MOVD $3, R17 // Number of bytes beyond 16
+#ifdef GOPPC64_power10
+ ADD $3,R7, R17 // Base+3
+ ADD $2,R7, R8 // Base+2
+ ADD $1,R7, R10 // Base+1
+#else
+ MOVD $3, R17 // Number of bytes beyond 16
+#endif
PCALIGN $32
+
index2to16loop:
+
+#ifdef GOPPC64_power10
+ LXVLL R7, R14, V8 // Load next 16 bytes of string from Base
+ LXVLL R10, R14, V9 // Load next 16 bytes of string from Base+1
+ LXVLL R8, R14, V11 // Load next 16 bytes of string from Base+2
+ LXVLL R17,R14, V12 // Load next 16 bytes of string from Base+3
+#else
LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
@@ -672,6 +694,7 @@ index2to16loop:
VAND V3, SEPMASK, V9 // Mask out sep size 1st index
VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
+#endif
VCMPEQUBCC V0, V8, V8 // compare masked string
BLT CR6, found // All equal while comparing 0th index
VCMPEQUBCC V0, V9, V9 // compare masked string
@@ -682,14 +705,29 @@ index2to16loop:
BLT CR6, found4 // All equal while comparing 3rd index
ADD $4, R7 // Update ptr to next 4 bytes
+#ifdef GOPPC64_power10
+ ADD $4, R17 // Update ptr to next 4 bytes
+ ADD $4, R8 // Update ptr to next 4 bytes
+ ADD $4, R10 // Update ptr to next 4 bytes
+#endif
CMP R7, LASTSTR // Still less than last start byte
BGT notfound // Not found
ADD $19, R7, R9 // Verify remaining bytes
CMP R9, LASTBYTE // length of string at least 19
BLE index2to16loop // Try again, else do post processing and jump to index2to16next
-
+ PCALIGN $32
// <19 bytes left, post process the remaining string
index2to16tail:
+#ifdef GOPPC64_power10
+index2to16next_p10:
+ LXVLL R7,R14, V1 // Load 16 bytes @R7 into V1
+ VCMPEQUBCC V1, V0, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BLE index2to16next_p10 // If at end, then not found
+ BR notfound // go to remainder loop
+#else
ADD R3, R4, R9 // End of string
SUB R7, R9, R9 // Number of bytes left
ANDCC $15, R7, R10 // 16 byte offset
@@ -748,6 +786,7 @@ index2to16next:
BGT notfound // If at end, then not found
VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
BR index2to16next // Check the next partial string
+#endif // Tail processing if GOPPC64!=power10
index17plus:
CMP R6, $32 // Check if 17 < len(sep) <= 32