aboutsummaryrefslogtreecommitdiff
path: root/test/codegen
diff options
context:
space:
mode:
authorwdvxdr <wdvxdr1123@gmail.com>2021-10-08 16:06:42 +0800
committerKeith Randall <khr@golang.org>2021-10-18 16:01:55 +0000
commit3e5cc4d6f6befb284d7b2a5142a8b576bf5970ea (patch)
tree71a8f579ba1233ea67a4cd03d798df86b6107d3d /test/codegen
parentc091767d87b7a6ef6016286bc0fae8add59b92de (diff)
downloadgo-3e5cc4d6f6befb284d7b2a5142a8b576bf5970ea.tar.xz
cmd/compile: use MOVBE instruction for GOAMD64>=v3
encoding/binary benchmark on my laptop: name old time/op new time/op delta ReadSlice1000Int32s-8 4.42µs ± 5% 4.20µs ± 1% -4.94% (p=0.046 n=9+8) ReadStruct-8 359ns ± 8% 368ns ± 5% +2.35% (p=0.041 n=9+10) WriteStruct-8 349ns ± 1% 357ns ± 1% +2.15% (p=0.000 n=8+10) ReadInts-8 235ns ± 1% 233ns ± 1% -1.01% (p=0.005 n=10+10) WriteInts-8 265ns ± 1% 274ns ± 1% +3.45% (p=0.000 n=10+10) WriteSlice1000Int32s-8 4.61µs ± 5% 4.59µs ± 5% ~ (p=0.986 n=10+10) PutUint16-8 0.56ns ± 4% 0.57ns ± 4% ~ (p=0.101 n=10+10) PutUint32-8 0.83ns ± 2% 0.56ns ± 6% -32.91% (p=0.000 n=10+10) PutUint64-8 0.81ns ± 3% 0.62ns ± 4% -23.82% (p=0.000 n=10+10) LittleEndianPutUint16-8 0.55ns ± 4% 0.55ns ± 3% ~ (p=0.926 n=10+10) LittleEndianPutUint32-8 0.41ns ± 4% 0.42ns ± 3% ~ (p=0.148 n=10+9) LittleEndianPutUint64-8 0.55ns ± 2% 0.56ns ± 6% ~ (p=0.897 n=10+10) ReadFloats-8 60.4ns ± 4% 59.0ns ± 1% -2.25% (p=0.007 n=10+10) WriteFloats-8 72.3ns ± 2% 71.5ns ± 7% ~ (p=0.089 n=10+10) ReadSlice1000Float32s-8 4.21µs ± 3% 4.18µs ± 2% ~ (p=0.197 n=10+10) WriteSlice1000Float32s-8 4.61µs ± 2% 4.68µs ± 7% ~ (p=1.000 n=8+10) ReadSlice1000Uint8s-8 250ns ± 4% 247ns ± 4% ~ (p=0.324 n=10+10) WriteSlice1000Uint8s-8 227ns ± 5% 229ns ± 2% ~ (p=0.193 n=10+7) PutUvarint32-8 15.3ns ± 2% 15.4ns ± 4% ~ (p=0.782 n=10+10) PutUvarint64-8 38.5ns ± 1% 38.6ns ± 5% ~ (p=0.396 n=8+10) name old speed new speed delta ReadSlice1000Int32s-8 890MB/s ±17% 953MB/s ± 1% +7.00% (p=0.027 n=10+8) ReadStruct-8 209MB/s ± 8% 204MB/s ± 5% -2.42% (p=0.043 n=9+10) WriteStruct-8 214MB/s ± 3% 210MB/s ± 1% -1.75% (p=0.003 n=9+10) ReadInts-8 127MB/s ± 1% 129MB/s ± 1% +1.01% (p=0.006 n=10+10) WriteInts-8 113MB/s ± 1% 109MB/s ± 1% -3.34% (p=0.000 n=10+10) WriteSlice1000Int32s-8 868MB/s ± 5% 872MB/s ± 5% ~ (p=1.000 n=10+10) PutUint16-8 3.55GB/s ± 4% 3.50GB/s ± 4% ~ (p=0.093 n=10+10) PutUint32-8 4.83GB/s ± 2% 7.21GB/s ± 6% +49.16% (p=0.000 n=10+10) PutUint64-8 9.89GB/s ± 3% 12.99GB/s ± 4% +31.26% (p=0.000 n=10+10) LittleEndianPutUint16-8 3.65GB/s ± 4% 3.65GB/s ± 4% ~ (p=0.912 n=10+10) LittleEndianPutUint32-8 9.74GB/s ± 3% 9.63GB/s ± 3% ~ (p=0.222 n=9+9) LittleEndianPutUint64-8 14.4GB/s ± 2% 14.3GB/s ± 5% ~ (p=0.912 n=10+10) ReadFloats-8 199MB/s ± 4% 203MB/s ± 1% +2.27% (p=0.007 n=10+10) WriteFloats-8 166MB/s ± 2% 168MB/s ± 7% ~ (p=0.089 n=10+10) ReadSlice1000Float32s-8 949MB/s ± 3% 958MB/s ± 2% ~ (p=0.218 n=10+10) WriteSlice1000Float32s-8 867MB/s ± 2% 857MB/s ± 6% ~ (p=1.000 n=8+10) ReadSlice1000Uint8s-8 4.00GB/s ± 4% 4.06GB/s ± 4% ~ (p=0.353 n=10+10) WriteSlice1000Uint8s-8 4.40GB/s ± 4% 4.36GB/s ± 2% ~ (p=0.193 n=10+7) PutUvarint32-8 262MB/s ± 2% 260MB/s ± 4% ~ (p=0.739 n=10+10) PutUvarint64-8 208MB/s ± 1% 207MB/s ± 5% ~ (p=0.408 n=8+10) Updates #45453 Change-Id: Ifda0d48d54665cef45d46d3aad974062633142c4 Reviewed-on: https://go-review.googlesource.com/c/go/+/354670 Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Matthew Dempsky <mdempsky@google.com>
Diffstat (limited to 'test/codegen')
-rw-r--r--test/codegen/memcombine.go36
1 files changed, 24 insertions, 12 deletions
diff --git a/test/codegen/memcombine.go b/test/codegen/memcombine.go
index 2a0c534df0..97e1d4bdfb 100644
--- a/test/codegen/memcombine.go
+++ b/test/codegen/memcombine.go
@@ -70,7 +70,8 @@ func load_le16_idx(b []byte, idx int) {
}
func load_be64(b []byte) {
- // amd64:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
+ // amd64/v1,amd64/v2:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
+ // amd64/v3:`MOVBEQ`
// s390x:`MOVD\s\(.*\),`
// arm64:`REV`,`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`,-`REVW`,-`REV16W`
// ppc64le:`MOVDBR`,-`MOV[BHW]Z`
@@ -78,7 +79,8 @@ func load_be64(b []byte) {
}
func load_be64_idx(b []byte, idx int) {
- // amd64:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
+ // amd64/v1,amd64/v2:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
+ // amd64/v3: `MOVBEQ`
// s390x:`MOVD\s\(.*\)\(.*\*1\),`
// arm64:`REV`,`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[WHB]`,-`REVW`,-`REV16W`
// ppc64le:`MOVDBR`,-`MOV[BHW]Z`
@@ -86,7 +88,8 @@ func load_be64_idx(b []byte, idx int) {
}
func load_be32(b []byte) {
- // amd64:`BSWAPL`,-`MOV[BW]`,-`OR`
+ // amd64/v1,amd64/v2:`BSWAPL`,-`MOV[BW]`,-`OR`
+ // amd64/v3: `MOVBEL`
// s390x:`MOVWZ\s\(.*\),`
// arm64:`REVW`,`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`,-`REV16W`
// ppc64le:`MOVWBR`,-`MOV[BH]Z`
@@ -94,7 +97,8 @@ func load_be32(b []byte) {
}
func load_be32_idx(b []byte, idx int) {
- // amd64:`BSWAPL`,-`MOV[BW]`,-`OR`
+ // amd64/v1,amd64/v2:`BSWAPL`,-`MOV[BW]`,-`OR`
+ // amd64/v3: `MOVBEL`
// s390x:`MOVWZ\s\(.*\)\(.*\*1\),`
// arm64:`REVW`,`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[HB]`,-`REV16W`
// ppc64le:`MOVWBR`,-`MOV[BH]Z`
@@ -179,7 +183,8 @@ func load_be_byte4_uint32(s []byte) uint32 {
func load_be_byte4_uint32_inv(s []byte) uint32 {
// arm64:`MOVWU\t\(R[0-9]+\)`,`REVW`,-`ORR`,-`REV16W`,-`MOV[BH]`
- // amd64:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR`
+ // amd64/v1,amd64/v2:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR`
+ // amd64/v3: `MOVBEL`
return uint32(s[3]) | uint32(s[2])<<8 | uint32(s[1])<<16 | uint32(s[0])<<24
}
@@ -191,7 +196,8 @@ func load_be_byte8_uint64(s []byte) uint64 {
func load_be_byte8_uint64_inv(s []byte) uint64 {
// arm64:`MOVD\t\(R[0-9]+\)`,`REV`,-`ORR`,-`REVW`,-`REV16W`,-`MOV[BHW]`
- // amd64:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
+ // amd64/v1,amd64/v2:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
+ // amd64/v3: `MOVBEQ`
// ppc64le:`MOVDBR\t\(R[0-9]+\)`,-`MOV[BHW]Z`
return uint64(s[7]) | uint64(s[6])<<8 | uint64(s[5])<<16 | uint64(s[4])<<24 | uint64(s[3])<<32 | uint64(s[2])<<40 | uint64(s[1])<<48 | uint64(s[0])<<56
}
@@ -409,7 +415,8 @@ func store_le16_idx(b []byte, idx int) {
}
func store_be64(b []byte) {
- // amd64:`BSWAPQ`,-`SHR.`
+ // amd64/v1,amd64/v2:`BSWAPQ`,-`SHR.`
+ // amd64/v3: `MOVBEQ`
// arm64:`MOVD`,`REV`,-`MOV[WBH]`,-`REVW`,-`REV16W`
// ppc64le:`MOVDBR`
// s390x:`MOVD\s.*\(.*\)$`,-`SRW\s`,-`SRD\s`
@@ -417,7 +424,8 @@ func store_be64(b []byte) {
}
func store_be64_idx(b []byte, idx int) {
- // amd64:`BSWAPQ`,-`SHR.`
+ // amd64/v1,amd64/v2:`BSWAPQ`,-`SHR.`
+ // amd64/v3:`MOVBEQ`
// arm64:`REV`,`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`,-`REV16W`,-`REVW`
// ppc64le:`MOVDBR`
// s390x:`MOVD\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s`
@@ -425,7 +433,8 @@ func store_be64_idx(b []byte, idx int) {
}
func store_be32(b []byte) {
- // amd64:`BSWAPL`,-`SHR.`
+ // amd64/v1,amd64/v2:`BSWAPL`,-`SHR.`
+ // amd64/v3:`MOVBEL`
// arm64:`MOVW`,`REVW`,-`MOV[BH]`,-`REV16W`
// ppc64le:`MOVWBR`
// s390x:`MOVW\s.*\(.*\)$`,-`SRW\s`,-`SRD\s`
@@ -445,7 +454,8 @@ func store_be32_load(b, x *[8]byte) {
}
func store_be32_idx(b []byte, idx int) {
- // amd64:`BSWAPL`,-`SHR.`
+ // amd64/v1,amd64/v2:`BSWAPL`,-`SHR.`
+ // amd64/v3:`MOVBEL`
// arm64:`REVW`,`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`,-`REV16W`
// ppc64le:`MOVWBR`
// s390x:`MOVW\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s`
@@ -508,14 +518,16 @@ func store_be_byte_2(b []byte, val uint16) {
func store_be_byte_4(b []byte, val uint32) {
_ = b[4]
// arm64:`REVW`,`MOVW\sR[0-9]+,\s1\(R[0-9]+\)`,-`MOVB`,-`MOVH`,-`REV16W`
- // amd64:`MOVL\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`
+ // amd64/v1,amd64/v2:`MOVL\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`
+ // amd64/v3:`MOVBEL\s[A-Z]+,\s1\([A-Z]+\)`
b[1], b[2], b[3], b[4] = byte(val>>24), byte(val>>16), byte(val>>8), byte(val)
}
func store_be_byte_8(b []byte, val uint64) {
_ = b[8]
// arm64:`REV`,`MOVD\sR[0-9]+,\s1\(R[0-9]+\)`,-`MOVB`,-`MOVH`,-`MOVW`,-`REV16W`,-`REVW`
- // amd64:`MOVQ\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`,-`MOVL`
+ // amd64/v1,amd64/v2:`MOVQ\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`,-`MOVL`
+ // amd64/v3:`MOVBEQ\s[A-Z]+,\s1\([A-Z]+\)`, -`MOVBEL`
b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8] = byte(val>>56), byte(val>>48), byte(val>>40), byte(val>>32), byte(val>>24), byte(val>>16), byte(val>>8), byte(val)
}