diff options
| author | wdvxdr <wdvxdr1123@gmail.com> | 2021-10-08 16:06:42 +0800 |
|---|---|---|
| committer | Keith Randall <khr@golang.org> | 2021-10-18 16:01:55 +0000 |
| commit | 3e5cc4d6f6befb284d7b2a5142a8b576bf5970ea (patch) | |
| tree | 71a8f579ba1233ea67a4cd03d798df86b6107d3d /test/codegen | |
| parent | c091767d87b7a6ef6016286bc0fae8add59b92de (diff) | |
| download | go-3e5cc4d6f6befb284d7b2a5142a8b576bf5970ea.tar.xz | |
cmd/compile: use MOVBE instruction for GOAMD64>=v3
encoding/binary benchmark on my laptop:
name old time/op new time/op delta
ReadSlice1000Int32s-8 4.42µs ± 5% 4.20µs ± 1% -4.94% (p=0.046 n=9+8)
ReadStruct-8 359ns ± 8% 368ns ± 5% +2.35% (p=0.041 n=9+10)
WriteStruct-8 349ns ± 1% 357ns ± 1% +2.15% (p=0.000 n=8+10)
ReadInts-8 235ns ± 1% 233ns ± 1% -1.01% (p=0.005 n=10+10)
WriteInts-8 265ns ± 1% 274ns ± 1% +3.45% (p=0.000 n=10+10)
WriteSlice1000Int32s-8 4.61µs ± 5% 4.59µs ± 5% ~ (p=0.986 n=10+10)
PutUint16-8 0.56ns ± 4% 0.57ns ± 4% ~ (p=0.101 n=10+10)
PutUint32-8 0.83ns ± 2% 0.56ns ± 6% -32.91% (p=0.000 n=10+10)
PutUint64-8 0.81ns ± 3% 0.62ns ± 4% -23.82% (p=0.000 n=10+10)
LittleEndianPutUint16-8 0.55ns ± 4% 0.55ns ± 3% ~ (p=0.926 n=10+10)
LittleEndianPutUint32-8 0.41ns ± 4% 0.42ns ± 3% ~ (p=0.148 n=10+9)
LittleEndianPutUint64-8 0.55ns ± 2% 0.56ns ± 6% ~ (p=0.897 n=10+10)
ReadFloats-8 60.4ns ± 4% 59.0ns ± 1% -2.25% (p=0.007 n=10+10)
WriteFloats-8 72.3ns ± 2% 71.5ns ± 7% ~ (p=0.089 n=10+10)
ReadSlice1000Float32s-8 4.21µs ± 3% 4.18µs ± 2% ~ (p=0.197 n=10+10)
WriteSlice1000Float32s-8 4.61µs ± 2% 4.68µs ± 7% ~ (p=1.000 n=8+10)
ReadSlice1000Uint8s-8 250ns ± 4% 247ns ± 4% ~ (p=0.324 n=10+10)
WriteSlice1000Uint8s-8 227ns ± 5% 229ns ± 2% ~ (p=0.193 n=10+7)
PutUvarint32-8 15.3ns ± 2% 15.4ns ± 4% ~ (p=0.782 n=10+10)
PutUvarint64-8 38.5ns ± 1% 38.6ns ± 5% ~ (p=0.396 n=8+10)
name old speed new speed delta
ReadSlice1000Int32s-8 890MB/s ±17% 953MB/s ± 1% +7.00% (p=0.027 n=10+8)
ReadStruct-8 209MB/s ± 8% 204MB/s ± 5% -2.42% (p=0.043 n=9+10)
WriteStruct-8 214MB/s ± 3% 210MB/s ± 1% -1.75% (p=0.003 n=9+10)
ReadInts-8 127MB/s ± 1% 129MB/s ± 1% +1.01% (p=0.006 n=10+10)
WriteInts-8 113MB/s ± 1% 109MB/s ± 1% -3.34% (p=0.000 n=10+10)
WriteSlice1000Int32s-8 868MB/s ± 5% 872MB/s ± 5% ~ (p=1.000 n=10+10)
PutUint16-8 3.55GB/s ± 4% 3.50GB/s ± 4% ~ (p=0.093 n=10+10)
PutUint32-8 4.83GB/s ± 2% 7.21GB/s ± 6% +49.16% (p=0.000 n=10+10)
PutUint64-8 9.89GB/s ± 3% 12.99GB/s ± 4% +31.26% (p=0.000 n=10+10)
LittleEndianPutUint16-8 3.65GB/s ± 4% 3.65GB/s ± 4% ~ (p=0.912 n=10+10)
LittleEndianPutUint32-8 9.74GB/s ± 3% 9.63GB/s ± 3% ~ (p=0.222 n=9+9)
LittleEndianPutUint64-8 14.4GB/s ± 2% 14.3GB/s ± 5% ~ (p=0.912 n=10+10)
ReadFloats-8 199MB/s ± 4% 203MB/s ± 1% +2.27% (p=0.007 n=10+10)
WriteFloats-8 166MB/s ± 2% 168MB/s ± 7% ~ (p=0.089 n=10+10)
ReadSlice1000Float32s-8 949MB/s ± 3% 958MB/s ± 2% ~ (p=0.218 n=10+10)
WriteSlice1000Float32s-8 867MB/s ± 2% 857MB/s ± 6% ~ (p=1.000 n=8+10)
ReadSlice1000Uint8s-8 4.00GB/s ± 4% 4.06GB/s ± 4% ~ (p=0.353 n=10+10)
WriteSlice1000Uint8s-8 4.40GB/s ± 4% 4.36GB/s ± 2% ~ (p=0.193 n=10+7)
PutUvarint32-8 262MB/s ± 2% 260MB/s ± 4% ~ (p=0.739 n=10+10)
PutUvarint64-8 208MB/s ± 1% 207MB/s ± 5% ~ (p=0.408 n=8+10)
Updates #45453
Change-Id: Ifda0d48d54665cef45d46d3aad974062633142c4
Reviewed-on: https://go-review.googlesource.com/c/go/+/354670
Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Trust: Matthew Dempsky <mdempsky@google.com>
Diffstat (limited to 'test/codegen')
| -rw-r--r-- | test/codegen/memcombine.go | 36 |
1 files changed, 24 insertions, 12 deletions
diff --git a/test/codegen/memcombine.go b/test/codegen/memcombine.go index 2a0c534df0..97e1d4bdfb 100644 --- a/test/codegen/memcombine.go +++ b/test/codegen/memcombine.go @@ -70,7 +70,8 @@ func load_le16_idx(b []byte, idx int) { } func load_be64(b []byte) { - // amd64:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR` + // amd64/v1,amd64/v2:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR` + // amd64/v3:`MOVBEQ` // s390x:`MOVD\s\(.*\),` // arm64:`REV`,`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`,-`REVW`,-`REV16W` // ppc64le:`MOVDBR`,-`MOV[BHW]Z` @@ -78,7 +79,8 @@ func load_be64(b []byte) { } func load_be64_idx(b []byte, idx int) { - // amd64:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR` + // amd64/v1,amd64/v2:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR` + // amd64/v3: `MOVBEQ` // s390x:`MOVD\s\(.*\)\(.*\*1\),` // arm64:`REV`,`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[WHB]`,-`REVW`,-`REV16W` // ppc64le:`MOVDBR`,-`MOV[BHW]Z` @@ -86,7 +88,8 @@ func load_be64_idx(b []byte, idx int) { } func load_be32(b []byte) { - // amd64:`BSWAPL`,-`MOV[BW]`,-`OR` + // amd64/v1,amd64/v2:`BSWAPL`,-`MOV[BW]`,-`OR` + // amd64/v3: `MOVBEL` // s390x:`MOVWZ\s\(.*\),` // arm64:`REVW`,`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`,-`REV16W` // ppc64le:`MOVWBR`,-`MOV[BH]Z` @@ -94,7 +97,8 @@ func load_be32(b []byte) { } func load_be32_idx(b []byte, idx int) { - // amd64:`BSWAPL`,-`MOV[BW]`,-`OR` + // amd64/v1,amd64/v2:`BSWAPL`,-`MOV[BW]`,-`OR` + // amd64/v3: `MOVBEL` // s390x:`MOVWZ\s\(.*\)\(.*\*1\),` // arm64:`REVW`,`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[HB]`,-`REV16W` // ppc64le:`MOVWBR`,-`MOV[BH]Z` @@ -179,7 +183,8 @@ func load_be_byte4_uint32(s []byte) uint32 { func load_be_byte4_uint32_inv(s []byte) uint32 { // arm64:`MOVWU\t\(R[0-9]+\)`,`REVW`,-`ORR`,-`REV16W`,-`MOV[BH]` - // amd64:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR` + // amd64/v1,amd64/v2:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR` + // amd64/v3: `MOVBEL` return uint32(s[3]) | uint32(s[2])<<8 | uint32(s[1])<<16 | uint32(s[0])<<24 } @@ -191,7 +196,8 @@ func load_be_byte8_uint64(s []byte) uint64 { func load_be_byte8_uint64_inv(s []byte) uint64 { // arm64:`MOVD\t\(R[0-9]+\)`,`REV`,-`ORR`,-`REVW`,-`REV16W`,-`MOV[BHW]` - // amd64:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR` + // amd64/v1,amd64/v2:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR` + // amd64/v3: `MOVBEQ` // ppc64le:`MOVDBR\t\(R[0-9]+\)`,-`MOV[BHW]Z` return uint64(s[7]) | uint64(s[6])<<8 | uint64(s[5])<<16 | uint64(s[4])<<24 | uint64(s[3])<<32 | uint64(s[2])<<40 | uint64(s[1])<<48 | uint64(s[0])<<56 } @@ -409,7 +415,8 @@ func store_le16_idx(b []byte, idx int) { } func store_be64(b []byte) { - // amd64:`BSWAPQ`,-`SHR.` + // amd64/v1,amd64/v2:`BSWAPQ`,-`SHR.` + // amd64/v3: `MOVBEQ` // arm64:`MOVD`,`REV`,-`MOV[WBH]`,-`REVW`,-`REV16W` // ppc64le:`MOVDBR` // s390x:`MOVD\s.*\(.*\)$`,-`SRW\s`,-`SRD\s` @@ -417,7 +424,8 @@ func store_be64(b []byte) { } func store_be64_idx(b []byte, idx int) { - // amd64:`BSWAPQ`,-`SHR.` + // amd64/v1,amd64/v2:`BSWAPQ`,-`SHR.` + // amd64/v3:`MOVBEQ` // arm64:`REV`,`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`,-`REV16W`,-`REVW` // ppc64le:`MOVDBR` // s390x:`MOVD\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s` @@ -425,7 +433,8 @@ func store_be64_idx(b []byte, idx int) { } func store_be32(b []byte) { - // amd64:`BSWAPL`,-`SHR.` + // amd64/v1,amd64/v2:`BSWAPL`,-`SHR.` + // amd64/v3:`MOVBEL` // arm64:`MOVW`,`REVW`,-`MOV[BH]`,-`REV16W` // ppc64le:`MOVWBR` // s390x:`MOVW\s.*\(.*\)$`,-`SRW\s`,-`SRD\s` @@ -445,7 +454,8 @@ func store_be32_load(b, x *[8]byte) { } func store_be32_idx(b []byte, idx int) { - // amd64:`BSWAPL`,-`SHR.` + // amd64/v1,amd64/v2:`BSWAPL`,-`SHR.` + // amd64/v3:`MOVBEL` // arm64:`REVW`,`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`,-`REV16W` // ppc64le:`MOVWBR` // s390x:`MOVW\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s` @@ -508,14 +518,16 @@ func store_be_byte_2(b []byte, val uint16) { func store_be_byte_4(b []byte, val uint32) { _ = b[4] // arm64:`REVW`,`MOVW\sR[0-9]+,\s1\(R[0-9]+\)`,-`MOVB`,-`MOVH`,-`REV16W` - // amd64:`MOVL\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW` + // amd64/v1,amd64/v2:`MOVL\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW` + // amd64/v3:`MOVBEL\s[A-Z]+,\s1\([A-Z]+\)` b[1], b[2], b[3], b[4] = byte(val>>24), byte(val>>16), byte(val>>8), byte(val) } func store_be_byte_8(b []byte, val uint64) { _ = b[8] // arm64:`REV`,`MOVD\sR[0-9]+,\s1\(R[0-9]+\)`,-`MOVB`,-`MOVH`,-`MOVW`,-`REV16W`,-`REVW` - // amd64:`MOVQ\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`,-`MOVL` + // amd64/v1,amd64/v2:`MOVQ\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`,-`MOVL` + // amd64/v3:`MOVBEQ\s[A-Z]+,\s1\([A-Z]+\)`, -`MOVBEL` b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8] = byte(val>>56), byte(val>>48), byte(val>>40), byte(val>>32), byte(val>>24), byte(val>>16), byte(val>>8), byte(val) } |
