From a92ca515077e5cf54673eb8c5c2d9db4824330db Mon Sep 17 00:00:00 2001 From: Wayne Zuo Date: Wed, 30 Mar 2022 21:44:44 +0800 Subject: cmd/compile: use LZCNT instruction for GOAMD64>=3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LZCNT is similar to BSR, but BSR(x) is undefined when x == 0, so using LZCNT can avoid a special case for zero input. Except for that case, LZCNTQ(x) == 63-BSRQ(x) and LZCNTL(x) == 31-BSRL(x). And according to https://www.agner.org/optimize/instruction_tables.pdf, LZCNT instructions are much faster than BSR on AMD CPUs. name old time/op new time/op delta LeadingZeros-8 0.91ns ± 1% 0.80ns ± 7% -11.68% (p=0.000 n=9+9) LeadingZeros8-8 0.98ns ±15% 0.91ns ± 1% -7.34% (p=0.000 n=9+9) LeadingZeros16-8 0.94ns ± 3% 0.92ns ± 2% -2.36% (p=0.001 n=10+10) LeadingZeros32-8 0.89ns ± 1% 0.78ns ± 2% -12.49% (p=0.000 n=10+10) LeadingZeros64-8 0.92ns ± 1% 0.78ns ± 1% -14.48% (p=0.000 n=10+10) Change-Id: I125147fe3d6994a4cfe558432780408e9a27557a Reviewed-on: https://go-review.googlesource.com/c/go/+/396794 Reviewed-by: Keith Randall Trust: Emmanuel Odeke Run-TryBot: Emmanuel Odeke TryBot-Result: Gopher Robot --- test/codegen/mathbits.go | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'test/codegen') diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 859490c363..58d57b3523 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -13,7 +13,8 @@ import "math/bits" // ----------------------- // func LeadingZeros(n uint) int { - // amd64:"BSRQ" + // amd64/v1,amd64/v2:"BSRQ" + // amd64/v3:"LZCNTQ", -"BSRQ" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -22,7 +23,8 @@ func LeadingZeros(n uint) int { } func LeadingZeros64(n uint64) int { - // amd64:"BSRQ" + // amd64/v1,amd64/v2:"BSRQ" + // amd64/v3:"LZCNTQ", -"BSRQ" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -31,7 +33,8 @@ func LeadingZeros64(n uint64) int { } func 
LeadingZeros32(n uint32) int { - // amd64:"BSRQ","LEAQ",-"CMOVQEQ" + // amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ" + // amd64/v3: "LZCNTL",- "BSRL" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZW" // mips:"CLZ" @@ -40,7 +43,8 @@ func LeadingZeros32(n uint32) int { } func LeadingZeros16(n uint16) int { - // amd64:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v3: "LZCNTL",- "BSRL" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -49,7 +53,8 @@ func LeadingZeros16(n uint16) int { } func LeadingZeros8(n uint8) int { - // amd64:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v3: "LZCNTL",- "BSRL" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -62,7 +67,8 @@ func LeadingZeros8(n uint8) int { // --------------- // func Len(n uint) int { - // amd64:"BSRQ" + // amd64/v1,amd64/v2:"BSRQ" + // amd64/v3: "LZCNTQ" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -71,7 +77,8 @@ func Len(n uint) int { } func Len64(n uint64) int { - // amd64:"BSRQ" + // amd64/v1,amd64/v2:"BSRQ" + // amd64/v3: "LZCNTQ" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -88,7 +95,8 @@ func SubFromLen64(n uint64) int { } func Len32(n uint32) int { - // amd64:"BSRQ","LEAQ",-"CMOVQEQ" + // amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ" + // amd64/v3: "LZCNTL" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -99,7 +107,8 @@ func Len32(n uint32) int { } func Len16(n uint16) int { - // amd64:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v3: "LZCNTL" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" @@ -108,7 +117,8 @@ func Len16(n uint16) int { } func Len8(n uint8) int { - // amd64:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" + // amd64/v3: "LZCNTL" // s390x:"FLOGR" // arm:"CLZ" arm64:"CLZ" // mips:"CLZ" -- cgit v1.3-5-g45d5