From b70244ff7a043786c211775b68259de6104ff91c Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Mon, 24 Feb 2025 00:27:34 +1100 Subject: cmd/compile: intrinsify math/bits.Len on riscv64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For riscv64/rva22u64 and above, we can intrinsify math/bits.Len using the CLZ/CLZW machine instructions. On a StarFive VisionFive 2 with GORISCV64=rva22u64: │ clz.b.1 │ clz.b.2 │ │ sec/op │ sec/op vs base │ LeadingZeros-4 28.89n ± 0% 12.08n ± 0% -58.19% (p=0.000 n=10) LeadingZeros8-4 18.79n ± 0% 14.76n ± 0% -21.45% (p=0.000 n=10) LeadingZeros16-4 25.27n ± 0% 14.76n ± 0% -41.59% (p=0.000 n=10) LeadingZeros32-4 25.12n ± 0% 12.08n ± 0% -51.92% (p=0.000 n=10) LeadingZeros64-4 25.89n ± 0% 12.08n ± 0% -53.35% (p=0.000 n=10) geomean 24.55n 13.09n -46.70% Change-Id: I0dda684713dbdf5336af393f5ccbdae861c4f694 Reviewed-on: https://go-review.googlesource.com/c/go/+/652321 Reviewed-by: David Chase Reviewed-by: Meng Zhuo LUCI-TryBot-Result: Go LUCI Reviewed-by: Mark Ryan Reviewed-by: Cherry Mui --- test/codegen/mathbits.go | 83 +++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 30 deletions(-) (limited to 'test/codegen') diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 786fad3bd9..a9cf466780 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -15,60 +15,70 @@ import "math/bits" func LeadingZeros(n uint) int { // amd64/v1,amd64/v2:"BSRQ" // amd64/v3:"LZCNTQ", -"BSRQ" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZV",-"SUB" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"SUB" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.LeadingZeros(n) } func LeadingZeros64(n uint64) int { // amd64/v1,amd64/v2:"BSRQ" // amd64/v3:"LZCNTQ", -"BSRQ" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm:"CLZ" + // arm64:"CLZ" // loong64:"CLZV",-"SUB" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.LeadingZeros64(n) } func LeadingZeros32(n uint32) int { // amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ" // amd64/v3: "LZCNTL",- "BSRL" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZW" + // arm:"CLZ" + // arm64:"CLZW" // loong64:"CLZW",-"SUB" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"CNTLZW" + // riscv64/rva22u64,riscv64/rva23u64:"CLZW",-"ADDI" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.LeadingZeros32(n) } func LeadingZeros16(n uint16) int { // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" // amd64/v3: "LZCNTL",- "BSRL" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZV" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-48",-"NEG" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.LeadingZeros16(n) } func LeadingZeros8(n uint8) int { // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" // amd64/v3: "LZCNTL",- "BSRL" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZV" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-56",-"NEG" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.LeadingZeros8(n) } @@ -79,30 +89,35 @@ func LeadingZeros8(n uint8) int { func Len(n uint) int { // amd64/v1,amd64/v2:"BSRQ" // amd64/v3: "LZCNTQ" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZV" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"SUBC","CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.Len(n) } func Len64(n uint64) int { // amd64/v1,amd64/v2:"BSRQ" // amd64/v3: "LZCNTQ" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZV" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"SUBC","CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.Len64(n) } func SubFromLen64(n uint64) int { // loong64:"CLZV",-"ADD" // ppc64x:"CNTLZD",-"SUBC" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI",-"NEG" return 64 - bits.Len64(n) } @@ -114,36 +129,42 @@ func CompareWithLen64(n uint64) bool { func Len32(n uint32) int { // amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ" // amd64/v3: "LZCNTL" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZW" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x: "CNTLZW" + // riscv64/rva22u64,riscv64/rva23u64:"CLZW","ADDI\t\\$-32" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.Len32(n) } func Len16(n uint16) int { // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" // amd64/v3: "LZCNTL" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZV" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"SUBC","CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.Len16(n) } func Len8(n uint8) int { // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ" // amd64/v3: "LZCNTL" - // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm64:"CLZ" + // arm:"CLZ" // loong64:"CLZV" // mips:"CLZ" - // wasm:"I64Clz" // ppc64x:"SUBC","CNTLZD" + // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" + // s390x:"FLOGR" + // wasm:"I64Clz" return bits.Len8(n) } @@ -451,6 +472,7 @@ func IterateBits64(n uint64) int { for n != 0 { // amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ" // amd64/v3:"TZCNTQ" + // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" i += bits.TrailingZeros64(n) n &= n - 1 } @@ -462,6 +484,7 @@ func IterateBits32(n uint32) int { for n != 0 { // amd64/v1,amd64/v2:"BSFL",-"BTSQ" // amd64/v3:"TZCNTL" + // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" i += bits.TrailingZeros32(n) n &= n - 1 } -- cgit v1.3