aboutsummaryrefslogtreecommitdiff
path: root/test/codegen
diff options
context:
space:
mode:
author: Joel Sing <joel@sing.id.au> 2025-03-19 23:57:23 +1100
committer: Joel Sing <joel@sing.id.au> 2025-05-01 05:57:41 -0700
commit: 4d10d4ad849467f12a1a16a5ade26cc03d8f1a1f (patch)
tree: 1dcf434f810ddbd98e82351a31575364b37d70ca /test/codegen
parent: 90e8b8cdaeb76a57604a461a138c59340daed7ef (diff)
download: go-4d10d4ad849467f12a1a16a5ade26cc03d8f1a1f.tar.xz
cmd/compile,internal/cpu,runtime: intrinsify math/bits.OnesCount on riscv64
For riscv64/rva22u64 and above, we can intrinsify math/bits.OnesCount
using the CPOP/CPOPW machine instructions. Since the native Go
implementation of OnesCount is relatively expensive, it is also
worth emitting a check for Zbb support when compiled for rva20u64.

On a Banana Pi F3, with GORISCV64=rva22u64:

              │     oc.1      │                 oc.2                 │
              │    sec/op     │    sec/op      vs base               │
OnesCount-8     16.930n ± 0%     4.389n ± 0%   -74.08% (p=0.000 n=10)
OnesCount8-8     5.642n ± 0%     5.016n ± 0%   -11.10% (p=0.000 n=10)
OnesCount16-8    9.404n ± 0%     5.015n ± 0%   -46.67% (p=0.000 n=10)
OnesCount32-8   13.165n ± 0%     4.388n ± 0%   -66.67% (p=0.000 n=10)
OnesCount64-8   16.300n ± 0%     4.388n ± 0%   -73.08% (p=0.000 n=10)
geomean         11.40n           4.629n        -59.40%

On a Banana Pi F3, compiled with GORISCV64=rva20u64 and with Zbb
detection enabled:

              │     oc.3      │                 oc.4                 │
              │    sec/op     │    sec/op      vs base               │
OnesCount-8     16.930n ± 0%     5.643n ± 0%   -66.67% (p=0.000 n=10)
OnesCount8-8     5.642n ± 0%     5.642n ± 0%         ~ (p=0.447 n=10)
OnesCount16-8   10.030n ± 0%     6.896n ± 0%   -31.25% (p=0.000 n=10)
OnesCount32-8   13.170n ± 0%     5.642n ± 0%   -57.16% (p=0.000 n=10)
OnesCount64-8   16.300n ± 0%     5.642n ± 0%   -65.39% (p=0.000 n=10)
geomean         11.55n           5.873n        -49.16%

On a Banana Pi F3, compiled with GORISCV64=rva20u64 but with Zbb
detection disabled:

              │     oc.3      │                 oc.5                 │
              │    sec/op     │    sec/op      vs base               │
OnesCount-8     16.93n ± 0%      29.47n ± 0%   +74.07% (p=0.000 n=10)
OnesCount8-8    5.642n ± 0%      5.643n ± 0%         ~ (p=0.191 n=10)
OnesCount16-8   10.03n ± 0%      15.05n ± 0%   +50.05% (p=0.000 n=10)
OnesCount32-8   13.17n ± 0%      18.18n ± 0%   +38.04% (p=0.000 n=10)
OnesCount64-8   16.30n ± 0%      21.94n ± 0%   +34.60% (p=0.000 n=10)
geomean         11.55n           15.84n        +37.16%

For hardware without Zbb, this adds ~5ns overhead, while for hardware
with Zbb we achieve a performance gain of up to 11ns. It is worth
noting that OnesCount8 is cheap enough that it is preferable to stick
with the generic version in this case.
Change-Id: Id657e40e0dd1b1ab8cc0fe0f8a68df4c9f2d7da5
Reviewed-on: https://go-review.googlesource.com/c/go/+/660856
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'test/codegen')
-rw-r--r-- test/codegen/mathbits.go 15
1 files changed, 10 insertions, 5 deletions
diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go
index e9dfbb1443..c7ba357d09 100644
--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go
@@ -181,8 +181,9 @@ func OnesCount(n uint) int {
// amd64:"POPCNTQ"
// arm64:"VCNT","VUADDLV"
// loong64:"VPCNTV"
- // s390x:"POPCNT"
// ppc64x:"POPCNTD"
+ // riscv64:"CPOP\t"
+ // s390x:"POPCNT"
// wasm:"I64Popcnt"
return bits.OnesCount(n)
}
@@ -192,8 +193,9 @@ func OnesCount64(n uint64) int {
// amd64:"POPCNTQ"
// arm64:"VCNT","VUADDLV"
// loong64:"VPCNTV"
- // s390x:"POPCNT"
// ppc64x:"POPCNTD"
+ // riscv64:"CPOP\t"
+ // s390x:"POPCNT"
// wasm:"I64Popcnt"
return bits.OnesCount64(n)
}
@@ -203,8 +205,9 @@ func OnesCount32(n uint32) int {
// amd64:"POPCNTL"
// arm64:"VCNT","VUADDLV"
// loong64:"VPCNTW"
- // s390x:"POPCNT"
// ppc64x:"POPCNTW"
+ // riscv64:"CPOPW"
+ // s390x:"POPCNT"
// wasm:"I64Popcnt"
return bits.OnesCount32(n)
}
@@ -214,15 +217,17 @@ func OnesCount16(n uint16) int {
// amd64:"POPCNTL"
// arm64:"VCNT","VUADDLV"
// loong64:"VPCNTH"
- // s390x:"POPCNT"
// ppc64x:"POPCNTW"
+ // riscv64:"CPOP\t"
+ // s390x:"POPCNT"
// wasm:"I64Popcnt"
return bits.OnesCount16(n)
}
func OnesCount8(n uint8) int {
- // s390x:"POPCNT"
// ppc64x:"POPCNTB"
+ // riscv64/rva22u64,riscv64/rva23u64:"CPOP\t"
+ // s390x:"POPCNT"
// wasm:"I64Popcnt"
return bits.OnesCount8(n)
}