aboutsummaryrefslogtreecommitdiff
path: root/test/codegen
diff options
context:
space:
mode:
authorJorropo <jorropo.pgm@gmail.com>2025-07-03 01:35:51 +0200
committerJorropo <jorropo.pgm@gmail.com>2025-07-24 14:42:10 -0700
commitfcd28070fe4fe86b04c760dd7ce5fff2aa63bad5 (patch)
treeb31d02d99517c995438222a9f243fec1e5b5deb3 /test/codegen
parentf32cf8e4b025eee84aa3ec690966fa4e737a7522 (diff)
downloadgo-fcd28070fe4fe86b04c760dd7ce5fff2aa63bad5.tar.xz
cmd/compile: add opt branchelim to rewrite some CondSelect into math
This allows something like: if y { x++ } To be compiled to: MOVBLZX BX, CX ADDQ CX, AX Instead of: LEAQ 1(AX), CX MOVBLZX BL, DX TESTQ DX, DX CMOVQNE CX, AX Across ./make.bash, uniqued per LOC, there are 100 additions and 75 subtractions. See benchmark here: https://go.dev/play/p/DJf5COjwhd_s Either it's a performance no-op or it is faster: goos: linux goarch: amd64 cpu: AMD Ryzen 5 3600 6-Core Processor │ /tmp/old.logs │ /tmp/new.logs │ │ sec/op │ sec/op vs base │ CmovInlineConditionAddLatency-12 0.5443n ± 5% 0.5339n ± 3% -1.90% (p=0.004 n=10) CmovInlineConditionAddThroughputBy6-12 1.492n ± 1% 1.494n ± 1% ~ (p=0.955 n=10) CmovInlineConditionSubLatency-12 0.5419n ± 3% 0.5282n ± 3% -2.52% (p=0.019 n=10) CmovInlineConditionSubThroughputBy6-12 1.587n ± 1% 1.584n ± 2% ~ (p=0.492 n=10) CmovOutlineConditionAddLatency-12 0.5223n ± 1% 0.2639n ± 4% -49.47% (p=0.000 n=10) CmovOutlineConditionAddThroughputBy6-12 1.159n ± 1% 1.097n ± 2% -5.35% (p=0.000 n=10) CmovOutlineConditionSubLatency-12 0.5271n ± 3% 0.2654n ± 2% -49.66% (p=0.000 n=10) CmovOutlineConditionSubThroughputBy6-12 1.053n ± 1% 1.050n ± 1% ~ (p=1.000 n=10) geomean There are other benefits not tested by this benchmark: - the math form is usually a couple bytes shorter (ICACHE) - the math form is usually 0~2 uops shorter (UCACHE) - the math form usually has less register pressure* - the math form can sometimes be optimized further *regalloc rarely finds how it can use fewer registers As far as pass ordering goes there are many possible options, I've decided to reorder branchelim before late opt since: - unlike running exclusively the CondSelect rules after branchelim, some extra optimizations might trigger on the adds or subs. - I don't want to maintain a second generic.rules file of only the stuff that can trigger after branchelim. - rerunning all of opt a third time increases compilation time for little gain. By elimination moving branchelim seems fine. 
Change-Id: I869adf57e4d109948ee157cfc47144445146bafd Reviewed-on: https://go-review.googlesource.com/c/go/+/685676 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'test/codegen')
-rw-r--r--test/codegen/condmove.go28
1 files changed, 25 insertions, 3 deletions
diff --git a/test/codegen/condmove.go b/test/codegen/condmove.go
index 1058910307..95a9d2cd23 100644
--- a/test/codegen/condmove.go
+++ b/test/codegen/condmove.go
@@ -106,7 +106,7 @@ func cmovfloatint2(x, y float64) float64 {
for r >= y {
rfr, rexp := frexp(r)
if rfr < yfr {
- rexp = rexp - 1
+ rexp = rexp - 42
}
// amd64:"CMOVQHI"
// arm64:"CSEL\tMI"
@@ -205,7 +205,7 @@ func cmovinvert6(x, y uint64) uint64 {
func cmovload(a []int, i int, b bool) int {
if b {
- i++
+ i += 42
}
// See issue 26306
// amd64:-"CMOVQNE"
@@ -214,7 +214,7 @@ func cmovload(a []int, i int, b bool) int {
func cmovstore(a []int, i int, b bool) {
if b {
- i++
+ i += 42
}
// amd64:"CMOVQNE"
a[i] = 7
@@ -451,3 +451,25 @@ func cmovzeroreg1(a, b int) int {
// ppc64x:"ISEL\t[$]2, R0, R[0-9]+, R[0-9]+"
return x
}
+
+func cmovmathadd(a uint, b bool) uint {
+	if b {
+		a++ // the checks on the return line expect an add here, not a conditional select
+	}
+	// amd64:"ADDQ", -"CMOV"
+	// arm64:"CSINC", -"CSEL"
+	// ppc64x:"ADD", -"ISEL"
+	// wasm:"Add", -"Select"
+	return a
+}
+
+func cmovmathsub(a uint, b bool) uint {
+	if b {
+		a-- // the checks on the return line expect a subtract here, not a conditional select
+	}
+	// amd64:"SUBQ", -"CMOV"
+	// arm64:"SUB", -"CSEL"
+	// ppc64x:"SUB", -"ISEL"
+	// wasm:"Sub", -"Select"
+	return a
+}