From fcd28070fe4fe86b04c760dd7ce5fff2aa63bad5 Mon Sep 17 00:00:00 2001
From: Jorropo <jorropo.pgm@gmail.com>
Date: Thu, 3 Jul 2025 01:35:51 +0200
Subject: cmd/compile: add opt branchelim to rewrite some CondSelect into math
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows something like:
  if y { x++ }

To be compiled to:
  MOVBLZX BX, CX
  ADDQ CX, AX

Instead of:
  LEAQ    1(AX), CX
  MOVBLZX BL, DX
  TESTQ   DX, DX
  CMOVQNE CX, AX

During ./make.bash, uniqued per LOC, there are 100 additions and 75 subtractions.

See benchmark here: https://go.dev/play/p/DJf5COjwhd_s

Either it's a performance no-op or it is faster:

  goos: linux
  goarch: amd64
  cpu: AMD Ryzen 5 3600 6-Core Processor
                                          │ /tmp/old.logs │            /tmp/new.logs             │
                                          │    sec/op     │    sec/op     vs base                │
  CmovInlineConditionAddLatency-12           0.5443n ± 5%   0.5339n ± 3%   -1.90% (p=0.004 n=10)
  CmovInlineConditionAddThroughputBy6-12      1.492n ± 1%    1.494n ± 1%        ~ (p=0.955 n=10)
  CmovInlineConditionSubLatency-12           0.5419n ± 3%   0.5282n ± 3%   -2.52% (p=0.019 n=10)
  CmovInlineConditionSubThroughputBy6-12      1.587n ± 1%    1.584n ± 2%        ~ (p=0.492 n=10)
  CmovOutlineConditionAddLatency-12          0.5223n ± 1%   0.2639n ± 4%  -49.47% (p=0.000 n=10)
  CmovOutlineConditionAddThroughputBy6-12     1.159n ± 1%    1.097n ± 2%   -5.35% (p=0.000 n=10)
  CmovOutlineConditionSubLatency-12          0.5271n ± 3%   0.2654n ± 2%  -49.66% (p=0.000 n=10)
  CmovOutlineConditionSubThroughputBy6-12     1.053n ± 1%    1.050n ± 1%        ~ (p=1.000 n=10)
  geomean

There are other benefits not tested by this benchmark:
- the math form is usually a couple bytes shorter (ICACHE)
- the math form is usually 0~2 uops shorter (UCACHE)
- the math form usually has less register pressure*
- the math form can sometimes be optimized further

*regalloc rarely finds how it can use fewer registers

As far as pass ordering goes there are many possible options,
I've decided to reorder branchelim before late opt since:
- unlike running exclusively the CondSelect rules after branchelim,
  some extra optimizations might trigger on the adds or subs.
- I don't want to maintain a second generic.rules file of only the stuff
  that can trigger after branchelim.
- rerunning all of opt a third time increases compilation time for little gain.

By elimination, moving branchelim seems fine.

Change-Id: I869adf57e4d109948ee157cfc47144445146bafd
Reviewed-on: https://go-review.googlesource.com/c/go/+/685676
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
---
 test/codegen/condmove.go | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'test/codegen')

diff --git a/test/codegen/condmove.go b/test/codegen/condmove.go
index 1058910307..95a9d2cd23 100644
--- a/test/codegen/condmove.go
+++ b/test/codegen/condmove.go
@@ -106,7 +106,7 @@ func cmovfloatint2(x, y float64) float64 {
 	for r >= y {
 		rfr, rexp := frexp(r)
 		if rfr < yfr {
-			rexp = rexp - 1
+			rexp = rexp - 42
 		}
 		// amd64:"CMOVQHI"
 		// arm64:"CSEL\tMI"
@@ -205,7 +205,7 @@ func cmovinvert6(x, y uint64) uint64 {
 
 func cmovload(a []int, i int, b bool) int {
 	if b {
-		i++
+		i += 42
 	}
 	// See issue 26306
 	// amd64:-"CMOVQNE"
@@ -214,7 +214,7 @@ func cmovload(a []int, i int, b bool) int {
 
 func cmovstore(a []int, i int, b bool) {
 	if b {
-		i++
+		i += 42
 	}
 	// amd64:"CMOVQNE"
 	a[i] = 7
@@ -451,3 +451,25 @@ func cmovzeroreg1(a, b int) int {
 	// ppc64x:"ISEL\t[$]2, R0, R[0-9]+, R[0-9]+"
 	return x
 }
+
+func cmovmathadd(a uint, b bool) uint {
+	if b {
+		a++
+	}
+	// amd64:"ADDQ", -"CMOV"
+	// arm64:"CSINC", -"CSEL"
+	// ppc64x:"ADD", -"ISEL"
+	// wasm:"Add", -"Select"
+	return a
+}
+
+func cmovmathsub(a uint, b bool) uint {
+	if b {
+		a--
+	}
+	// amd64:"SUBQ", -"CMOV"
+	// arm64:"SUB", -"CSEL"
+	// ppc64x:"SUB", -"ISEL"
+	// wasm:"Sub", -"Select"
+	return a
+}
-- 
cgit v1.3