| author | amusman <alexander.musman@gmail.com> | 2024-08-24 00:25:32 +0300 |
|---|---|---|
| committer | Keith Randall <khr@golang.org> | 2026-02-05 11:50:32 -0800 |
| commit | c34b99a5e20307a55a047543f6b48d8a28d830b5 (patch) | |
| tree | 770109a6071b6bb884a6fa789418dc222a0efea4 /src/cmd | |
| parent | 99d7121934a9cfa7963d3a9bfd840779fd2869f6 (diff) | |
| download | go-c34b99a5e20307a55a047543f6b48d8a28d830b5.tar.xz | |
cmd/compile: CSE loads across disjoint stores
Enable partitioning memory-using instructions, such as regular loads,
together across disjoint memory-defining instructions (currently only
stores). Keep a memory table that records the appropriate memory
definition for each supported memory-using instruction. This allows
more load instructions to be matched, and could be improved further by
handling additional cases in the common utility `disjoint`.
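As a hypothetical illustration (this snippet is not from the CL), consider two loads of the same field separated by a store that `disjoint` can prove writes a non-overlapping address; with this change, cse can place the two loads in one equivalence class and remove the second:

```go
package p

type pair struct{ a, b int64 }

// The store to p.b writes [p+8, p+16), which cannot overlap the load
// of p.a from [p+0, p+8). cse may now remap the second load's memory
// argument past the disjoint store and merge it with the first load.
func f(p *pair) int64 {
	x := p.a // load #1
	p.b = 7  // memory def that is disjoint from the loads' address
	y := p.a // load #2: same pointer, effectively the same memory state
	return x + y
}
```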
Overall, this change slightly reduces code size. For example, here is
the code size difference on linux/arm64:
```
Executable    Old .text    New .text    Change
-------------------------------------------------
asm             1963124      1961972    -0.06%
cgo             1734228      1733140    -0.06%
compile         8948740      8948516    -0.00%
cover           1864500      1863588    -0.05%
link            2555700      2552676    -0.12%
preprofile       863636       862980    -0.08%
vet             2869220      2867556    -0.06%
```
Some benchmark results from a local run:
```
shortname: aws_jsonutil
pkg: github.com/aws/aws-sdk-go/private/protocol/json/jsonutil
              │ Orig-rand.stdout │        Cse1-rand.stdout         │
              │      sec/op      │    sec/op     vs base           │
BuildJSON-8         1.511µ ± 0%    1.516µ ± 0%  +0.33% (p=0.003 n=15)
StdlibJSON-8        1.254µ ± 0%    1.227µ ± 0%  -2.15% (p=0.000 n=15)
geomean             1.377µ         1.364µ       -0.92%
```

```
shortname: kanzi
toolchain: Cse1-rand
goos: linux
goarch: arm64
pkg: github.com/flanglet/kanzi-go/benchmark
          │ Orig-rand.stdout │        Cse1-rand.stdout         │
          │      sec/op      │    sec/op     vs base           │
FPAQ-4         26.11m ± 0%      25.61m ± 0%  -1.93% (p=0.000 n=10)
LZ-4           1.461m ± 1%      1.445m ± 1%       ~ (p=0.105 n=10)
MTFT-4         1.197m ± 0%      1.201m ± 0%  +0.36% (p=0.000 n=10)
geomean        3.574m           3.543m       -0.88%
```
This change also tends to increase the number of NilChecks matched,
which previously caused statement boundary marks to be moved from an
OpNilCheck to its user instruction (such as an OpOffPtr), where they
are more likely to be lost during subsequent optimizations - see e.g.
#75249. Because cse does not remove the nil checks, we also update it
here to leave the statement boundary marks on OpNilCheck - the later
related optimizations can handle that better.
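For intuition, here is a hypothetical function (again not from the CL) where both field accesses begin with a nil check of p in SSA; once cse matches the two OpNilChecks, keeping the statement boundary mark on the OpNilCheck itself, rather than pushing it onto a user such as the OpOffPtr that computes the field address, makes the mark likelier to survive later rewrites:

```go
package p

type T struct{ x, y int }

// Each access below lowers to roughly NilCheck(p) -> OffPtr -> Load.
// cse can match the two nil checks; since it does not remove them,
// their statement boundary marks now stay on the OpNilCheck instead of
// moving to the OpOffPtr, which later passes often fold away into an
// addressing mode (losing the mark).
func get(p *T) int {
	a := p.x
	b := p.y
	return a + b
}
```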
Change-Id: Iddf4aa13d44de78ffecf6ccb4c0fd1d35533e844
Reviewed-on: https://go-review.googlesource.com/c/go/+/608115
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Diffstat (limited to 'src/cmd')
| -rw-r--r-- | src/cmd/compile/internal/ssa/cse.go | 131 |
| -rw-r--r-- | src/cmd/compile/internal/ssa/rewrite.go | 3 |
2 files changed, 125 insertions, 9 deletions
```diff
diff --git a/src/cmd/compile/internal/ssa/cse.go b/src/cmd/compile/internal/ssa/cse.go
index 28eb4f76a9..44a6ad9f9f 100644
--- a/src/cmd/compile/internal/ssa/cse.go
+++ b/src/cmd/compile/internal/ssa/cse.go
@@ -85,6 +85,11 @@ func cse(f *Func) {
 		pNum++
 	}
 
+	// Keep a table to remap memory operand of any memory user which does not have a memory result (such as a regular load),
+	// to some dominating memory operation, skipping the memory defs that do not alias with it.
+	memTable := f.Cache.allocInt32Slice(f.NumValues())
+	defer f.Cache.freeInt32Slice(memTable)
+
 	// Split equivalence classes at points where they have
 	// non-equivalent arguments.  Repeat until we can't find any
 	// more splits.
@@ -108,12 +113,23 @@ func cse(f *Func) {
 
 		// Sort by eq class of arguments.
 		slices.SortFunc(e, func(v, w *Value) int {
+			_, idxMem, _, _ := isMemUser(v)
 			for i, a := range v.Args {
-				b := w.Args[i]
-				if valueEqClass[a.ID] < valueEqClass[b.ID] {
+				var aId, bId ID
+				if i != idxMem {
+					b := w.Args[i]
+					aId = a.ID
+					bId = b.ID
+				} else {
+					// A memory user's mem argument may be remapped to allow matching
+					// identical load-like instructions across disjoint stores.
+					aId, _ = getEffectiveMemoryArg(memTable, v)
+					bId, _ = getEffectiveMemoryArg(memTable, w)
+				}
+				if valueEqClass[aId] < valueEqClass[bId] {
 					return -1
 				}
-				if valueEqClass[a.ID] > valueEqClass[b.ID] {
+				if valueEqClass[aId] > valueEqClass[bId] {
 					return +1
 				}
 			}
@@ -126,12 +142,23 @@ func cse(f *Func) {
 			v, w := e[j-1], e[j]
 			// Note: commutative args already correctly ordered by byArgClass.
 			eqArgs := true
+			_, idxMem, _, _ := isMemUser(v)
 			for k, a := range v.Args {
 				if v.Op == OpLocalAddr && k == 1 {
 					continue
 				}
-				b := w.Args[k]
-				if valueEqClass[a.ID] != valueEqClass[b.ID] {
+				var aId, bId ID
+				if k != idxMem {
+					b := w.Args[k]
+					aId = a.ID
+					bId = b.ID
+				} else {
+					// A memory user's mem argument may be remapped to allow matching
+					// identical load-like instructions across disjoint stores.
+					aId, _ = getEffectiveMemoryArg(memTable, v)
+					bId, _ = getEffectiveMemoryArg(memTable, w)
+				}
+				if valueEqClass[aId] != valueEqClass[bId] {
 					eqArgs = false
 					break
 				}
@@ -180,10 +207,19 @@ func cse(f *Func) {
 	defer f.Cache.freeValueSlice(rewrite)
 	for _, e := range partition {
 		slices.SortFunc(e, func(v, w *Value) int {
-			c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block))
-			if c != 0 {
+			if c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block)); c != 0 {
 				return c
 			}
+			if _, _, _, ok := isMemUser(v); ok {
+				// Additional ordering among the memory users within one block: prefer the earliest
+				// possible value among the set of equivalent values, that is the one with the lowest
+				// skip count (lowest number of memory defs skipped until their common def).
+				_, vSkips := getEffectiveMemoryArg(memTable, v)
+				_, wSkips := getEffectiveMemoryArg(memTable, w)
+				if c := cmp.Compare(vSkips, wSkips); c != 0 {
+					return c
+				}
+			}
 			if v.Op == OpLocalAddr {
 				// compare the memory args for OpLocalAddrs in the same block
 				vm := v.Args[1]
@@ -254,7 +290,7 @@ func cse(f *Func) {
 		for _, v := range b.Values {
 			for i, w := range v.Args {
 				if x := rewrite[w.ID]; x != nil {
-					if w.Pos.IsStmt() == src.PosIsStmt {
+					if w.Pos.IsStmt() == src.PosIsStmt && w.Op != OpNilCheck {
 						// about to lose a statement marker, w
 						// w is an input to v; if they're in the same block
 						// and the same line, v is a good-enough new statement boundary.
@@ -420,3 +456,82 @@ func cmpVal(v, w *Value, auxIDs auxmap) types.Cmp {
 
 	return types.CMPeq
 }
+
+// Query if the given instruction only uses "memory" argument and we may try to skip some memory "defs" if they do not alias with its address.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemUser(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpLoad:
+		return 0, 1, v.Type.Size(), true
+	case OpNilCheck:
+		return 0, 1, 0, true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// Query if the given "memory"-defining instruction's memory destination can be analyzed for aliasing with a memory "user" instructions.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemDef(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpStore:
+		return 0, 2, auxToType(v.Aux).Size(), true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// Mem table keeps memTableSkipBits lower bits to store the number of skips of "memory" operand
+// and the rest to store the ID of the destination "memory"-producing instruction.
+const memTableSkipBits = 8
+
+// The maximum ID value we are able to store in the memTable, otherwise fall back to v.ID
+const maxId = ID(1<<(31-memTableSkipBits)) - 1
+
+// Return the first possibly-aliased store along the memory chain starting at v's memory argument and the number of not-aliased stores skipped.
+func getEffectiveMemoryArg(memTable []int32, v *Value) (ID, uint32) {
+	if code := uint32(memTable[v.ID]); code != 0 {
+		return ID(code >> memTableSkipBits), code & ((1 << memTableSkipBits) - 1)
+	}
+	if idxPtr, idxMem, width, ok := isMemUser(v); ok {
+		// TODO: We could early return some predefined value if width==0
+		memId := v.Args[idxMem].ID
+		if memId > maxId {
+			return memId, 0
+		}
+		mem, skips := skipDisjointMemDefs(v, idxPtr, idxMem, width)
+		if mem.ID <= maxId {
+			memId = mem.ID
+		} else {
+			skips = 0 // avoid the skip
+		}
+		memTable[v.ID] = int32(memId<<memTableSkipBits) | int32(skips)
+		return memId, skips
+	} else {
+		v.Block.Func.Fatalf("expected memory user instruction: %v", v.LongString())
+	}
+	return 0, 0
+}
+
+// Find a memory def that's not trivially disjoint with the user instruction, count the number
+// of "skips" along the path. Return the corresponding memory def's value and the number of skips.
+func skipDisjointMemDefs(user *Value, idxUserPtr, idxUserMem int, useWidth int64) (*Value, uint32) {
+	usePtr, mem := user.Args[idxUserPtr], user.Args[idxUserMem]
+	const maxSkips = (1 << memTableSkipBits) - 1
+	var skips uint32
+	for skips = 0; skips < maxSkips; skips++ {
+		if idxPtr, idxMem, width, ok := isMemDef(mem); ok {
+			if mem.Args[idxMem].Uses > 50 {
+				// Skipping a memory def with a lot of uses may potentially increase register pressure.
+				break
+			}
+			defPtr := mem.Args[idxPtr]
+			if disjoint(defPtr, width, usePtr, useWidth) {
+				mem = mem.Args[idxMem]
+				continue
+			}
+		}
+		break
+	}
+	return mem, skips
+}
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index 4b13d65618..04989d93c1 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -944,7 +944,8 @@ func disjointTypes(t1 *types.Type, t2 *types.Type) bool {
 	}
 
 	if !t1.IsPtr() || !t2.IsPtr() {
-		panic("disjointTypes: one of arguments is not a pointer")
+		// Treat non-pointer types (such as TFUNC, TMAP, uintptr) conservatively.
+		return false
 	}
 
 	t1 = t1.Elem()
```
