| author | amusman <alexander.musman@gmail.com> | 2024-08-24 00:25:32 +0300 |
|---|---|---|
| committer | Keith Randall <khr@golang.org> | 2026-02-05 11:50:32 -0800 |
| commit | c34b99a5e20307a55a047543f6b48d8a28d830b5 (patch) | |
| tree | 770109a6071b6bb884a6fa789418dc222a0efea4 /src/cmd | |
| parent | 99d7121934a9cfa7963d3a9bfd840779fd2869f6 (diff) | |
| download | go-c34b99a5e20307a55a047543f6b48d8a28d830b5.tar.xz | |
cmd/compile: CSE loads across disjoint stores
Enable partitioning memory-using instructions, such as regular loads,
together across disjoint memory-defining instructions (currently only
stores). Keep a memory table that records the appropriate memory
definition for each supported memory-using instruction. This allows
more load instructions to be matched, and could be improved further by
handling additional cases in the common utility `disjoint`.
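As a hypothetical illustration (this snippet is not from the CL), consider two loads of the same field separated by a store that `disjoint` can prove writes a non-overlapping address; with this change, cse can place the two loads in one equivalence class and remove the second:

```go
package p

type pair struct{ a, b int64 }

// The store to p.b writes [p+8, p+16), which cannot overlap the load
// of p.a from [p+0, p+8). cse may now remap the second load's memory
// argument past the disjoint store and merge it with the first load.
func f(p *pair) int64 {
	x := p.a // load #1
	p.b = 7  // memory def that is disjoint from the loads' address
	y := p.a // load #2: same pointer, effectively the same memory state
	return x + y
}
```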
Overall, this change slightly reduces code size. For example, here is
the code size difference on linux/arm64:
```
Executable    Old .text    New .text    Change
-------------------------------------------------
asm             1963124      1961972    -0.06%
cgo             1734228      1733140    -0.06%
compile         8948740      8948516    -0.00%
cover           1864500      1863588    -0.05%
link            2555700      2552676    -0.12%
preprofile       863636       862980    -0.08%
vet             2869220      2867556    -0.06%
```
Some benchmark results from a local run:
```
shortname: aws_jsonutil
pkg: github.com/aws/aws-sdk-go/private/protocol/json/jsonutil
              │ Orig-rand.stdout │        Cse1-rand.stdout         │
              │      sec/op      │    sec/op     vs base           │
BuildJSON-8         1.511µ ± 0%    1.516µ ± 0%  +0.33% (p=0.003 n=15)
StdlibJSON-8        1.254µ ± 0%    1.227µ ± 0%  -2.15% (p=0.000 n=15)
geomean             1.377µ         1.364µ       -0.92%
```

```
shortname: kanzi
toolchain: Cse1-rand
goos: linux
goarch: arm64
pkg: github.com/flanglet/kanzi-go/benchmark
          │ Orig-rand.stdout │        Cse1-rand.stdout         │
          │      sec/op      │    sec/op     vs base           │
FPAQ-4         26.11m ± 0%      25.61m ± 0%  -1.93% (p=0.000 n=10)
LZ-4           1.461m ± 1%      1.445m ± 1%       ~ (p=0.105 n=10)
MTFT-4         1.197m ± 0%      1.201m ± 0%  +0.36% (p=0.000 n=10)
geomean        3.574m           3.543m       -0.88%
```
This change also tends to increase the number of NilChecks matched,
which previously caused statement boundary marks to be moved from an
OpNilCheck to its user instruction (such as an OpOffPtr), where they
are more likely to be lost during subsequent optimizations - see e.g.
#75249. Because cse does not remove the nil checks, we also update it
here to leave the statement boundary marks on OpNilCheck - the later
related optimizations can handle that better.
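For intuition, here is a hypothetical function (again not from the CL) where both field accesses begin with a nil check of p in SSA; once cse matches the two OpNilChecks, keeping the statement boundary mark on the OpNilCheck itself, rather than pushing it onto a user such as the OpOffPtr that computes the field address, makes the mark likelier to survive later rewrites:

```go
package p

type T struct{ x, y int }

// Each access below lowers to roughly NilCheck(p) -> OffPtr -> Load.
// cse can match the two nil checks; since it does not remove them,
// their statement boundary marks now stay on the OpNilCheck instead of
// moving to the OpOffPtr, which later passes often fold away into an
// addressing mode (losing the mark).
func get(p *T) int {
	a := p.x
	b := p.y
	return a + b
}
```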
Change-Id: Iddf4aa13d44de78ffecf6ccb4c0fd1d35533e844
Reviewed-on: https://go-review.googlesource.com/c/go/+/608115
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Diffstat (limited to 'src/cmd')
| -rw-r--r-- | src/cmd/compile/internal/ssa/cse.go | 131 |
| -rw-r--r-- | src/cmd/compile/internal/ssa/rewrite.go | 3 |
2 files changed, 125 insertions, 9 deletions
```diff
diff --git a/src/cmd/compile/internal/ssa/cse.go b/src/cmd/compile/internal/ssa/cse.go
index 28eb4f76a9..44a6ad9f9f 100644
--- a/src/cmd/compile/internal/ssa/cse.go
+++ b/src/cmd/compile/internal/ssa/cse.go
@@ -85,6 +85,11 @@ func cse(f *Func) {
 		pNum++
 	}
 
+	// Keep a table to remap memory operand of any memory user which does not have a memory result (such as a regular load),
+	// to some dominating memory operation, skipping the memory defs that do not alias with it.
+	memTable := f.Cache.allocInt32Slice(f.NumValues())
+	defer f.Cache.freeInt32Slice(memTable)
+
 	// Split equivalence classes at points where they have
 	// non-equivalent arguments.  Repeat until we can't find any
 	// more splits.
@@ -108,12 +113,23 @@ func cse(f *Func) {
 
 		// Sort by eq class of arguments.
 		slices.SortFunc(e, func(v, w *Value) int {
+			_, idxMem, _, _ := isMemUser(v)
 			for i, a := range v.Args {
-				b := w.Args[i]
-				if valueEqClass[a.ID] < valueEqClass[b.ID] {
+				var aId, bId ID
+				if i != idxMem {
+					b := w.Args[i]
+					aId = a.ID
+					bId = b.ID
+				} else {
+					// A memory user's mem argument may be remapped to allow matching
+					// identical load-like instructions across disjoint stores.
+					aId, _ = getEffectiveMemoryArg(memTable, v)
+					bId, _ = getEffectiveMemoryArg(memTable, w)
+				}
+				if valueEqClass[aId] < valueEqClass[bId] {
 					return -1
 				}
-				if valueEqClass[a.ID] > valueEqClass[b.ID] {
+				if valueEqClass[aId] > valueEqClass[bId] {
 					return +1
 				}
 			}
@@ -126,12 +142,23 @@ func cse(f *Func) {
 			v, w := e[j-1], e[j]
 			// Note: commutative args already correctly ordered by byArgClass.
 			eqArgs := true
+			_, idxMem, _, _ := isMemUser(v)
 			for k, a := range v.Args {
 				if v.Op == OpLocalAddr && k == 1 {
 					continue
 				}
-				b := w.Args[k]
-				if valueEqClass[a.ID] != valueEqClass[b.ID] {
+				var aId, bId ID
+				if k != idxMem {
+					b := w.Args[k]
+					aId = a.ID
+					bId = b.ID
+				} else {
+					// A memory user's mem argument may be remapped to allow matching
+					// identical load-like instructions across disjoint stores.
+					aId, _ = getEffectiveMemoryArg(memTable, v)
+					bId, _ = getEffectiveMemoryArg(memTable, w)
+				}
+				if valueEqClass[aId] != valueEqClass[bId] {
 					eqArgs = false
 					break
 				}
@@ -180,10 +207,19 @@ func cse(f *Func) {
 	defer f.Cache.freeValueSlice(rewrite)
 	for _, e := range partition {
 		slices.SortFunc(e, func(v, w *Value) int {
-			c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block))
-			if c != 0 {
+			if c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block)); c != 0 {
 				return c
 			}
+			if _, _, _, ok := isMemUser(v); ok {
+				// Additional ordering among the memory users within one block: prefer the earliest
+				// possible value among the set of equivalent values, that is the one with the lowest
+				// skip count (lowest number of memory defs skipped until their common def).
+				_, vSkips := getEffectiveMemoryArg(memTable, v)
+				_, wSkips := getEffectiveMemoryArg(memTable, w)
+				if c := cmp.Compare(vSkips, wSkips); c != 0 {
+					return c
+				}
+			}
 			if v.Op == OpLocalAddr {
 				// compare the memory args for OpLocalAddrs in the same block
 				vm := v.Args[1]
@@ -254,7 +290,7 @@ func cse(f *Func) {
 		for _, v := range b.Values {
 			for i, w := range v.Args {
 				if x := rewrite[w.ID]; x != nil {
-					if w.Pos.IsStmt() == src.PosIsStmt {
+					if w.Pos.IsStmt() == src.PosIsStmt && w.Op != OpNilCheck {
 						// about to lose a statement marker, w
 						// w is an input to v; if they're in the same block
 						// and the same line, v is a good-enough new statement boundary.
@@ -420,3 +456,82 @@ func cmpVal(v, w *Value, auxIDs auxmap) types.Cmp {
 
 	return types.CMPeq
 }
+
+// Query if the given instruction only uses "memory" argument and we may try to skip some memory "defs" if they do not alias with its address.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemUser(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpLoad:
+		return 0, 1, v.Type.Size(), true
+	case OpNilCheck:
+		return 0, 1, 0, true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// Query if the given "memory"-defining instruction's memory destination can be analyzed for aliasing with a memory "user" instructions.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemDef(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpStore:
+		return 0, 2, auxToType(v.Aux).Size(), true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// Mem table keeps memTableSkipBits lower bits to store the number of skips of "memory" operand
+// and the rest to store the ID of the destination "memory"-producing instruction.
+const memTableSkipBits = 8
+
+// The maximum ID value we are able to store in the memTable, otherwise fall back to v.ID
+const maxId = ID(1<<(31-memTableSkipBits)) - 1
+
+// Return the first possibly-aliased store along the memory chain starting at v's memory argument and the number of not-aliased stores skipped.
+func getEffectiveMemoryArg(memTable []int32, v *Value) (ID, uint32) {
+	if code := uint32(memTable[v.ID]); code != 0 {
+		return ID(code >> memTableSkipBits), code & ((1 << memTableSkipBits) - 1)
+	}
+	if idxPtr, idxMem, width, ok := isMemUser(v); ok {
+		// TODO: We could early return some predefined value if width==0
+		memId := v.Args[idxMem].ID
+		if memId > maxId {
+			return memId, 0
+		}
+		mem, skips := skipDisjointMemDefs(v, idxPtr, idxMem, width)
+		if mem.ID <= maxId {
+			memId = mem.ID
+		} else {
+			skips = 0 // avoid the skip
+		}
+		memTable[v.ID] = int32(memId<<memTableSkipBits) | int32(skips)
+		return memId, skips
+	} else {
+		v.Block.Func.Fatalf("expected memory user instruction: %v", v.LongString())
+	}
+	return 0, 0
+}
+
+// Find a memory def that's not trivially disjoint with the user instruction, count the number
+// of "skips" along the path. Return the corresponding memory def's value and the number of skips.
+func skipDisjointMemDefs(user *Value, idxUserPtr, idxUserMem int, useWidth int64) (*Value, uint32) {
+	usePtr, mem := user.Args[idxUserPtr], user.Args[idxUserMem]
+	const maxSkips = (1 << memTableSkipBits) - 1
+	var skips uint32
+	for skips = 0; skips < maxSkips; skips++ {
+		if idxPtr, idxMem, width, ok := isMemDef(mem); ok {
+			if mem.Args[idxMem].Uses > 50 {
+				// Skipping a memory def with a lot of uses may potentially increase register pressure.
+				break
+			}
+			defPtr := mem.Args[idxPtr]
+			if disjoint(defPtr, width, usePtr, useWidth) {
+				mem = mem.Args[idxMem]
+				continue
+			}
+		}
+		break
+	}
+	return mem, skips
+}
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index 4b13d65618..04989d93c1 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -944,7 +944,8 @@ func disjointTypes(t1 *types.Type, t2 *types.Type) bool {
 	}
 
 	if !t1.IsPtr() || !t2.IsPtr() {
-		panic("disjointTypes: one of arguments is not a pointer")
+		// Treat non-pointer types (such as TFUNC, TMAP, uintptr) conservatively.
+		return false
 	}
 
 	t1 = t1.Elem()
```
