aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/internal
diff options
context:
space:
mode:
author: Keith Randall <khr@golang.org> 2025-05-17 15:05:56 -0700
committer: Keith Randall <khr@golang.org> 2025-10-06 14:11:41 -0700
commit: 719dfcf8a8478d70360bf3c34c0e920be7b32994 (patch)
tree: d58aaf3289de3bb18901e34b336da46b425f8075 /src/cmd/internal
parent: f3312124c2370c2f64a7f9ad29732ec30209647a (diff)
download: go-719dfcf8a8478d70360bf3c34c0e920be7b32994.tar.xz
cmd/compile: redo arm64 LR/FP save and restore
Instead of storing LR (the return address) at 0(SP) and the FP
(parent's frame pointer) at -8(SP), store them at framesize-8(SP)
and framesize-16(SP), respectively.

We push and pop data onto the stack such that we're never accessing
anything below SP.

The prolog/epilog lengths are unchanged (3 insns for a typical prolog,
2 for a typical epilog).

We use 8 bytes more per frame.

Typical prologue:

    STP.W (FP, LR), -16(SP)
    MOVD SP, FP
    SUB $C, SP

Typical epilogue:

    ADD $C, SP
    LDP.P 16(SP), (FP, LR)
    RET

The previous word where we stored LR, at 0(SP), is now unused.
We could repurpose that slot for storing a local variable.

The new prolog and epilog instructions are recognized by libunwind,
so pc-sampling tools like perf should now be accurate. (TODO: except
maybe after the first RET instruction? Have to look into that.)

Update #73753 (fixes, for arm64)
Update #57302 (Quim thinks this will help on that issue)

Change-Id: I4800036a9a9a08aaaf35d9f99de79a36cf37ebb8
Reviewed-on: https://go-review.googlesource.com/c/go/+/674615
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Diffstat (limited to 'src/cmd/internal')
-rw-r--r-- src/cmd/internal/obj/arm64/asm7.go | 12
-rw-r--r-- src/cmd/internal/obj/arm64/obj7.go | 314
2 files changed, 114 insertions, 212 deletions
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 743d09a319..281d705a3e 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -51,7 +51,6 @@ type ctxt7 struct {
blitrl *obj.Prog
elitrl *obj.Prog
autosize int32
- extrasize int32
instoffset int64
pc int64
pool struct {
@@ -1122,8 +1121,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
ctxt.Diag("arm64 ops not initialized, call arm64.buildop first")
}
- c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset & 0xffffffff), extrasize: int32(p.To.Offset >> 32)}
- p.To.Offset &= 0xffffffff // extrasize is no longer needed
+ c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset)}
// Process literal pool and allocate initial program counter for each Prog, before
// generating branch veneers.
@@ -2119,8 +2117,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int {
// a.Offset is still relative to pseudo-SP.
a.Reg = obj.REG_NONE
}
- // The frame top 8 or 16 bytes are for FP
- c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize)
+ // The frame top 16 bytes are for LR/FP
+ c.instoffset = int64(c.autosize) + a.Offset - extrasize
return autoclass(c.instoffset)
case obj.NAME_PARAM:
@@ -2180,8 +2178,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int {
// a.Offset is still relative to pseudo-SP.
a.Reg = obj.REG_NONE
}
- // The frame top 8 or 16 bytes are for FP
- c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize)
+ // The frame top 16 bytes are for LR/FP
+ c.instoffset = int64(c.autosize) + a.Offset - extrasize
case obj.NAME_PARAM:
if a.Reg == REGSP {
diff --git a/src/cmd/internal/obj/arm64/obj7.go b/src/cmd/internal/obj/arm64/obj7.go
index 2583e46354..a697426145 100644
--- a/src/cmd/internal/obj/arm64/obj7.go
+++ b/src/cmd/internal/obj/arm64/obj7.go
@@ -36,7 +36,6 @@ import (
"cmd/internal/src"
"cmd/internal/sys"
"internal/abi"
- "internal/buildcfg"
"log"
"math"
)
@@ -472,6 +471,8 @@ func (c *ctxt7) rewriteToUseGot(p *obj.Prog) {
obj.Nopout(p)
}
+const extrasize = 16 // space needed in the frame for LR+FP
+
func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
if cursym.Func().Text == nil || cursym.Func().Text.Link == nil {
return
@@ -521,33 +522,26 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
c.autosize = int32(textstksiz)
if p.Mark&LEAF != 0 && c.autosize == 0 {
- // A leaf function with no locals has no frame.
+ // A leaf function with no locals needs no frame.
p.From.Sym.Set(obj.AttrNoFrame, true)
}
if !p.From.Sym.NoFrame() {
// If there is a stack frame at all, it includes
- // space to save the LR.
+ // space for the (now unused) word at [SP:SP+8].
c.autosize += 8
}
+ // Round up to a multiple of 16.
+ c.autosize += (-c.autosize) & 15
+
if c.autosize != 0 {
- extrasize := int32(0)
- if c.autosize%16 == 8 {
- // Allocate extra 8 bytes on the frame top to save FP
- extrasize = 8
- } else if c.autosize&(16-1) == 0 {
- // Allocate extra 16 bytes to save FP for the old frame whose size is 8 mod 16
- extrasize = 16
- } else {
- c.ctxt.Diag("%v: unaligned frame size %d - must be 16 aligned", p, c.autosize-8)
- }
+ // Allocate an extra 16 bytes at the top of the frame
+ // to save LR+FP.
c.autosize += extrasize
c.cursym.Func().Locals += extrasize
- // low 32 bits for autosize
- // high 32 bits for extrasize
- p.To.Offset = int64(c.autosize) | int64(extrasize)<<32
+ p.To.Offset = int64(c.autosize)
} else {
// NOFRAME
p.To.Offset = 0
@@ -580,120 +574,72 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
var prologueEnd *obj.Prog
aoffset := c.autosize
- if aoffset > 0xf0 {
- // MOVD.W offset variant range is -0x100 to 0xf8, SP should be 16-byte aligned.
- // so the maximum aoffset value is 0xf0.
- aoffset = 0xf0
+ if aoffset < 16 {
+ log.Fatalf("aoffset too small %d", aoffset)
}
- // Frame is non-empty. Make sure to save link register, even if
- // it is a leaf function, so that traceback works.
q = p
- if c.autosize > aoffset {
- // Frame size is too large for a MOVD.W instruction. Store the frame pointer
- // register and link register before decrementing SP, so if a signal comes
- // during the execution of the function prologue, the traceback code will
- // not see a half-updated stack frame.
-
- // SUB $autosize, RSP, R20
- q1 = obj.Appendp(q, c.newprog)
- q1.Pos = p.Pos
- q1.As = ASUB
- q1.From.Type = obj.TYPE_CONST
- q1.From.Offset = int64(c.autosize)
- q1.Reg = REGSP
- q1.To.Type = obj.TYPE_REG
- q1.To.Reg = REG_R20
- prologueEnd = q1
-
- // STP (R29, R30), -8(R20)
- q1 = obj.Appendp(q1, c.newprog)
- q1.Pos = p.Pos
- q1.As = ASTP
- q1.From.Type = obj.TYPE_REGREG
- q1.From.Reg = REGFP
- q1.From.Offset = REGLINK
- q1.To.Type = obj.TYPE_MEM
- q1.To.Reg = REG_R20
- q1.To.Offset = -8
+ // Store return address and frame pointer at the top of the stack frame.
+ // STP.W (R29, R30), -16(SP)
+ q1 = obj.Appendp(q, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = ASTP
+ q1.From.Type = obj.TYPE_REGREG
+ q1.From.Reg = REGFP
+ q1.From.Offset = REGLINK
+ q1.To.Type = obj.TYPE_MEM
+ q1.To.Reg = REG_RSP
+ q1.To.Offset = -16
+ q1.Scond = C_XPRE
- // This is not async preemptible, as if we open a frame
- // at the current SP, it will clobber the saved LR.
- q1 = c.ctxt.StartUnsafePoint(q1, c.newprog)
+ prologueEnd = q1
- // MOVD R20, RSP
- q1 = obj.Appendp(q1, c.newprog)
- q1.Pos = p.Pos
- q1.As = AMOVD
- q1.From.Type = obj.TYPE_REG
- q1.From.Reg = REG_R20
- q1.To.Type = obj.TYPE_REG
- q1.To.Reg = REGSP
- q1.Spadj = c.autosize
+ // Update frame pointer
+ q1 = obj.Appendp(q1, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = AMOVD
+ q1.From.Type = obj.TYPE_REG
+ q1.From.Reg = REGSP
+ q1.To.Type = obj.TYPE_REG
+ q1.To.Reg = REGFP
- q1 = c.ctxt.EndUnsafePoint(q1, c.newprog, -1)
+ // Allocate additional frame space.
+ adj := aoffset - 16
+ if adj > 0 {
+ // SUB $autosize-16, RSP
+ if adj < 1<<12 {
+ q1 = obj.Appendp(q1, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = ASUB
+ q1.From.Type = obj.TYPE_CONST
+ q1.From.Offset = int64(adj)
+ q1.To.Type = obj.TYPE_REG
+ q1.To.Reg = REGSP
+ } else {
+ // Constant too big for atomic subtract.
+ // Materialize in tmp register first.
+ q1 = obj.Appendp(q1, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = AMOVD
+ q1.From.Type = obj.TYPE_CONST
+ q1.From.Offset = int64(adj)
+ q1.To.Type = obj.TYPE_REG
+ q1.To.Reg = REGTMP
- if buildcfg.GOOS == "ios" {
- // iOS does not support SA_ONSTACK. We will run the signal handler
- // on the G stack. If we write below SP, it may be clobbered by
- // the signal handler. So we save FP and LR after decrementing SP.
- // STP (R29, R30), -8(RSP)
q1 = obj.Appendp(q1, c.newprog)
q1.Pos = p.Pos
- q1.As = ASTP
- q1.From.Type = obj.TYPE_REGREG
- q1.From.Reg = REGFP
- q1.From.Offset = REGLINK
- q1.To.Type = obj.TYPE_MEM
+ q1.As = ASUB
+ q1.From.Type = obj.TYPE_REG
+ q1.From.Reg = REGTMP
+ q1.To.Type = obj.TYPE_REG
q1.To.Reg = REGSP
- q1.To.Offset = -8
}
- } else {
- // small frame, update SP and save LR in a single MOVD.W instruction.
- // So if a signal comes during the execution of the function prologue,
- // the traceback code will not see a half-updated stack frame.
- // Also, on Linux, in a cgo binary we may get a SIGSETXID signal
- // early on before the signal stack is set, as glibc doesn't allow
- // us to block SIGSETXID. So it is important that we don't write below
- // the SP until the signal stack is set.
- // Luckily, all the functions from thread entry to setting the signal
- // stack have small frames.
- q1 = obj.Appendp(q, c.newprog)
- q1.As = AMOVD
- q1.Pos = p.Pos
- q1.From.Type = obj.TYPE_REG
- q1.From.Reg = REGLINK
- q1.To.Type = obj.TYPE_MEM
- q1.Scond = C_XPRE
- q1.To.Offset = int64(-aoffset)
- q1.To.Reg = REGSP
- q1.Spadj = aoffset
-
- prologueEnd = q1
-
- // Frame pointer.
- q1 = obj.Appendp(q1, c.newprog)
- q1.Pos = p.Pos
- q1.As = AMOVD
- q1.From.Type = obj.TYPE_REG
- q1.From.Reg = REGFP
- q1.To.Type = obj.TYPE_MEM
- q1.To.Reg = REGSP
- q1.To.Offset = -8
+ q1.Spadj = adj
}
prologueEnd.Pos = prologueEnd.Pos.WithXlogue(src.PosPrologueEnd)
- q1 = obj.Appendp(q1, c.newprog)
- q1.Pos = p.Pos
- q1.As = ASUB
- q1.From.Type = obj.TYPE_CONST
- q1.From.Offset = 8
- q1.Reg = REGSP
- q1.To.Type = obj.TYPE_REG
- q1.To.Reg = REGFP
-
case obj.ARET:
nocache(p)
if p.From.Type == obj.TYPE_CONST {
@@ -707,105 +653,56 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
}
p.To = obj.Addr{}
aoffset := c.autosize
- if c.cursym.Func().Text.Mark&LEAF != 0 {
- if aoffset != 0 {
- // Restore frame pointer.
- // ADD $framesize-8, RSP, R29
- p.As = AADD
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = int64(c.autosize) - 8
- p.Reg = REGSP
- p.To.Type = obj.TYPE_REG
- p.To.Reg = REGFP
-
- // Pop stack frame.
- // ADD $framesize, RSP, RSP
- p = obj.Appendp(p, c.newprog)
- p.As = AADD
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = int64(c.autosize)
- p.To.Type = obj.TYPE_REG
- p.To.Reg = REGSP
- p.Spadj = -c.autosize
+ if aoffset > 0 {
+ if aoffset < 16 {
+ log.Fatalf("aoffset too small %d", aoffset)
}
- } else if aoffset <= 0xF0 {
- // small frame, restore LR and update SP in a single MOVD.P instruction.
- // There is no correctness issue to use a single LDP for LR and FP,
- // but the instructions are not pattern matched with the prologue's
- // MOVD.W and MOVD, which may cause performance issue in
- // store-forwarding.
+ adj := aoffset - 16
+ if adj > 0 {
+ if adj < 1<<12 {
+ // ADD $adj, RSP, RSP
+ p.As = AADD
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = int64(adj)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = REGSP
+ } else {
+ // Put frame size in a separate register and
+ // add it in with a single instruction,
+ // so we never have a partial frame during
+ // the epilog. See issue 73259.
- // MOVD -8(RSP), R29
- p.As = AMOVD
- p.From.Type = obj.TYPE_MEM
- p.From.Reg = REGSP
- p.From.Offset = -8
- p.To.Type = obj.TYPE_REG
- p.To.Reg = REGFP
- p = obj.Appendp(p, c.newprog)
+ // MOVD $adj, REGTMP
+ p.As = AMOVD
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = int64(adj)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = REGTMP
+ // ADD REGTMP, RSP, RSP
+ p = obj.Appendp(p, c.newprog)
+ p.As = AADD
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = REGSP
+ }
+ p.Spadj = -adj
+ }
- // MOVD.P offset(RSP), R30
- p.As = AMOVD
- p.From.Type = obj.TYPE_MEM
- p.Scond = C_XPOST
- p.From.Offset = int64(aoffset)
- p.From.Reg = REGSP
- p.To.Type = obj.TYPE_REG
- p.To.Reg = REGLINK
- p.Spadj = -aoffset
- } else {
- // LDP -8(RSP), (R29, R30)
+ // Pop LR+FP.
+ // LDP.P 16(RSP), (R29, R30)
+ if p.As != obj.ARET {
+ p = obj.Appendp(p, c.newprog)
+ }
p.As = ALDP
p.From.Type = obj.TYPE_MEM
- p.From.Offset = -8
p.From.Reg = REGSP
+ p.From.Offset = 16
+ p.Scond = C_XPOST
p.To.Type = obj.TYPE_REGREG
p.To.Reg = REGFP
p.To.Offset = REGLINK
-
- if aoffset < 1<<12 {
- // ADD $aoffset, RSP, RSP
- q = newprog()
- q.As = AADD
- q.From.Type = obj.TYPE_CONST
- q.From.Offset = int64(aoffset)
- q.To.Type = obj.TYPE_REG
- q.To.Reg = REGSP
- q.Spadj = -aoffset
- q.Pos = p.Pos
- q.Link = p.Link
- p.Link = q
- p = q
- } else {
- // Put frame size in a separate register and
- // add it in with a single instruction,
- // so we never have a partial frame during
- // the epilog. See issue 73259.
-
- // MOVD $aoffset, REGTMP
- q = newprog()
- q.As = AMOVD
- q.From.Type = obj.TYPE_CONST
- q.From.Offset = int64(aoffset)
- q.To.Type = obj.TYPE_REG
- q.To.Reg = REGTMP
- q.Pos = p.Pos
- q.Link = p.Link
- p.Link = q
- p = q
- // ADD REGTMP, RSP, RSP
- q = newprog()
- q.As = AADD
- q.From.Type = obj.TYPE_REG
- q.From.Reg = REGTMP
- q.To.Type = obj.TYPE_REG
- q.To.Reg = REGSP
- q.Spadj = -aoffset
- q.Pos = p.Pos
- q.Link = p.Link
- p.Link = q
- p = q
- }
+ p.Spadj = -16
}
// If enabled, this code emits 'MOV PC, R27' before every 'MOV LR, PC',
@@ -868,10 +765,11 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
p.From.Type = obj.TYPE_REG
p.From.Reg = REGLINK
} else {
- /* MOVD (RSP), Rd */
+ /* MOVD framesize-8(RSP), Rd */
p.As = AMOVD
p.From.Type = obj.TYPE_MEM
p.From.Reg = REGSP
+ p.From.Offset = int64(c.autosize - 8)
}
}
if p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP && p.Spadj == 0 {
@@ -906,6 +804,12 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
p.From.Reg = int16(REG_LSL + r + (shift&7)<<5)
p.From.Offset = 0
}
+ if p.To.Type == obj.TYPE_MEM && p.To.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) {
+ p.Spadj += int32(-p.To.Offset)
+ }
+ if p.From.Type == obj.TYPE_MEM && p.From.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) {
+ p.Spadj += int32(-p.From.Offset)
+ }
}
}