From 38b26f29f1f97d24afc852b9f4eee829341ee682 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Mon, 22 Sep 2025 10:57:29 -0400 Subject: cmd/compile: remove stores to unread parameters Currently, we remove stores to local variables that are not read. We don't do that for arguments. But arguments and locals are essentially the same. Arguments are passed by value, and are not expected to be read in the caller's frame. So we can remove the writes to them as well. One exception is the cgo_unsafe_arg directive, which makes all the arguments effectively address-taken. cgo_unsafe_arg implies ABI0, so we just skip ABI0 functions' arguments. Cherry-picked from the dev.simd branch. This CL is not necessarily SIMD specific. Apply early to reduce risk. Change-Id: I8999fc50da6a87f22c1ec23e9a0c15483b6f7df8 Reviewed-on: https://go-review.googlesource.com/c/go/+/705815 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase Reviewed-by: Junyang Shao Reviewed-on: https://go-review.googlesource.com/c/go/+/708865 --- src/runtime/testdata/testprog/badtraceback.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/runtime/testdata') diff --git a/src/runtime/testdata/testprog/badtraceback.go b/src/runtime/testdata/testprog/badtraceback.go index 09aa2b877e..455118a543 100644 --- a/src/runtime/testdata/testprog/badtraceback.go +++ b/src/runtime/testdata/testprog/badtraceback.go @@ -44,6 +44,8 @@ func badLR2(arg int) { lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff)) *lrPtr = 0xbad + runtime.KeepAlive(lrPtr) // prevent dead store elimination + // Print a backtrace. This should include diagnostics for the // bad return PC and a hex dump. 
panic("backtrace") -- cgit v1.3-6-g1900 From ac2ec82172799b88c057bb9ded6fe24e7909e860 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Fri, 3 Oct 2025 16:23:10 +0000 Subject: runtime: bump thread count slack for TestReadMetricsSched This test is *still* flaky, but it appears to be just mayMoreStackPreempt and the thread count *occasionally* exceeds the original (and arbitrary) thread count slack by exactly 1. Bump the thread count slack by one. We can investigate further and bump it again if it continues to be a problem. Fixes #75664. Change-Id: I29c922bba6d2cc99a8c3bf5e04cc512d0694f7fa Reviewed-on: https://go-review.googlesource.com/c/go/+/708868 Reviewed-by: Michael Pratt Auto-Submit: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/runtime/testdata/testprog/schedmetrics.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src/runtime/testdata') diff --git a/src/runtime/testdata/testprog/schedmetrics.go b/src/runtime/testdata/testprog/schedmetrics.go index 6d3f68a848..bc0906330f 100644 --- a/src/runtime/testdata/testprog/schedmetrics.go +++ b/src/runtime/testdata/testprog/schedmetrics.go @@ -84,7 +84,12 @@ func SchedMetrics() { // threadsSlack is the maximum number of threads left over // from the runtime (sysmon, the template thread, etc.) - const threadsSlack = 4 + // Certain build modes may also cause the creation of additional + // threads through frequent scheduling, like mayMoreStackPreempt. + // A slack of 5 is arbitrary but appears to be enough to cover + // the leftovers plus any inflation from scheduling-heavy build + // modes. + const threadsSlack = 5 // Make sure GC isn't running, since GC workers interfere with // expected counts. 
-- cgit v1.3-6-g1900 From 719dfcf8a8478d70360bf3c34c0e920be7b32994 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Sat, 17 May 2025 15:05:56 -0700 Subject: cmd/compile: redo arm64 LR/FP save and restore Instead of storing LR (the return address) at 0(SP) and the FP (parent's frame pointer) at -8(SP), store them at framesize-8(SP) and framesize-16(SP), respectively. We push and pop data onto the stack such that we're never accessing anything below SP. The prolog/epilog lengths are unchanged (3 insns for a typical prolog, 2 for a typical epilog). We use 8 bytes more per frame. Typical prologue: STP.W (FP, LR), -16(SP) MOVD SP, FP SUB $C, SP Typical epilogue: ADD $C, SP LDP.P 16(SP), (FP, LR) RET The previous word where we stored LR, at 0(SP), is now unused. We could repurpose that slot for storing a local variable. The new prolog and epilog instructions are recognized by libunwind, so pc-sampling tools like perf should now be accurate. (TODO: except maybe after the first RET instruction? Have to look into that.) 
Update #73753 (fixes, for arm64) Update #57302 (Quim thinks this will help on that issue) Change-Id: I4800036a9a9a08aaaf35d9f99de79a36cf37ebb8 Reviewed-on: https://go-review.googlesource.com/c/go/+/674615 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Reviewed-by: Keith Randall --- src/cmd/compile/abi-internal.md | 12 +- src/cmd/compile/internal/arm64/ggen.go | 10 +- src/cmd/compile/internal/arm64/ssa.go | 2 +- src/cmd/compile/internal/ssagen/pgen.go | 6 + src/cmd/compile/internal/ssagen/ssa.go | 3 +- src/cmd/internal/obj/arm64/asm7.go | 12 +- src/cmd/internal/obj/arm64/obj7.go | 326 +++++++++----------------- src/cmd/link/internal/amd64/obj.go | 19 +- src/cmd/link/internal/arm64/obj.go | 23 +- src/cmd/link/internal/ld/dwarf.go | 7 +- src/cmd/link/internal/ld/lib.go | 4 + src/cmd/link/internal/ld/stackcheck.go | 5 - src/cmd/link/internal/x86/obj.go | 15 +- src/runtime/asm_arm64.s | 81 +++++-- src/runtime/mkpreempt.go | 20 +- src/runtime/panic.go | 8 +- src/runtime/preempt_arm64.s | 15 +- src/runtime/race_arm64.s | 17 +- src/runtime/signal_arm64.go | 16 +- src/runtime/stack.go | 20 +- src/runtime/testdata/testprog/badtraceback.go | 5 + src/runtime/traceback.go | 30 +-- test/nosplit.go | 8 +- 23 files changed, 303 insertions(+), 361 deletions(-) (limited to 'src/runtime/testdata') diff --git a/src/cmd/compile/abi-internal.md b/src/cmd/compile/abi-internal.md index eae230dc07..490e1affb7 100644 --- a/src/cmd/compile/abi-internal.md +++ b/src/cmd/compile/abi-internal.md @@ -576,19 +576,19 @@ A function's stack frame, after the frame is created, is laid out as follows: +------------------------------+ + | return PC | + | frame pointer on entry | ← R29 points to | ... locals ... | | ... outgoing arguments ... | - | return PC | ← RSP points to - | frame pointer on entry | + | unused word | ← RSP points to +------------------------------+ ↓ lower addresses The "return PC" is loaded to the link register, R30, as part of the arm64 `CALL` operation. 
-On entry, a function subtracts from RSP to open its stack frame, and -saves the values of R30 and R29 at the bottom of the frame. -Specifically, R30 is saved at 0(RSP) and R29 is saved at -8(RSP), -after RSP is updated. +On entry, a function pushes R30 (the return address) and R29 +(the caller's frame pointer) onto the bottom of the stack. It then +subtracts a constant from RSP to open its stack frame. A leaf function that does not require any stack space may omit the saved R30 and R29. diff --git a/src/cmd/compile/internal/arm64/ggen.go b/src/cmd/compile/internal/arm64/ggen.go index 1402746700..6ba56b992e 100644 --- a/src/cmd/compile/internal/arm64/ggen.go +++ b/src/cmd/compile/internal/arm64/ggen.go @@ -11,10 +11,12 @@ import ( ) func padframe(frame int64) int64 { - // arm64 requires that the frame size (not counting saved FP&LR) - // be 16 bytes aligned. If not, pad it. - if frame%16 != 0 { - frame += 16 - (frame % 16) + // arm64 requires frame sizes here that are 8 mod 16. + // With the additional (unused) slot at the bottom of the frame, + // that makes an aligned 16 byte frame. + // Adding a save region for LR+FP does not change the alignment. + if frame != 0 { + frame += (-(frame + 8)) & 15 } return frame } diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go index 7bc0e536e9..9f79a740c6 100644 --- a/src/cmd/compile/internal/arm64/ssa.go +++ b/src/cmd/compile/internal/arm64/ssa.go @@ -221,7 +221,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { for i := 0; i < len(args); i++ { a := args[i] - // Offset by size of the saved LR slot. + // Offset by size of the unused slot before start of args. addr := ssagen.SpillSlotAddr(a, arm64.REGSP, base.Ctxt.Arch.FixedFrameSize) // Look for double-register operations if we can. 
if i < len(args)-1 { diff --git a/src/cmd/compile/internal/ssagen/pgen.go b/src/cmd/compile/internal/ssagen/pgen.go index 0a2010363f..f0776172b9 100644 --- a/src/cmd/compile/internal/ssagen/pgen.go +++ b/src/cmd/compile/internal/ssagen/pgen.go @@ -393,10 +393,16 @@ func StackOffset(slot ssa.LocalSlot) int32 { case ir.PAUTO: off = n.FrameOffset() if base.Ctxt.Arch.FixedFrameSize == 0 { + // x86 return address off -= int64(types.PtrSize) } if buildcfg.FramePointerEnabled { + // frame pointer off -= int64(types.PtrSize) + if buildcfg.GOARCH == "arm64" { + // arm64 return address also + off -= int64(types.PtrSize) + } } } return int32(off + slot.Off) diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 1e2159579d..107447f04c 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -7150,6 +7150,7 @@ func defframe(s *State, e *ssafn, f *ssa.Func) { // Insert code to zero ambiguously live variables so that the // garbage collector only sees initialized values when it // looks for pointers. + // Note: lo/hi are offsets from varp and will be negative. var lo, hi int64 // Opaque state for backend to use. Current backends use it to @@ -7157,7 +7158,7 @@ func defframe(s *State, e *ssafn, f *ssa.Func) { var state uint32 // Iterate through declarations. Autos are sorted in decreasing - // frame offset order. + // frame offset order (least negative to most negative). 
for _, n := range e.curfn.Dcl { if !n.Needzero() { continue diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 743d09a319..281d705a3e 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -51,7 +51,6 @@ type ctxt7 struct { blitrl *obj.Prog elitrl *obj.Prog autosize int32 - extrasize int32 instoffset int64 pc int64 pool struct { @@ -1122,8 +1121,7 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { ctxt.Diag("arm64 ops not initialized, call arm64.buildop first") } - c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset & 0xffffffff), extrasize: int32(p.To.Offset >> 32)} - p.To.Offset &= 0xffffffff // extrasize is no longer needed + c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset)} // Process literal pool and allocate initial program counter for each Prog, before // generating branch veneers. @@ -2119,8 +2117,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int { // a.Offset is still relative to pseudo-SP. a.Reg = obj.REG_NONE } - // The frame top 8 or 16 bytes are for FP - c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize) + // The frame top 16 bytes are for LR/FP + c.instoffset = int64(c.autosize) + a.Offset - extrasize return autoclass(c.instoffset) case obj.NAME_PARAM: @@ -2180,8 +2178,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int { // a.Offset is still relative to pseudo-SP. 
a.Reg = obj.REG_NONE } - // The frame top 8 or 16 bytes are for FP - c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize) + // The frame top 16 bytes are for LR/FP + c.instoffset = int64(c.autosize) + a.Offset - extrasize case obj.NAME_PARAM: if a.Reg == REGSP { diff --git a/src/cmd/internal/obj/arm64/obj7.go b/src/cmd/internal/obj/arm64/obj7.go index 2583e46354..a697426145 100644 --- a/src/cmd/internal/obj/arm64/obj7.go +++ b/src/cmd/internal/obj/arm64/obj7.go @@ -36,7 +36,6 @@ import ( "cmd/internal/src" "cmd/internal/sys" "internal/abi" - "internal/buildcfg" "log" "math" ) @@ -472,6 +471,8 @@ func (c *ctxt7) rewriteToUseGot(p *obj.Prog) { obj.Nopout(p) } +const extrasize = 16 // space needed in the frame for LR+FP + func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { if cursym.Func().Text == nil || cursym.Func().Text.Link == nil { return @@ -521,33 +522,26 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { c.autosize = int32(textstksiz) if p.Mark&LEAF != 0 && c.autosize == 0 { - // A leaf function with no locals has no frame. + // A leaf function with no locals needs no frame. p.From.Sym.Set(obj.AttrNoFrame, true) } if !p.From.Sym.NoFrame() { // If there is a stack frame at all, it includes - // space to save the LR. + // space for the (now unused) word at [SP:SP+8]. c.autosize += 8 } + // Round up to a multiple of 16. + c.autosize += (-c.autosize) & 15 + if c.autosize != 0 { - extrasize := int32(0) - if c.autosize%16 == 8 { - // Allocate extra 8 bytes on the frame top to save FP - extrasize = 8 - } else if c.autosize&(16-1) == 0 { - // Allocate extra 16 bytes to save FP for the old frame whose size is 8 mod 16 - extrasize = 16 - } else { - c.ctxt.Diag("%v: unaligned frame size %d - must be 16 aligned", p, c.autosize-8) - } + // Allocate an extra 16 bytes at the top of the frame + // to save LR+FP. 
c.autosize += extrasize c.cursym.Func().Locals += extrasize - // low 32 bits for autosize - // high 32 bits for extrasize - p.To.Offset = int64(c.autosize) | int64(extrasize)<<32 + p.To.Offset = int64(c.autosize) } else { // NOFRAME p.To.Offset = 0 @@ -580,120 +574,72 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { var prologueEnd *obj.Prog aoffset := c.autosize - if aoffset > 0xf0 { - // MOVD.W offset variant range is -0x100 to 0xf8, SP should be 16-byte aligned. - // so the maximum aoffset value is 0xf0. - aoffset = 0xf0 + if aoffset < 16 { + log.Fatalf("aoffset too small %d", aoffset) } - // Frame is non-empty. Make sure to save link register, even if - // it is a leaf function, so that traceback works. q = p - if c.autosize > aoffset { - // Frame size is too large for a MOVD.W instruction. Store the frame pointer - // register and link register before decrementing SP, so if a signal comes - // during the execution of the function prologue, the traceback code will - // not see a half-updated stack frame. - - // SUB $autosize, RSP, R20 - q1 = obj.Appendp(q, c.newprog) - q1.Pos = p.Pos - q1.As = ASUB - q1.From.Type = obj.TYPE_CONST - q1.From.Offset = int64(c.autosize) - q1.Reg = REGSP - q1.To.Type = obj.TYPE_REG - q1.To.Reg = REG_R20 - - prologueEnd = q1 - - // STP (R29, R30), -8(R20) - q1 = obj.Appendp(q1, c.newprog) - q1.Pos = p.Pos - q1.As = ASTP - q1.From.Type = obj.TYPE_REGREG - q1.From.Reg = REGFP - q1.From.Offset = REGLINK - q1.To.Type = obj.TYPE_MEM - q1.To.Reg = REG_R20 - q1.To.Offset = -8 - - // This is not async preemptible, as if we open a frame - // at the current SP, it will clobber the saved LR. 
- q1 = c.ctxt.StartUnsafePoint(q1, c.newprog) - - // MOVD R20, RSP - q1 = obj.Appendp(q1, c.newprog) - q1.Pos = p.Pos - q1.As = AMOVD - q1.From.Type = obj.TYPE_REG - q1.From.Reg = REG_R20 - q1.To.Type = obj.TYPE_REG - q1.To.Reg = REGSP - q1.Spadj = c.autosize - - q1 = c.ctxt.EndUnsafePoint(q1, c.newprog, -1) - - if buildcfg.GOOS == "ios" { - // iOS does not support SA_ONSTACK. We will run the signal handler - // on the G stack. If we write below SP, it may be clobbered by - // the signal handler. So we save FP and LR after decrementing SP. - // STP (R29, R30), -8(RSP) + + // Store return address and frame pointer at the top of the stack frame. + // STP.W (R29, R30), -16(SP) + q1 = obj.Appendp(q, c.newprog) + q1.Pos = p.Pos + q1.As = ASTP + q1.From.Type = obj.TYPE_REGREG + q1.From.Reg = REGFP + q1.From.Offset = REGLINK + q1.To.Type = obj.TYPE_MEM + q1.To.Reg = REG_RSP + q1.To.Offset = -16 + q1.Scond = C_XPRE + + prologueEnd = q1 + + // Update frame pointer + q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = AMOVD + q1.From.Type = obj.TYPE_REG + q1.From.Reg = REGSP + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REGFP + + // Allocate additional frame space. + adj := aoffset - 16 + if adj > 0 { + // SUB $autosize-16, RSP + if adj < 1<<12 { + q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = ASUB + q1.From.Type = obj.TYPE_CONST + q1.From.Offset = int64(adj) + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REGSP + } else { + // Constant too big for atomic subtract. + // Materialize in tmp register first. 
+ q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = AMOVD + q1.From.Type = obj.TYPE_CONST + q1.From.Offset = int64(adj) + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REGTMP + q1 = obj.Appendp(q1, c.newprog) q1.Pos = p.Pos - q1.As = ASTP - q1.From.Type = obj.TYPE_REGREG - q1.From.Reg = REGFP - q1.From.Offset = REGLINK - q1.To.Type = obj.TYPE_MEM + q1.As = ASUB + q1.From.Type = obj.TYPE_REG + q1.From.Reg = REGTMP + q1.To.Type = obj.TYPE_REG q1.To.Reg = REGSP - q1.To.Offset = -8 } - } else { - // small frame, update SP and save LR in a single MOVD.W instruction. - // So if a signal comes during the execution of the function prologue, - // the traceback code will not see a half-updated stack frame. - // Also, on Linux, in a cgo binary we may get a SIGSETXID signal - // early on before the signal stack is set, as glibc doesn't allow - // us to block SIGSETXID. So it is important that we don't write below - // the SP until the signal stack is set. - // Luckily, all the functions from thread entry to setting the signal - // stack have small frames. - q1 = obj.Appendp(q, c.newprog) - q1.As = AMOVD - q1.Pos = p.Pos - q1.From.Type = obj.TYPE_REG - q1.From.Reg = REGLINK - q1.To.Type = obj.TYPE_MEM - q1.Scond = C_XPRE - q1.To.Offset = int64(-aoffset) - q1.To.Reg = REGSP - q1.Spadj = aoffset - - prologueEnd = q1 - - // Frame pointer. 
- q1 = obj.Appendp(q1, c.newprog) - q1.Pos = p.Pos - q1.As = AMOVD - q1.From.Type = obj.TYPE_REG - q1.From.Reg = REGFP - q1.To.Type = obj.TYPE_MEM - q1.To.Reg = REGSP - q1.To.Offset = -8 + q1.Spadj = adj } prologueEnd.Pos = prologueEnd.Pos.WithXlogue(src.PosPrologueEnd) - q1 = obj.Appendp(q1, c.newprog) - q1.Pos = p.Pos - q1.As = ASUB - q1.From.Type = obj.TYPE_CONST - q1.From.Offset = 8 - q1.Reg = REGSP - q1.To.Type = obj.TYPE_REG - q1.To.Reg = REGFP - case obj.ARET: nocache(p) if p.From.Type == obj.TYPE_CONST { @@ -707,105 +653,56 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { } p.To = obj.Addr{} aoffset := c.autosize - if c.cursym.Func().Text.Mark&LEAF != 0 { - if aoffset != 0 { - // Restore frame pointer. - // ADD $framesize-8, RSP, R29 - p.As = AADD - p.From.Type = obj.TYPE_CONST - p.From.Offset = int64(c.autosize) - 8 - p.Reg = REGSP - p.To.Type = obj.TYPE_REG - p.To.Reg = REGFP - - // Pop stack frame. - // ADD $framesize, RSP, RSP - p = obj.Appendp(p, c.newprog) - p.As = AADD - p.From.Type = obj.TYPE_CONST - p.From.Offset = int64(c.autosize) - p.To.Type = obj.TYPE_REG - p.To.Reg = REGSP - p.Spadj = -c.autosize + if aoffset > 0 { + if aoffset < 16 { + log.Fatalf("aoffset too small %d", aoffset) + } + adj := aoffset - 16 + if adj > 0 { + if adj < 1<<12 { + // ADD $adj, RSP, RSP + p.As = AADD + p.From.Type = obj.TYPE_CONST + p.From.Offset = int64(adj) + p.To.Type = obj.TYPE_REG + p.To.Reg = REGSP + } else { + // Put frame size in a separate register and + // add it in with a single instruction, + // so we never have a partial frame during + // the epilog. See issue 73259. 
+ + // MOVD $adj, REGTMP + p.As = AMOVD + p.From.Type = obj.TYPE_CONST + p.From.Offset = int64(adj) + p.To.Type = obj.TYPE_REG + p.To.Reg = REGTMP + // ADD REGTMP, RSP, RSP + p = obj.Appendp(p, c.newprog) + p.As = AADD + p.From.Type = obj.TYPE_REG + p.From.Reg = REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = REGSP + } + p.Spadj = -adj } - } else if aoffset <= 0xF0 { - // small frame, restore LR and update SP in a single MOVD.P instruction. - // There is no correctness issue to use a single LDP for LR and FP, - // but the instructions are not pattern matched with the prologue's - // MOVD.W and MOVD, which may cause performance issue in - // store-forwarding. - - // MOVD -8(RSP), R29 - p.As = AMOVD - p.From.Type = obj.TYPE_MEM - p.From.Reg = REGSP - p.From.Offset = -8 - p.To.Type = obj.TYPE_REG - p.To.Reg = REGFP - p = obj.Appendp(p, c.newprog) - // MOVD.P offset(RSP), R30 - p.As = AMOVD - p.From.Type = obj.TYPE_MEM - p.Scond = C_XPOST - p.From.Offset = int64(aoffset) - p.From.Reg = REGSP - p.To.Type = obj.TYPE_REG - p.To.Reg = REGLINK - p.Spadj = -aoffset - } else { - // LDP -8(RSP), (R29, R30) + // Pop LR+FP. + // LDP.P 16(RSP), (R29, R30) + if p.As != obj.ARET { + p = obj.Appendp(p, c.newprog) + } p.As = ALDP p.From.Type = obj.TYPE_MEM - p.From.Offset = -8 p.From.Reg = REGSP + p.From.Offset = 16 + p.Scond = C_XPOST p.To.Type = obj.TYPE_REGREG p.To.Reg = REGFP p.To.Offset = REGLINK - - if aoffset < 1<<12 { - // ADD $aoffset, RSP, RSP - q = newprog() - q.As = AADD - q.From.Type = obj.TYPE_CONST - q.From.Offset = int64(aoffset) - q.To.Type = obj.TYPE_REG - q.To.Reg = REGSP - q.Spadj = -aoffset - q.Pos = p.Pos - q.Link = p.Link - p.Link = q - p = q - } else { - // Put frame size in a separate register and - // add it in with a single instruction, - // so we never have a partial frame during - // the epilog. See issue 73259. 
- - // MOVD $aoffset, REGTMP - q = newprog() - q.As = AMOVD - q.From.Type = obj.TYPE_CONST - q.From.Offset = int64(aoffset) - q.To.Type = obj.TYPE_REG - q.To.Reg = REGTMP - q.Pos = p.Pos - q.Link = p.Link - p.Link = q - p = q - // ADD REGTMP, RSP, RSP - q = newprog() - q.As = AADD - q.From.Type = obj.TYPE_REG - q.From.Reg = REGTMP - q.To.Type = obj.TYPE_REG - q.To.Reg = REGSP - q.Spadj = -aoffset - q.Pos = p.Pos - q.Link = p.Link - p.Link = q - p = q - } + p.Spadj = -16 } // If enabled, this code emits 'MOV PC, R27' before every 'MOV LR, PC', @@ -868,10 +765,11 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.From.Type = obj.TYPE_REG p.From.Reg = REGLINK } else { - /* MOVD (RSP), Rd */ + /* MOVD framesize-8(RSP), Rd */ p.As = AMOVD p.From.Type = obj.TYPE_MEM p.From.Reg = REGSP + p.From.Offset = int64(c.autosize - 8) } } if p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP && p.Spadj == 0 { @@ -906,6 +804,12 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.From.Reg = int16(REG_LSL + r + (shift&7)<<5) p.From.Offset = 0 } + if p.To.Type == obj.TYPE_MEM && p.To.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) { + p.Spadj += int32(-p.To.Offset) + } + if p.From.Type == obj.TYPE_MEM && p.From.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) { + p.Spadj += int32(-p.From.Offset) + } } } diff --git a/src/cmd/link/internal/amd64/obj.go b/src/cmd/link/internal/amd64/obj.go index 3a6141b909..761496549f 100644 --- a/src/cmd/link/internal/amd64/obj.go +++ b/src/cmd/link/internal/amd64/obj.go @@ -51,15 +51,16 @@ func Init() (*sys.Arch, ld.Arch) { Plan9Magic: uint32(4*26*26 + 7), Plan9_64Bit: true, - Adddynrel: adddynrel, - Archinit: archinit, - Archreloc: archreloc, - Archrelocvariant: archrelocvariant, - Gentext: gentext, - Machoreloc1: machoreloc1, - MachorelocSize: 8, - PEreloc1: pereloc1, - TLSIEtoLE: tlsIEtoLE, + Adddynrel: adddynrel, + Archinit: archinit, + Archreloc: archreloc, + 
Archrelocvariant: archrelocvariant, + Gentext: gentext, + Machoreloc1: machoreloc1, + MachorelocSize: 8, + PEreloc1: pereloc1, + TLSIEtoLE: tlsIEtoLE, + ReturnAddressAtTopOfFrame: true, ELF: ld.ELFArch{ Linuxdynld: "/lib64/ld-linux-x86-64.so.2", diff --git a/src/cmd/link/internal/arm64/obj.go b/src/cmd/link/internal/arm64/obj.go index 3d358155ba..e1e4ade818 100644 --- a/src/cmd/link/internal/arm64/obj.go +++ b/src/cmd/link/internal/arm64/obj.go @@ -47,17 +47,18 @@ func Init() (*sys.Arch, ld.Arch) { Dwarfreglr: dwarfRegLR, TrampLimit: 0x7c00000, // 26-bit signed offset * 4, leave room for PLT etc. - Adddynrel: adddynrel, - Archinit: archinit, - Archreloc: archreloc, - Archrelocvariant: archrelocvariant, - Extreloc: extreloc, - Gentext: gentext, - GenSymsLate: gensymlate, - Machoreloc1: machoreloc1, - MachorelocSize: 8, - PEreloc1: pereloc1, - Trampoline: trampoline, + Adddynrel: adddynrel, + Archinit: archinit, + Archreloc: archreloc, + Archrelocvariant: archrelocvariant, + Extreloc: extreloc, + Gentext: gentext, + GenSymsLate: gensymlate, + Machoreloc1: machoreloc1, + MachorelocSize: 8, + PEreloc1: pereloc1, + Trampoline: trampoline, + ReturnAddressAtTopOfFrame: true, ELF: ld.ELFArch{ Androiddynld: "/system/bin/linker64", diff --git a/src/cmd/link/internal/ld/dwarf.go b/src/cmd/link/internal/ld/dwarf.go index 0003938ef2..c4d12a5488 100644 --- a/src/cmd/link/internal/ld/dwarf.go +++ b/src/cmd/link/internal/ld/dwarf.go @@ -1544,9 +1544,14 @@ func (d *dwctxt) writeframes(fs loader.Sym) dwarfSecInfo { if pcsp.Value > 0 { // The return address is preserved at (CFA-frame_size) // after a stack frame has been allocated. + off := -spdelta + if thearch.ReturnAddressAtTopOfFrame { + // Except arm64, which has it at the top of frame. 
+ off = -int64(d.arch.PtrSize) + } deltaBuf = append(deltaBuf, dwarf.DW_CFA_offset_extended_sf) deltaBuf = dwarf.AppendUleb128(deltaBuf, uint64(thearch.Dwarfreglr)) - deltaBuf = dwarf.AppendSleb128(deltaBuf, -spdelta/dataAlignmentFactor) + deltaBuf = dwarf.AppendSleb128(deltaBuf, off/dataAlignmentFactor) } else { // The return address is restored into the link register // when a stack frame has been de-allocated. diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go index 2c861129b5..5f5ebfc1d9 100644 --- a/src/cmd/link/internal/ld/lib.go +++ b/src/cmd/link/internal/ld/lib.go @@ -263,6 +263,10 @@ type Arch struct { // optional override for assignAddress AssignAddress func(ldr *loader.Loader, sect *sym.Section, n int, s loader.Sym, va uint64, isTramp bool) (*sym.Section, int, uint64) + // Reports whether the return address is stored at the top (highest address) + // of the stack frame. + ReturnAddressAtTopOfFrame bool + // ELF specific information. ELF ELFArch } diff --git a/src/cmd/link/internal/ld/stackcheck.go b/src/cmd/link/internal/ld/stackcheck.go index 98e7edaeb1..14cd3a2238 100644 --- a/src/cmd/link/internal/ld/stackcheck.go +++ b/src/cmd/link/internal/ld/stackcheck.go @@ -9,7 +9,6 @@ import ( "cmd/internal/objabi" "cmd/link/internal/loader" "fmt" - "internal/buildcfg" "sort" "strings" ) @@ -62,10 +61,6 @@ func (ctxt *Link) doStackCheck() { // that there are at least StackLimit bytes available below SP // when morestack returns. limit := objabi.StackNosplit(*flagRace) - sc.callSize - if buildcfg.GOARCH == "arm64" { - // Need an extra 8 bytes below SP to save FP. - limit -= 8 - } // Compute stack heights without any back-tracking information. 
// This will almost certainly succeed and we can simply diff --git a/src/cmd/link/internal/x86/obj.go b/src/cmd/link/internal/x86/obj.go index 4336f01ea3..a4885fde8f 100644 --- a/src/cmd/link/internal/x86/obj.go +++ b/src/cmd/link/internal/x86/obj.go @@ -50,13 +50,14 @@ func Init() (*sys.Arch, ld.Arch) { Plan9Magic: uint32(4*11*11 + 7), - Adddynrel: adddynrel, - Archinit: archinit, - Archreloc: archreloc, - Archrelocvariant: archrelocvariant, - Gentext: gentext, - Machoreloc1: machoreloc1, - PEreloc1: pereloc1, + Adddynrel: adddynrel, + Archinit: archinit, + Archreloc: archreloc, + Archrelocvariant: archrelocvariant, + Gentext: gentext, + Machoreloc1: machoreloc1, + PEreloc1: pereloc1, + ReturnAddressAtTopOfFrame: true, ELF: ld.ELFArch{ Linuxdynld: "/lib/ld-linux.so.2", diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index a0e82ec830..aa49a27a75 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -50,9 +50,7 @@ TEXT _rt0_arm64_lib(SB),NOSPLIT,$184 CBZ R4, nocgo MOVD $_rt0_arm64_lib_go(SB), R0 MOVD $0, R1 - SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL (R4) - ADD $16, RSP B restore nocgo: @@ -371,7 +369,6 @@ switch: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP - MOVD (g_sched+gobuf_bp)(g), R29 MOVD $0, (g_sched+gobuf_sp)(g) MOVD $0, (g_sched+gobuf_bp)(g) RET @@ -381,8 +378,8 @@ noswitch: // Using a tail call here cleans up tracebacks since we won't stop // at an intermediate systemstack. MOVD 0(R26), R3 // code pointer - MOVD.P 16(RSP), R30 // restore LR - SUB $8, RSP, R29 // restore FP + ADD $16, RSP + LDP.P 16(RSP), (R29,R30) // restore FP, LR B (R3) // func switchToCrashStack0(fn func()) @@ -1051,7 +1048,7 @@ again: // Smashes R0. 
TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0 MOVD $runtime·systemstack_switch(SB), R0 - ADD $8, R0 // get past prologue + ADD $12, R0 // get past prologue MOVD R0, (g_sched+gobuf_pc)(g) MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) @@ -1069,9 +1066,7 @@ TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0 TEXT ·asmcgocall_no_g(SB),NOSPLIT,$0-16 MOVD fn+0(FP), R1 MOVD arg+8(FP), R0 - SUB $16, RSP // skip over saved frame pointer below RSP BL (R1) - ADD $16, RSP // skip over saved frame pointer below RSP RET // func asmcgocall(fn, arg unsafe.Pointer) int32 @@ -1236,9 +1231,9 @@ havem: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -48(R4) + MOVD R5, -8(R4) MOVD (g_sched+gobuf_bp)(g), R5 - MOVD R5, -56(R4) + MOVD R5, -16(R4) // Gather our arguments into registers. MOVD fn+0(FP), R1 MOVD frame+8(FP), R2 @@ -1252,7 +1247,7 @@ havem: CALL (R0) // indirect call to bypass nosplit check. We're on a different stack now. // Restore g->sched (== m->curg->sched) from saved values. - MOVD 0(RSP), R5 + MOVD 40(RSP), R5 MOVD R5, (g_sched+gobuf_pc)(g) MOVD RSP, R4 ADD $48, R4, R4 @@ -1490,10 +1485,57 @@ GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below // // This is ABIInternal because Go code injects its PC directly into new // goroutine stacks. +// +// State before debugger starts doing anything: +// | current | +// | stack | +// +-------------+ <- SP = origSP +// stopped executing at PC = origPC +// some values are in LR (origLR) and FP (origFP) +// +// After debugger has done steps 1-6 above: +// | current | +// | stack | +// +-------------+ <- origSP +// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.) 
+// +-------------+ +// | origLR | +// +-------------+ <- SP +// | ----- | +// +-------------+ +// | argsize | +// +-------------+ +// LR = origPC, PC = debugCallV2 +// +// debugCallV2 then modifies the stack up to the "good" label: +// | current | +// | stack | +// +-------------+ <- origSP +// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.) +// +-------------+ +// | origLR | +// +-------------+ <- where debugger left SP +// | origPC | +// +-------------+ +// | origFP | +// +-------------+ <- FP = SP + 256 +// | saved | +// | registers | +// | (224 bytes) | +// +-------------+ <- SP + 32 +// | space for | +// | outargs | +// +-------------+ <- SP + 8 +// | argsize | +// +-------------+ <- SP + TEXT runtime·debugCallV2(SB),NOSPLIT|NOFRAME,$0-0 - STP (R29, R30), -280(RSP) - SUB $272, RSP, RSP - SUB $8, RSP, R29 + MOVD R30, -8(RSP) // save origPC + MOVD -16(RSP), R30 // save argsize in R30 temporarily + MOVD.W R29, -16(RSP) // push origFP + MOVD RSP, R29 // frame pointer chain now set up + SUB $256, RSP, RSP // allocate frame + MOVD R30, (RSP) // Save argsize on the stack // Save all registers that may contain pointers so they can be // conservatively scanned. // @@ -1515,7 +1557,8 @@ TEXT runtime·debugCallV2(SB),NOSPLIT|NOFRAME,$0-0 STP (R0, R1), (4*8)(RSP) // Perform a safe-point check. 
- MOVD R30, 8(RSP) // Caller's PC + MOVD 264(RSP), R0 // origPC + MOVD R0, 8(RSP) CALL runtime·debugCallCheck(SB) MOVD 16(RSP), R0 CBZ R0, good @@ -1559,7 +1602,7 @@ good: CALL runtime·debugCallWrap(SB); \ JMP restore - MOVD 256(RSP), R0 // the argument frame size + MOVD (RSP), R0 // the argument frame size DEBUG_CALL_DISPATCH(debugCall32<>, 32) DEBUG_CALL_DISPATCH(debugCall64<>, 64) DEBUG_CALL_DISPATCH(debugCall128<>, 128) @@ -1607,9 +1650,9 @@ restore: LDP (6*8)(RSP), (R2, R3) LDP (4*8)(RSP), (R0, R1) - LDP -8(RSP), (R29, R27) - ADD $288, RSP, RSP // Add 16 more bytes, see saveSigContext - MOVD -16(RSP), R30 // restore old lr + MOVD 272(RSP), R30 // restore old lr (saved by (*sigctxt).pushCall) + LDP 256(RSP), (R29, R27) // restore old fp, set up resumption address + ADD $288, RSP, RSP // Pop frame, LR+FP, and block pushed by (*sigctxt).pushCall JMP (R27) // runtime.debugCallCheck assumes that functions defined with the diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 769c4ffc5c..9064cae039 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -488,26 +488,18 @@ func genARM64(g *gen) { l.stack += 8 // SP needs 16-byte alignment } - // allocate frame, save PC of interrupted instruction (in LR) - p("MOVD R30, %d(RSP)", -l.stack) + // allocate frame, save PC (in R30), FP (in R29) of interrupted instruction + p("STP.W (R29, R30), -16(RSP)") + p("MOVD RSP, R29") // set up new frame pointer p("SUB $%d, RSP", l.stack) - p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux) - p("SUB $8, RSP, R29") // set up new frame pointer - // On iOS, save the LR again after decrementing SP. We run the - // signal handler on the G stack (as it doesn't support sigaltstack), - // so any writes below SP may be clobbered. 
- p("#ifdef GOOS_ios") - p("MOVD R30, (RSP)") - p("#endif") l.save(g) p("CALL ·asyncPreempt2(SB)") l.restore(g) - p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it - p("MOVD -8(RSP), R29") // restore frame pointer - p("MOVD (RSP), R27") // load PC to REGTMP - p("ADD $%d, RSP", l.stack+16) // pop frame (including the space pushed by sigctxt.pushCall) + p("MOVD %d(RSP), R30", l.stack+16) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it + p("LDP %d(RSP), (R29, R27)", l.stack) // Restore frame pointer. Load PC into regtmp. + p("ADD $%d, RSP", l.stack+32) // pop frame (including the space pushed by sigctxt.pushCall) p("RET (R27)") } diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 8c91c9435a..04b3afe168 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -1379,10 +1379,10 @@ func recovery(gp *g) { // the caller gp.sched.bp = fp - 2*goarch.PtrSize case goarch.IsArm64 != 0: - // on arm64, the architectural bp points one word higher - // than the sp. fp is totally useless to us here, because it - // only gets us to the caller's fp. - gp.sched.bp = sp - goarch.PtrSize + // on arm64, the first two words of the frame are caller's PC + // (the saved LR register) and the caller's BP. + // Coincidentally, the same as amd64. 
+ gp.sched.bp = fp - 2*goarch.PtrSize } gogo(&gp.sched) } diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s index 31ec9d940f..f4248cac25 100644 --- a/src/runtime/preempt_arm64.s +++ b/src/runtime/preempt_arm64.s @@ -4,13 +4,9 @@ #include "textflag.h" TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 - MOVD R30, -496(RSP) + STP.W (R29, R30), -16(RSP) + MOVD RSP, R29 SUB $496, RSP - MOVD R29, -8(RSP) - SUB $8, RSP, R29 - #ifdef GOOS_ios - MOVD R30, (RSP) - #endif STP (R0, R1), 8(RSP) STP (R2, R3), 24(RSP) STP (R4, R5), 40(RSP) @@ -78,8 +74,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 LDP 40(RSP), (R4, R5) LDP 24(RSP), (R2, R3) LDP 8(RSP), (R0, R1) - MOVD 496(RSP), R30 - MOVD -8(RSP), R29 - MOVD (RSP), R27 - ADD $512, RSP + MOVD 512(RSP), R30 + LDP 496(RSP), (R29, R27) + ADD $528, RSP RET (R27) diff --git a/src/runtime/race_arm64.s b/src/runtime/race_arm64.s index 5df650105b..feaa328d4c 100644 --- a/src/runtime/race_arm64.s +++ b/src/runtime/race_arm64.s @@ -397,7 +397,7 @@ TEXT racecallatomic<>(SB), NOSPLIT, $0 // R3 = addr of incoming arg list // Trigger SIGSEGV early. - MOVD 40(RSP), R3 // 1st arg is addr. after two times BL, get it at 40(RSP) + MOVD 72(RSP), R3 // 1st arg is addr. after two small frames (32 bytes each), get it at 72(RSP) MOVB (R3), R13 // segv here if addr is bad // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). MOVD runtime·racearenastart(SB), R10 @@ -417,10 +417,11 @@ racecallatomic_ok: // Addr is within the good range, call the atomic function. load_g MOVD g_racectx(g), R0 // goroutine context - MOVD 16(RSP), R1 // caller pc + MOVD 56(RSP), R1 // caller pc MOVD R9, R2 // pc - ADD $40, RSP, R3 - JMP racecall<>(SB) // does not return + ADD $72, RSP, R3 + BL racecall<>(SB) + RET racecallatomic_ignore: // Addr is outside the good range. // Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op. 
@@ -435,9 +436,9 @@ racecallatomic_ignore: // racecall will call LLVM race code which might clobber R28 (g) load_g MOVD g_racectx(g), R0 // goroutine context - MOVD 16(RSP), R1 // caller pc + MOVD 56(RSP), R1 // caller pc MOVD R9, R2 // pc - ADD $40, RSP, R3 // arguments + ADD $72, RSP, R3 // arguments BL racecall<>(SB) // Call __tsan_go_ignore_sync_end. MOVD $__tsan_go_ignore_sync_end(SB), R9 @@ -476,10 +477,6 @@ TEXT racecall<>(SB), NOSPLIT|NOFRAME, $0-0 MOVD (g_sched+gobuf_sp)(R11), R12 MOVD R12, RSP call: - // Decrement SP past where the frame pointer is saved in the Go arm64 - // ABI (one word below the stack pointer) so the race detector library - // code doesn't clobber it - SUB $16, RSP BL R9 MOVD R19, RSP JMP (R20) diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go index af7d29f9de..61dad50721 100644 --- a/src/runtime/signal_arm64.go +++ b/src/runtime/signal_arm64.go @@ -8,7 +8,6 @@ package runtime import ( "internal/abi" - "internal/goarch" "internal/runtime/sys" "unsafe" ) @@ -63,18 +62,11 @@ func (c *sigctxt) preparePanic(sig uint32, gp *g) { // We arrange lr, and pc to pretend the panicking // function calls sigpanic directly. // Always save LR to stack so that panics in leaf - // functions are correctly handled. This smashes - // the stack frame but we're not going back there - // anyway. + // functions are correctly handled. + // This extra space is known to gentraceback. sp := c.sp() - sys.StackAlign // needs only sizeof uint64, but must align the stack c.set_sp(sp) *(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr() - // Make sure a valid frame pointer is saved on the stack so that the - // frame pointer checks in adjustframe are happy, if they're enabled. - // Frame pointer unwinding won't visit the sigpanic frame, since - // sigpanic will save the same frame pointer before calling into a panic - // function. 
- *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29() pc := gp.sigpc @@ -96,10 +88,6 @@ func (c *sigctxt) pushCall(targetPC, resumePC uintptr) { sp := c.sp() - 16 // SP needs 16-byte alignment c.set_sp(sp) *(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr() - // Make sure a valid frame pointer is saved on the stack so that the - // frame pointer checks in adjustframe are happy, if they're enabled. - // This is not actually used for unwinding. - *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29() // Set up PC and LR to pretend the function being signaled // calls targetPC at resumePC. c.set_lr(uint64(resumePC)) diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 55e97e77af..5eaceec6da 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -579,23 +579,27 @@ var ptrnames = []string{ // | args to callee | // +------------------+ <- frame->sp // -// (arm) +// (arm64) // +------------------+ // | args from caller | // +------------------+ <- frame->argp -// | caller's retaddr | +// | | +// +------------------+ <- frame->fp (aka caller's sp) +// | return address | // +------------------+ -// | caller's FP (*) | (*) on ARM64, if framepointer_enabled && varp > sp +// | caller's FP | (frame pointer always enabled: TODO) // +------------------+ <- frame->varp // | locals | // +------------------+ // | args to callee | // +------------------+ -// | return address | +// | | // +------------------+ <- frame->sp // // varp > sp means that the function has a frame; // varp == sp means frameless function. +// +// Alignment padding, if needed, will be between "locals" and "args to callee". type adjustinfo struct { old stack @@ -709,7 +713,8 @@ func adjustframe(frame *stkframe, adjinfo *adjustinfo) { } // Adjust saved frame pointer if there is one. 
- if (goarch.ArchFamily == goarch.AMD64 || goarch.ArchFamily == goarch.ARM64) && frame.argp-frame.varp == 2*goarch.PtrSize { + if goarch.ArchFamily == goarch.AMD64 && frame.argp-frame.varp == 2*goarch.PtrSize || + goarch.ArchFamily == goarch.ARM64 && frame.argp-frame.varp == 3*goarch.PtrSize { if stackDebug >= 3 { print(" saved bp\n") } @@ -723,10 +728,7 @@ func adjustframe(frame *stkframe, adjinfo *adjustinfo) { throw("bad frame pointer") } } - // On AMD64, this is the caller's frame pointer saved in the current - // frame. - // On ARM64, this is the frame pointer of the caller's caller saved - // by the caller in its frame (one word below its SP). + // This is the caller's frame pointer saved in the current frame. adjustpointer(adjinfo, unsafe.Pointer(frame.varp)) } diff --git a/src/runtime/testdata/testprog/badtraceback.go b/src/runtime/testdata/testprog/badtraceback.go index 455118a543..36575f765d 100644 --- a/src/runtime/testdata/testprog/badtraceback.go +++ b/src/runtime/testdata/testprog/badtraceback.go @@ -41,6 +41,11 @@ func badLR2(arg int) { if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" { lrOff = 32 // FIXED_FRAME or sys.MinFrameSize } + if runtime.GOARCH == "arm64" { + // skip 8 bytes at bottom of parent frame, then point + // to the 8 bytes of the saved PC at the top of the frame. + lrOff = 16 + } lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff)) *lrPtr = 0xbad diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 8882c306ed..1c3e679a02 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -175,6 +175,11 @@ func (u *unwinder) initAt(pc0, sp0, lr0 uintptr, gp *g, flags unwindFlags) { // Start in the caller's frame. if frame.pc == 0 { if usesLR { + // TODO: this isn't right on arm64. But also, this should + // ~never happen. Calling a nil function will panic + // when loading the PC out of the closure, not when + // branching to that PC. 
(Closures should always have + // valid PCs in their first word.) frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp)) frame.lr = 0 } else { @@ -369,7 +374,11 @@ func (u *unwinder) resolveInternal(innermost, isSyscall bool) { var lrPtr uintptr if usesLR { if innermost && frame.sp < frame.fp || frame.lr == 0 { - lrPtr = frame.sp + if GOARCH == "arm64" { + lrPtr = frame.fp - goarch.PtrSize + } else { + lrPtr = frame.sp + } frame.lr = *(*uintptr)(unsafe.Pointer(lrPtr)) } } else { @@ -385,24 +394,17 @@ func (u *unwinder) resolveInternal(innermost, isSyscall bool) { // On x86, call instruction pushes return PC before entering new function. frame.varp -= goarch.PtrSize } + if GOARCH == "arm64" && frame.varp > frame.sp { + frame.varp -= goarch.PtrSize // LR have been saved, skip over it. + } // For architectures with frame pointers, if there's // a frame, then there's a saved frame pointer here. // // NOTE: This code is not as general as it looks. - // On x86, the ABI is to save the frame pointer word at the + // On x86 and arm64, the ABI is to save the frame pointer word at the // top of the stack frame, so we have to back down over it. - // On arm64, the frame pointer should be at the bottom of - // the stack (with R29 (aka FP) = RSP), in which case we would - // not want to do the subtraction here. But we started out without - // any frame pointer, and when we wanted to add it, we didn't - // want to break all the assembly doing direct writes to 8(RSP) - // to set the first parameter to a called function. - // So we decided to write the FP link *below* the stack pointer - // (with R29 = RSP - 8 in Go functions). - // This is technically ABI-compatible but not standard. - // And it happens to end up mimicking the x86 layout. - // Other architectures may make different decisions. + // No other architectures are framepointer-enabled at the moment. 
if frame.varp > frame.sp && framepointer_enabled { frame.varp -= goarch.PtrSize } @@ -562,7 +564,7 @@ func (u *unwinder) finishInternal() { gp := u.g.ptr() if u.flags&(unwindPrintErrors|unwindSilentErrors) == 0 && u.frame.sp != gp.stktopsp { print("runtime: g", gp.goid, ": frame.sp=", hex(u.frame.sp), " top=", hex(gp.stktopsp), "\n") - print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "\n") + print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "]\n") throw("traceback did not unwind completely") } } diff --git a/test/nosplit.go b/test/nosplit.go index 4b4c93b1d0..1f943fa18c 100644 --- a/test/nosplit.go +++ b/test/nosplit.go @@ -142,7 +142,7 @@ start 136 # (CallSize is 32 on ppc64, 8 on amd64 for frame pointer.) start 96 nosplit start 100 nosplit; REJECT ppc64 ppc64le -start 104 nosplit; REJECT ppc64 ppc64le arm64 +start 104 nosplit; REJECT ppc64 ppc64le start 108 nosplit; REJECT ppc64 ppc64le start 112 nosplit; REJECT ppc64 ppc64le arm64 start 116 nosplit; REJECT ppc64 ppc64le @@ -160,7 +160,7 @@ start 136 nosplit; REJECT # Because AMD64 uses frame pointer, it has 8 fewer bytes. start 96 nosplit call f; f 0 nosplit start 100 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le -start 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le arm64 +start 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le start 108 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le start 112 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 arm64 start 116 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 @@ -176,7 +176,7 @@ start 136 nosplit call f; f 0 nosplit; REJECT # Architectures differ in the same way as before. 
start 96 nosplit call f; f 0 call f start 100 nosplit call f; f 0 call f; REJECT ppc64 ppc64le -start 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64 +start 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 start 108 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 start 112 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64 start 116 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 @@ -189,7 +189,7 @@ start 136 nosplit call f; f 0 call f; REJECT # Indirect calls are assumed to be splitting functions. start 96 nosplit callind start 100 nosplit callind; REJECT ppc64 ppc64le -start 104 nosplit callind; REJECT ppc64 ppc64le amd64 arm64 +start 104 nosplit callind; REJECT ppc64 ppc64le amd64 start 108 nosplit callind; REJECT ppc64 ppc64le amd64 start 112 nosplit callind; REJECT ppc64 ppc64le amd64 arm64 start 116 nosplit callind; REJECT ppc64 ppc64le amd64 -- cgit v1.3-6-g1900 From c938051dd0b80a5c60572d6807270d06ca685d2e Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 7 Oct 2025 07:58:50 -0700 Subject: Revert "cmd/compile: redo arm64 LR/FP save and restore" This reverts commit 719dfcf8a8478d70360bf3c34c0e920be7b32994. Reason for revert: Causing crashes. 
Change-Id: I0b8526dd03d82fa074ce4f97f1789eeac702b3eb Reviewed-on: https://go-review.googlesource.com/c/go/+/709755 Reviewed-by: Keith Randall Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Auto-Submit: Keith Randall Reviewed-by: Cherry Mui --- src/cmd/compile/abi-internal.md | 12 +- src/cmd/compile/internal/arm64/ggen.go | 10 +- src/cmd/compile/internal/arm64/ssa.go | 2 +- src/cmd/compile/internal/ssagen/pgen.go | 6 - src/cmd/compile/internal/ssagen/ssa.go | 3 +- src/cmd/internal/obj/arm64/asm7.go | 12 +- src/cmd/internal/obj/arm64/obj7.go | 326 +++++++++++++++++--------- src/cmd/link/internal/amd64/obj.go | 19 +- src/cmd/link/internal/arm64/obj.go | 23 +- src/cmd/link/internal/ld/dwarf.go | 7 +- src/cmd/link/internal/ld/lib.go | 4 - src/cmd/link/internal/ld/stackcheck.go | 5 + src/cmd/link/internal/x86/obj.go | 15 +- src/runtime/asm_arm64.s | 81 ++----- src/runtime/mkpreempt.go | 20 +- src/runtime/panic.go | 8 +- src/runtime/preempt_arm64.s | 15 +- src/runtime/race_arm64.s | 17 +- src/runtime/signal_arm64.go | 16 +- src/runtime/stack.go | 20 +- src/runtime/testdata/testprog/badtraceback.go | 5 - src/runtime/traceback.go | 30 ++- test/nosplit.go | 8 +- 23 files changed, 361 insertions(+), 303 deletions(-) (limited to 'src/runtime/testdata') diff --git a/src/cmd/compile/abi-internal.md b/src/cmd/compile/abi-internal.md index 490e1affb7..eae230dc07 100644 --- a/src/cmd/compile/abi-internal.md +++ b/src/cmd/compile/abi-internal.md @@ -576,19 +576,19 @@ A function's stack frame, after the frame is created, is laid out as follows: +------------------------------+ - | return PC | - | frame pointer on entry | ← R29 points to | ... locals ... | | ... outgoing arguments ... | - | unused word | ← RSP points to + | return PC | ← RSP points to + | frame pointer on entry | +------------------------------+ ↓ lower addresses The "return PC" is loaded to the link register, R30, as part of the arm64 `CALL` operation. 
-On entry, a function pushes R30 (the return address) and R29 -(the caller's frame pointer) onto the bottom of the stack. It then -subtracts a constant from RSP to open its stack frame. +On entry, a function subtracts from RSP to open its stack frame, and +saves the values of R30 and R29 at the bottom of the frame. +Specifically, R30 is saved at 0(RSP) and R29 is saved at -8(RSP), +after RSP is updated. A leaf function that does not require any stack space may omit the saved R30 and R29. diff --git a/src/cmd/compile/internal/arm64/ggen.go b/src/cmd/compile/internal/arm64/ggen.go index 6ba56b992e..1402746700 100644 --- a/src/cmd/compile/internal/arm64/ggen.go +++ b/src/cmd/compile/internal/arm64/ggen.go @@ -11,12 +11,10 @@ import ( ) func padframe(frame int64) int64 { - // arm64 requires frame sizes here that are 8 mod 16. - // With the additional (unused) slot at the bottom of the frame, - // that makes an aligned 16 byte frame. - // Adding a save region for LR+FP does not change the alignment. - if frame != 0 { - frame += (-(frame + 8)) & 15 + // arm64 requires that the frame size (not counting saved FP&LR) + // be 16 bytes aligned. If not, pad it. + if frame%16 != 0 { + frame += 16 - (frame % 16) } return frame } diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go index 9f79a740c6..7bc0e536e9 100644 --- a/src/cmd/compile/internal/arm64/ssa.go +++ b/src/cmd/compile/internal/arm64/ssa.go @@ -221,7 +221,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { for i := 0; i < len(args); i++ { a := args[i] - // Offset by size of the unused slot before start of args. + // Offset by size of the saved LR slot. addr := ssagen.SpillSlotAddr(a, arm64.REGSP, base.Ctxt.Arch.FixedFrameSize) // Look for double-register operations if we can. 
if i < len(args)-1 { diff --git a/src/cmd/compile/internal/ssagen/pgen.go b/src/cmd/compile/internal/ssagen/pgen.go index f0776172b9..0a2010363f 100644 --- a/src/cmd/compile/internal/ssagen/pgen.go +++ b/src/cmd/compile/internal/ssagen/pgen.go @@ -393,16 +393,10 @@ func StackOffset(slot ssa.LocalSlot) int32 { case ir.PAUTO: off = n.FrameOffset() if base.Ctxt.Arch.FixedFrameSize == 0 { - // x86 return address off -= int64(types.PtrSize) } if buildcfg.FramePointerEnabled { - // frame pointer off -= int64(types.PtrSize) - if buildcfg.GOARCH == "arm64" { - // arm64 return address also - off -= int64(types.PtrSize) - } } } return int32(off + slot.Off) diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 107447f04c..1e2159579d 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -7150,7 +7150,6 @@ func defframe(s *State, e *ssafn, f *ssa.Func) { // Insert code to zero ambiguously live variables so that the // garbage collector only sees initialized values when it // looks for pointers. - // Note: lo/hi are offsets from varp and will be negative. var lo, hi int64 // Opaque state for backend to use. Current backends use it to @@ -7158,7 +7157,7 @@ func defframe(s *State, e *ssafn, f *ssa.Func) { var state uint32 // Iterate through declarations. Autos are sorted in decreasing - // frame offset order (least negative to most negative). + // frame offset order. 
for _, n := range e.curfn.Dcl { if !n.Needzero() { continue diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 281d705a3e..743d09a319 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -51,6 +51,7 @@ type ctxt7 struct { blitrl *obj.Prog elitrl *obj.Prog autosize int32 + extrasize int32 instoffset int64 pc int64 pool struct { @@ -1121,7 +1122,8 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { ctxt.Diag("arm64 ops not initialized, call arm64.buildop first") } - c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset)} + c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset & 0xffffffff), extrasize: int32(p.To.Offset >> 32)} + p.To.Offset &= 0xffffffff // extrasize is no longer needed // Process literal pool and allocate initial program counter for each Prog, before // generating branch veneers. @@ -2117,8 +2119,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int { // a.Offset is still relative to pseudo-SP. a.Reg = obj.REG_NONE } - // The frame top 16 bytes are for LR/FP - c.instoffset = int64(c.autosize) + a.Offset - extrasize + // The frame top 8 or 16 bytes are for FP + c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize) return autoclass(c.instoffset) case obj.NAME_PARAM: @@ -2178,8 +2180,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int { // a.Offset is still relative to pseudo-SP. 
a.Reg = obj.REG_NONE } - // The frame top 16 bytes are for LR/FP - c.instoffset = int64(c.autosize) + a.Offset - extrasize + // The frame top 8 or 16 bytes are for FP + c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize) case obj.NAME_PARAM: if a.Reg == REGSP { diff --git a/src/cmd/internal/obj/arm64/obj7.go b/src/cmd/internal/obj/arm64/obj7.go index a697426145..2583e46354 100644 --- a/src/cmd/internal/obj/arm64/obj7.go +++ b/src/cmd/internal/obj/arm64/obj7.go @@ -36,6 +36,7 @@ import ( "cmd/internal/src" "cmd/internal/sys" "internal/abi" + "internal/buildcfg" "log" "math" ) @@ -471,8 +472,6 @@ func (c *ctxt7) rewriteToUseGot(p *obj.Prog) { obj.Nopout(p) } -const extrasize = 16 // space needed in the frame for LR+FP - func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { if cursym.Func().Text == nil || cursym.Func().Text.Link == nil { return @@ -522,26 +521,33 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { c.autosize = int32(textstksiz) if p.Mark&LEAF != 0 && c.autosize == 0 { - // A leaf function with no locals needs no frame. + // A leaf function with no locals has no frame. p.From.Sym.Set(obj.AttrNoFrame, true) } if !p.From.Sym.NoFrame() { // If there is a stack frame at all, it includes - // space for the (now unused) word at [SP:SP+8]. + // space to save the LR. c.autosize += 8 } - // Round up to a multiple of 16. - c.autosize += (-c.autosize) & 15 - if c.autosize != 0 { - // Allocate an extra 16 bytes at the top of the frame - // to save LR+FP. 
+ extrasize := int32(0) + if c.autosize%16 == 8 { + // Allocate extra 8 bytes on the frame top to save FP + extrasize = 8 + } else if c.autosize&(16-1) == 0 { + // Allocate extra 16 bytes to save FP for the old frame whose size is 8 mod 16 + extrasize = 16 + } else { + c.ctxt.Diag("%v: unaligned frame size %d - must be 16 aligned", p, c.autosize-8) + } c.autosize += extrasize c.cursym.Func().Locals += extrasize - p.To.Offset = int64(c.autosize) + // low 32 bits for autosize + // high 32 bits for extrasize + p.To.Offset = int64(c.autosize) | int64(extrasize)<<32 } else { // NOFRAME p.To.Offset = 0 @@ -574,72 +580,120 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { var prologueEnd *obj.Prog aoffset := c.autosize - if aoffset < 16 { - log.Fatalf("aoffset too small %d", aoffset) + if aoffset > 0xf0 { + // MOVD.W offset variant range is -0x100 to 0xf8, SP should be 16-byte aligned. + // so the maximum aoffset value is 0xf0. + aoffset = 0xf0 } + // Frame is non-empty. Make sure to save link register, even if + // it is a leaf function, so that traceback works. q = p - - // Store return address and frame pointer at the top of the stack frame. - // STP.W (R29, R30), -16(SP) - q1 = obj.Appendp(q, c.newprog) - q1.Pos = p.Pos - q1.As = ASTP - q1.From.Type = obj.TYPE_REGREG - q1.From.Reg = REGFP - q1.From.Offset = REGLINK - q1.To.Type = obj.TYPE_MEM - q1.To.Reg = REG_RSP - q1.To.Offset = -16 - q1.Scond = C_XPRE - - prologueEnd = q1 - - // Update frame pointer - q1 = obj.Appendp(q1, c.newprog) - q1.Pos = p.Pos - q1.As = AMOVD - q1.From.Type = obj.TYPE_REG - q1.From.Reg = REGSP - q1.To.Type = obj.TYPE_REG - q1.To.Reg = REGFP - - // Allocate additional frame space. 
- adj := aoffset - 16 - if adj > 0 { - // SUB $autosize-16, RSP - if adj < 1<<12 { - q1 = obj.Appendp(q1, c.newprog) - q1.Pos = p.Pos - q1.As = ASUB - q1.From.Type = obj.TYPE_CONST - q1.From.Offset = int64(adj) - q1.To.Type = obj.TYPE_REG - q1.To.Reg = REGSP - } else { - // Constant too big for atomic subtract. - // Materialize in tmp register first. - q1 = obj.Appendp(q1, c.newprog) - q1.Pos = p.Pos - q1.As = AMOVD - q1.From.Type = obj.TYPE_CONST - q1.From.Offset = int64(adj) - q1.To.Type = obj.TYPE_REG - q1.To.Reg = REGTMP - + if c.autosize > aoffset { + // Frame size is too large for a MOVD.W instruction. Store the frame pointer + // register and link register before decrementing SP, so if a signal comes + // during the execution of the function prologue, the traceback code will + // not see a half-updated stack frame. + + // SUB $autosize, RSP, R20 + q1 = obj.Appendp(q, c.newprog) + q1.Pos = p.Pos + q1.As = ASUB + q1.From.Type = obj.TYPE_CONST + q1.From.Offset = int64(c.autosize) + q1.Reg = REGSP + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REG_R20 + + prologueEnd = q1 + + // STP (R29, R30), -8(R20) + q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = ASTP + q1.From.Type = obj.TYPE_REGREG + q1.From.Reg = REGFP + q1.From.Offset = REGLINK + q1.To.Type = obj.TYPE_MEM + q1.To.Reg = REG_R20 + q1.To.Offset = -8 + + // This is not async preemptible, as if we open a frame + // at the current SP, it will clobber the saved LR. + q1 = c.ctxt.StartUnsafePoint(q1, c.newprog) + + // MOVD R20, RSP + q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = AMOVD + q1.From.Type = obj.TYPE_REG + q1.From.Reg = REG_R20 + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REGSP + q1.Spadj = c.autosize + + q1 = c.ctxt.EndUnsafePoint(q1, c.newprog, -1) + + if buildcfg.GOOS == "ios" { + // iOS does not support SA_ONSTACK. We will run the signal handler + // on the G stack. If we write below SP, it may be clobbered by + // the signal handler. So we save FP and LR after decrementing SP. 
+ // STP (R29, R30), -8(RSP) q1 = obj.Appendp(q1, c.newprog) q1.Pos = p.Pos - q1.As = ASUB - q1.From.Type = obj.TYPE_REG - q1.From.Reg = REGTMP - q1.To.Type = obj.TYPE_REG + q1.As = ASTP + q1.From.Type = obj.TYPE_REGREG + q1.From.Reg = REGFP + q1.From.Offset = REGLINK + q1.To.Type = obj.TYPE_MEM q1.To.Reg = REGSP + q1.To.Offset = -8 } - q1.Spadj = adj + } else { + // small frame, update SP and save LR in a single MOVD.W instruction. + // So if a signal comes during the execution of the function prologue, + // the traceback code will not see a half-updated stack frame. + // Also, on Linux, in a cgo binary we may get a SIGSETXID signal + // early on before the signal stack is set, as glibc doesn't allow + // us to block SIGSETXID. So it is important that we don't write below + // the SP until the signal stack is set. + // Luckily, all the functions from thread entry to setting the signal + // stack have small frames. + q1 = obj.Appendp(q, c.newprog) + q1.As = AMOVD + q1.Pos = p.Pos + q1.From.Type = obj.TYPE_REG + q1.From.Reg = REGLINK + q1.To.Type = obj.TYPE_MEM + q1.Scond = C_XPRE + q1.To.Offset = int64(-aoffset) + q1.To.Reg = REGSP + q1.Spadj = aoffset + + prologueEnd = q1 + + // Frame pointer. 
+ q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = AMOVD + q1.From.Type = obj.TYPE_REG + q1.From.Reg = REGFP + q1.To.Type = obj.TYPE_MEM + q1.To.Reg = REGSP + q1.To.Offset = -8 } prologueEnd.Pos = prologueEnd.Pos.WithXlogue(src.PosPrologueEnd) + q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = ASUB + q1.From.Type = obj.TYPE_CONST + q1.From.Offset = 8 + q1.Reg = REGSP + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REGFP + case obj.ARET: nocache(p) if p.From.Type == obj.TYPE_CONST { @@ -653,56 +707,105 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { } p.To = obj.Addr{} aoffset := c.autosize - if aoffset > 0 { - if aoffset < 16 { - log.Fatalf("aoffset too small %d", aoffset) - } - adj := aoffset - 16 - if adj > 0 { - if adj < 1<<12 { - // ADD $adj, RSP, RSP - p.As = AADD - p.From.Type = obj.TYPE_CONST - p.From.Offset = int64(adj) - p.To.Type = obj.TYPE_REG - p.To.Reg = REGSP - } else { - // Put frame size in a separate register and - // add it in with a single instruction, - // so we never have a partial frame during - // the epilog. See issue 73259. - - // MOVD $adj, REGTMP - p.As = AMOVD - p.From.Type = obj.TYPE_CONST - p.From.Offset = int64(adj) - p.To.Type = obj.TYPE_REG - p.To.Reg = REGTMP - // ADD REGTMP, RSP, RSP - p = obj.Appendp(p, c.newprog) - p.As = AADD - p.From.Type = obj.TYPE_REG - p.From.Reg = REGTMP - p.To.Type = obj.TYPE_REG - p.To.Reg = REGSP - } - p.Spadj = -adj - } - - // Pop LR+FP. - // LDP.P 16(RSP), (R29, R30) - if p.As != obj.ARET { + if c.cursym.Func().Text.Mark&LEAF != 0 { + if aoffset != 0 { + // Restore frame pointer. + // ADD $framesize-8, RSP, R29 + p.As = AADD + p.From.Type = obj.TYPE_CONST + p.From.Offset = int64(c.autosize) - 8 + p.Reg = REGSP + p.To.Type = obj.TYPE_REG + p.To.Reg = REGFP + + // Pop stack frame. 
+ // ADD $framesize, RSP, RSP p = obj.Appendp(p, c.newprog) + p.As = AADD + p.From.Type = obj.TYPE_CONST + p.From.Offset = int64(c.autosize) + p.To.Type = obj.TYPE_REG + p.To.Reg = REGSP + p.Spadj = -c.autosize } - p.As = ALDP + } else if aoffset <= 0xF0 { + // small frame, restore LR and update SP in a single MOVD.P instruction. + // There is no correctness issue to use a single LDP for LR and FP, + // but the instructions are not pattern matched with the prologue's + // MOVD.W and MOVD, which may cause performance issue in + // store-forwarding. + + // MOVD -8(RSP), R29 + p.As = AMOVD p.From.Type = obj.TYPE_MEM p.From.Reg = REGSP - p.From.Offset = 16 + p.From.Offset = -8 + p.To.Type = obj.TYPE_REG + p.To.Reg = REGFP + p = obj.Appendp(p, c.newprog) + + // MOVD.P offset(RSP), R30 + p.As = AMOVD + p.From.Type = obj.TYPE_MEM p.Scond = C_XPOST + p.From.Offset = int64(aoffset) + p.From.Reg = REGSP + p.To.Type = obj.TYPE_REG + p.To.Reg = REGLINK + p.Spadj = -aoffset + } else { + // LDP -8(RSP), (R29, R30) + p.As = ALDP + p.From.Type = obj.TYPE_MEM + p.From.Offset = -8 + p.From.Reg = REGSP p.To.Type = obj.TYPE_REGREG p.To.Reg = REGFP p.To.Offset = REGLINK - p.Spadj = -16 + + if aoffset < 1<<12 { + // ADD $aoffset, RSP, RSP + q = newprog() + q.As = AADD + q.From.Type = obj.TYPE_CONST + q.From.Offset = int64(aoffset) + q.To.Type = obj.TYPE_REG + q.To.Reg = REGSP + q.Spadj = -aoffset + q.Pos = p.Pos + q.Link = p.Link + p.Link = q + p = q + } else { + // Put frame size in a separate register and + // add it in with a single instruction, + // so we never have a partial frame during + // the epilog. See issue 73259. 
+ + // MOVD $aoffset, REGTMP + q = newprog() + q.As = AMOVD + q.From.Type = obj.TYPE_CONST + q.From.Offset = int64(aoffset) + q.To.Type = obj.TYPE_REG + q.To.Reg = REGTMP + q.Pos = p.Pos + q.Link = p.Link + p.Link = q + p = q + // ADD REGTMP, RSP, RSP + q = newprog() + q.As = AADD + q.From.Type = obj.TYPE_REG + q.From.Reg = REGTMP + q.To.Type = obj.TYPE_REG + q.To.Reg = REGSP + q.Spadj = -aoffset + q.Pos = p.Pos + q.Link = p.Link + p.Link = q + p = q + } } // If enabled, this code emits 'MOV PC, R27' before every 'MOV LR, PC', @@ -765,11 +868,10 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.From.Type = obj.TYPE_REG p.From.Reg = REGLINK } else { - /* MOVD framesize-8(RSP), Rd */ + /* MOVD (RSP), Rd */ p.As = AMOVD p.From.Type = obj.TYPE_MEM p.From.Reg = REGSP - p.From.Offset = int64(c.autosize - 8) } } if p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP && p.Spadj == 0 { @@ -804,12 +906,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.From.Reg = int16(REG_LSL + r + (shift&7)<<5) p.From.Offset = 0 } - if p.To.Type == obj.TYPE_MEM && p.To.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) { - p.Spadj += int32(-p.To.Offset) - } - if p.From.Type == obj.TYPE_MEM && p.From.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) { - p.Spadj += int32(-p.From.Offset) - } } } diff --git a/src/cmd/link/internal/amd64/obj.go b/src/cmd/link/internal/amd64/obj.go index 761496549f..3a6141b909 100644 --- a/src/cmd/link/internal/amd64/obj.go +++ b/src/cmd/link/internal/amd64/obj.go @@ -51,16 +51,15 @@ func Init() (*sys.Arch, ld.Arch) { Plan9Magic: uint32(4*26*26 + 7), Plan9_64Bit: true, - Adddynrel: adddynrel, - Archinit: archinit, - Archreloc: archreloc, - Archrelocvariant: archrelocvariant, - Gentext: gentext, - Machoreloc1: machoreloc1, - MachorelocSize: 8, - PEreloc1: pereloc1, - TLSIEtoLE: tlsIEtoLE, - ReturnAddressAtTopOfFrame: true, + Adddynrel: adddynrel, + Archinit: archinit, + Archreloc: 
archreloc, + Archrelocvariant: archrelocvariant, + Gentext: gentext, + Machoreloc1: machoreloc1, + MachorelocSize: 8, + PEreloc1: pereloc1, + TLSIEtoLE: tlsIEtoLE, ELF: ld.ELFArch{ Linuxdynld: "/lib64/ld-linux-x86-64.so.2", diff --git a/src/cmd/link/internal/arm64/obj.go b/src/cmd/link/internal/arm64/obj.go index e1e4ade818..3d358155ba 100644 --- a/src/cmd/link/internal/arm64/obj.go +++ b/src/cmd/link/internal/arm64/obj.go @@ -47,18 +47,17 @@ func Init() (*sys.Arch, ld.Arch) { Dwarfreglr: dwarfRegLR, TrampLimit: 0x7c00000, // 26-bit signed offset * 4, leave room for PLT etc. - Adddynrel: adddynrel, - Archinit: archinit, - Archreloc: archreloc, - Archrelocvariant: archrelocvariant, - Extreloc: extreloc, - Gentext: gentext, - GenSymsLate: gensymlate, - Machoreloc1: machoreloc1, - MachorelocSize: 8, - PEreloc1: pereloc1, - Trampoline: trampoline, - ReturnAddressAtTopOfFrame: true, + Adddynrel: adddynrel, + Archinit: archinit, + Archreloc: archreloc, + Archrelocvariant: archrelocvariant, + Extreloc: extreloc, + Gentext: gentext, + GenSymsLate: gensymlate, + Machoreloc1: machoreloc1, + MachorelocSize: 8, + PEreloc1: pereloc1, + Trampoline: trampoline, ELF: ld.ELFArch{ Androiddynld: "/system/bin/linker64", diff --git a/src/cmd/link/internal/ld/dwarf.go b/src/cmd/link/internal/ld/dwarf.go index c4d12a5488..0003938ef2 100644 --- a/src/cmd/link/internal/ld/dwarf.go +++ b/src/cmd/link/internal/ld/dwarf.go @@ -1544,14 +1544,9 @@ func (d *dwctxt) writeframes(fs loader.Sym) dwarfSecInfo { if pcsp.Value > 0 { // The return address is preserved at (CFA-frame_size) // after a stack frame has been allocated. - off := -spdelta - if thearch.ReturnAddressAtTopOfFrame { - // Except arm64, which has it at the top of frame. 
- off = -int64(d.arch.PtrSize) - } deltaBuf = append(deltaBuf, dwarf.DW_CFA_offset_extended_sf) deltaBuf = dwarf.AppendUleb128(deltaBuf, uint64(thearch.Dwarfreglr)) - deltaBuf = dwarf.AppendSleb128(deltaBuf, off/dataAlignmentFactor) + deltaBuf = dwarf.AppendSleb128(deltaBuf, -spdelta/dataAlignmentFactor) } else { // The return address is restored into the link register // when a stack frame has been de-allocated. diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go index 5f5ebfc1d9..2c861129b5 100644 --- a/src/cmd/link/internal/ld/lib.go +++ b/src/cmd/link/internal/ld/lib.go @@ -263,10 +263,6 @@ type Arch struct { // optional override for assignAddress AssignAddress func(ldr *loader.Loader, sect *sym.Section, n int, s loader.Sym, va uint64, isTramp bool) (*sym.Section, int, uint64) - // Reports whether the return address is stored at the top (highest address) - // of the stack frame. - ReturnAddressAtTopOfFrame bool - // ELF specific information. ELF ELFArch } diff --git a/src/cmd/link/internal/ld/stackcheck.go b/src/cmd/link/internal/ld/stackcheck.go index 14cd3a2238..98e7edaeb1 100644 --- a/src/cmd/link/internal/ld/stackcheck.go +++ b/src/cmd/link/internal/ld/stackcheck.go @@ -9,6 +9,7 @@ import ( "cmd/internal/objabi" "cmd/link/internal/loader" "fmt" + "internal/buildcfg" "sort" "strings" ) @@ -61,6 +62,10 @@ func (ctxt *Link) doStackCheck() { // that there are at least StackLimit bytes available below SP // when morestack returns. limit := objabi.StackNosplit(*flagRace) - sc.callSize + if buildcfg.GOARCH == "arm64" { + // Need an extra 8 bytes below SP to save FP. + limit -= 8 + } // Compute stack heights without any back-tracking information. 
// This will almost certainly succeed and we can simply diff --git a/src/cmd/link/internal/x86/obj.go b/src/cmd/link/internal/x86/obj.go index a4885fde8f..4336f01ea3 100644 --- a/src/cmd/link/internal/x86/obj.go +++ b/src/cmd/link/internal/x86/obj.go @@ -50,14 +50,13 @@ func Init() (*sys.Arch, ld.Arch) { Plan9Magic: uint32(4*11*11 + 7), - Adddynrel: adddynrel, - Archinit: archinit, - Archreloc: archreloc, - Archrelocvariant: archrelocvariant, - Gentext: gentext, - Machoreloc1: machoreloc1, - PEreloc1: pereloc1, - ReturnAddressAtTopOfFrame: true, + Adddynrel: adddynrel, + Archinit: archinit, + Archreloc: archreloc, + Archrelocvariant: archrelocvariant, + Gentext: gentext, + Machoreloc1: machoreloc1, + PEreloc1: pereloc1, ELF: ld.ELFArch{ Linuxdynld: "/lib/ld-linux.so.2", diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index aa49a27a75..a0e82ec830 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -50,7 +50,9 @@ TEXT _rt0_arm64_lib(SB),NOSPLIT,$184 CBZ R4, nocgo MOVD $_rt0_arm64_lib_go(SB), R0 MOVD $0, R1 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL (R4) + ADD $16, RSP B restore nocgo: @@ -369,6 +371,7 @@ switch: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP + MOVD (g_sched+gobuf_bp)(g), R29 MOVD $0, (g_sched+gobuf_sp)(g) MOVD $0, (g_sched+gobuf_bp)(g) RET @@ -378,8 +381,8 @@ noswitch: // Using a tail call here cleans up tracebacks since we won't stop // at an intermediate systemstack. MOVD 0(R26), R3 // code pointer - ADD $16, RSP - LDP.P 16(RSP), (R29,R30) // restore FP, LR + MOVD.P 16(RSP), R30 // restore LR + SUB $8, RSP, R29 // restore FP B (R3) // func switchToCrashStack0(fn func()) @@ -1048,7 +1051,7 @@ again: // Smashes R0. 
TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0 MOVD $runtime·systemstack_switch(SB), R0 - ADD $12, R0 // get past prologue + ADD $8, R0 // get past prologue MOVD R0, (g_sched+gobuf_pc)(g) MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) @@ -1066,7 +1069,9 @@ TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0 TEXT ·asmcgocall_no_g(SB),NOSPLIT,$0-16 MOVD fn+0(FP), R1 MOVD arg+8(FP), R0 + SUB $16, RSP // skip over saved frame pointer below RSP BL (R1) + ADD $16, RSP // skip over saved frame pointer below RSP RET // func asmcgocall(fn, arg unsafe.Pointer) int32 @@ -1231,9 +1236,9 @@ havem: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -8(R4) + MOVD R5, -48(R4) MOVD (g_sched+gobuf_bp)(g), R5 - MOVD R5, -16(R4) + MOVD R5, -56(R4) // Gather our arguments into registers. MOVD fn+0(FP), R1 MOVD frame+8(FP), R2 @@ -1247,7 +1252,7 @@ havem: CALL (R0) // indirect call to bypass nosplit check. We're on a different stack now. // Restore g->sched (== m->curg->sched) from saved values. - MOVD 40(RSP), R5 + MOVD 0(RSP), R5 MOVD R5, (g_sched+gobuf_pc)(g) MOVD RSP, R4 ADD $48, R4, R4 @@ -1485,57 +1490,10 @@ GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below // // This is ABIInternal because Go code injects its PC directly into new // goroutine stacks. -// -// State before debugger starts doing anything: -// | current | -// | stack | -// +-------------+ <- SP = origSP -// stopped executing at PC = origPC -// some values are in LR (origLR) and FP (origFP) -// -// After debugger has done steps 1-6 above: -// | current | -// | stack | -// +-------------+ <- origSP -// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.) 
-// +-------------+ -// | origLR | -// +-------------+ <- SP -// | ----- | -// +-------------+ -// | argsize | -// +-------------+ -// LR = origPC, PC = debugCallV2 -// -// debugCallV2 then modifies the stack up to the "good" label: -// | current | -// | stack | -// +-------------+ <- origSP -// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.) -// +-------------+ -// | origLR | -// +-------------+ <- where debugger left SP -// | origPC | -// +-------------+ -// | origFP | -// +-------------+ <- FP = SP + 256 -// | saved | -// | registers | -// | (224 bytes) | -// +-------------+ <- SP + 32 -// | space for | -// | outargs | -// +-------------+ <- SP + 8 -// | argsize | -// +-------------+ <- SP - TEXT runtime·debugCallV2(SB),NOSPLIT|NOFRAME,$0-0 - MOVD R30, -8(RSP) // save origPC - MOVD -16(RSP), R30 // save argsize in R30 temporarily - MOVD.W R29, -16(RSP) // push origFP - MOVD RSP, R29 // frame pointer chain now set up - SUB $256, RSP, RSP // allocate frame - MOVD R30, (RSP) // Save argsize on the stack + STP (R29, R30), -280(RSP) + SUB $272, RSP, RSP + SUB $8, RSP, R29 // Save all registers that may contain pointers so they can be // conservatively scanned. // @@ -1557,8 +1515,7 @@ TEXT runtime·debugCallV2(SB),NOSPLIT|NOFRAME,$0-0 STP (R0, R1), (4*8)(RSP) // Perform a safe-point check. 
- MOVD 264(RSP), R0 // origPC - MOVD R0, 8(RSP) + MOVD R30, 8(RSP) // Caller's PC CALL runtime·debugCallCheck(SB) MOVD 16(RSP), R0 CBZ R0, good @@ -1602,7 +1559,7 @@ good: CALL runtime·debugCallWrap(SB); \ JMP restore - MOVD (RSP), R0 // the argument frame size + MOVD 256(RSP), R0 // the argument frame size DEBUG_CALL_DISPATCH(debugCall32<>, 32) DEBUG_CALL_DISPATCH(debugCall64<>, 64) DEBUG_CALL_DISPATCH(debugCall128<>, 128) @@ -1650,9 +1607,9 @@ restore: LDP (6*8)(RSP), (R2, R3) LDP (4*8)(RSP), (R0, R1) - MOVD 272(RSP), R30 // restore old lr (saved by (*sigctxt).pushCall) - LDP 256(RSP), (R29, R27) // restore old fp, set up resumption address - ADD $288, RSP, RSP // Pop frame, LR+FP, and block pushed by (*sigctxt).pushCall + LDP -8(RSP), (R29, R27) + ADD $288, RSP, RSP // Add 16 more bytes, see saveSigContext + MOVD -16(RSP), R30 // restore old lr JMP (R27) // runtime.debugCallCheck assumes that functions defined with the diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 9064cae039..769c4ffc5c 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -488,18 +488,26 @@ func genARM64(g *gen) { l.stack += 8 // SP needs 16-byte alignment } - // allocate frame, save PC (in R30), FP (in R29) of interrupted instruction - p("STP.W (R29, R30), -16(RSP)") - p("MOVD RSP, R29") // set up new frame pointer + // allocate frame, save PC of interrupted instruction (in LR) + p("MOVD R30, %d(RSP)", -l.stack) p("SUB $%d, RSP", l.stack) + p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux) + p("SUB $8, RSP, R29") // set up new frame pointer + // On iOS, save the LR again after decrementing SP. We run the + // signal handler on the G stack (as it doesn't support sigaltstack), + // so any writes below SP may be clobbered. 
+ p("#ifdef GOOS_ios") + p("MOVD R30, (RSP)") + p("#endif") l.save(g) p("CALL ·asyncPreempt2(SB)") l.restore(g) - p("MOVD %d(RSP), R30", l.stack+16) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it - p("LDP %d(RSP), (R29, R27)", l.stack) // Restore frame pointer. Load PC into regtmp. - p("ADD $%d, RSP", l.stack+32) // pop frame (including the space pushed by sigctxt.pushCall) + p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it + p("MOVD -8(RSP), R29") // restore frame pointer + p("MOVD (RSP), R27") // load PC to REGTMP + p("ADD $%d, RSP", l.stack+16) // pop frame (including the space pushed by sigctxt.pushCall) p("RET (R27)") } diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 04b3afe168..8c91c9435a 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -1379,10 +1379,10 @@ func recovery(gp *g) { // the caller gp.sched.bp = fp - 2*goarch.PtrSize case goarch.IsArm64 != 0: - // on arm64, the first two words of the frame are caller's PC - // (the saved LR register) and the caller's BP. - // Coincidentally, the same as amd64. - gp.sched.bp = fp - 2*goarch.PtrSize + // on arm64, the architectural bp points one word higher + // than the sp. fp is totally useless to us here, because it + // only gets us to the caller's fp. 
+ gp.sched.bp = sp - goarch.PtrSize } gogo(&gp.sched) } diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s index f4248cac25..31ec9d940f 100644 --- a/src/runtime/preempt_arm64.s +++ b/src/runtime/preempt_arm64.s @@ -4,9 +4,13 @@ #include "textflag.h" TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 - STP.W (R29, R30), -16(RSP) - MOVD RSP, R29 + MOVD R30, -496(RSP) SUB $496, RSP + MOVD R29, -8(RSP) + SUB $8, RSP, R29 + #ifdef GOOS_ios + MOVD R30, (RSP) + #endif STP (R0, R1), 8(RSP) STP (R2, R3), 24(RSP) STP (R4, R5), 40(RSP) @@ -74,7 +78,8 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 LDP 40(RSP), (R4, R5) LDP 24(RSP), (R2, R3) LDP 8(RSP), (R0, R1) - MOVD 512(RSP), R30 - LDP 496(RSP), (R29, R27) - ADD $528, RSP + MOVD 496(RSP), R30 + MOVD -8(RSP), R29 + MOVD (RSP), R27 + ADD $512, RSP RET (R27) diff --git a/src/runtime/race_arm64.s b/src/runtime/race_arm64.s index feaa328d4c..5df650105b 100644 --- a/src/runtime/race_arm64.s +++ b/src/runtime/race_arm64.s @@ -397,7 +397,7 @@ TEXT racecallatomic<>(SB), NOSPLIT, $0 // R3 = addr of incoming arg list // Trigger SIGSEGV early. - MOVD 72(RSP), R3 // 1st arg is addr. after two small frames (32 bytes each), get it at 72(RSP) + MOVD 40(RSP), R3 // 1st arg is addr. after two times BL, get it at 40(RSP) MOVB (R3), R13 // segv here if addr is bad // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). MOVD runtime·racearenastart(SB), R10 @@ -417,11 +417,10 @@ racecallatomic_ok: // Addr is within the good range, call the atomic function. load_g MOVD g_racectx(g), R0 // goroutine context - MOVD 56(RSP), R1 // caller pc + MOVD 16(RSP), R1 // caller pc MOVD R9, R2 // pc - ADD $72, RSP, R3 - BL racecall<>(SB) - RET + ADD $40, RSP, R3 + JMP racecall<>(SB) // does not return racecallatomic_ignore: // Addr is outside the good range. // Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op. 
@@ -436,9 +435,9 @@ racecallatomic_ignore: // racecall will call LLVM race code which might clobber R28 (g) load_g MOVD g_racectx(g), R0 // goroutine context - MOVD 56(RSP), R1 // caller pc + MOVD 16(RSP), R1 // caller pc MOVD R9, R2 // pc - ADD $72, RSP, R3 // arguments + ADD $40, RSP, R3 // arguments BL racecall<>(SB) // Call __tsan_go_ignore_sync_end. MOVD $__tsan_go_ignore_sync_end(SB), R9 @@ -477,6 +476,10 @@ TEXT racecall<>(SB), NOSPLIT|NOFRAME, $0-0 MOVD (g_sched+gobuf_sp)(R11), R12 MOVD R12, RSP call: + // Decrement SP past where the frame pointer is saved in the Go arm64 + // ABI (one word below the stack pointer) so the race detector library + // code doesn't clobber it + SUB $16, RSP BL R9 MOVD R19, RSP JMP (R20) diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go index 61dad50721..af7d29f9de 100644 --- a/src/runtime/signal_arm64.go +++ b/src/runtime/signal_arm64.go @@ -8,6 +8,7 @@ package runtime import ( "internal/abi" + "internal/goarch" "internal/runtime/sys" "unsafe" ) @@ -62,11 +63,18 @@ func (c *sigctxt) preparePanic(sig uint32, gp *g) { // We arrange lr, and pc to pretend the panicking // function calls sigpanic directly. // Always save LR to stack so that panics in leaf - // functions are correctly handled. - // This extra space is known to gentraceback. + // functions are correctly handled. This smashes + // the stack frame but we're not going back there + // anyway. sp := c.sp() - sys.StackAlign // needs only sizeof uint64, but must align the stack c.set_sp(sp) *(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr() + // Make sure a valid frame pointer is saved on the stack so that the + // frame pointer checks in adjustframe are happy, if they're enabled. + // Frame pointer unwinding won't visit the sigpanic frame, since + // sigpanic will save the same frame pointer before calling into a panic + // function. 
+ *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29() pc := gp.sigpc @@ -88,6 +96,10 @@ func (c *sigctxt) pushCall(targetPC, resumePC uintptr) { sp := c.sp() - 16 // SP needs 16-byte alignment c.set_sp(sp) *(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr() + // Make sure a valid frame pointer is saved on the stack so that the + // frame pointer checks in adjustframe are happy, if they're enabled. + // This is not actually used for unwinding. + *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29() // Set up PC and LR to pretend the function being signaled // calls targetPC at resumePC. c.set_lr(uint64(resumePC)) diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 5eaceec6da..55e97e77af 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -579,27 +579,23 @@ var ptrnames = []string{ // | args to callee | // +------------------+ <- frame->sp // -// (arm64) +// (arm) // +------------------+ // | args from caller | // +------------------+ <- frame->argp -// | | -// +------------------+ <- frame->fp (aka caller's sp) -// | return address | +// | caller's retaddr | // +------------------+ -// | caller's FP | (frame pointer always enabled: TODO) +// | caller's FP (*) | (*) on ARM64, if framepointer_enabled && varp > sp // +------------------+ <- frame->varp // | locals | // +------------------+ // | args to callee | // +------------------+ -// | | +// | return address | // +------------------+ <- frame->sp // // varp > sp means that the function has a frame; // varp == sp means frameless function. -// -// Alignment padding, if needed, will be between "locals" and "args to callee". type adjustinfo struct { old stack @@ -713,8 +709,7 @@ func adjustframe(frame *stkframe, adjinfo *adjustinfo) { } // Adjust saved frame pointer if there is one. 
- if goarch.ArchFamily == goarch.AMD64 && frame.argp-frame.varp == 2*goarch.PtrSize || - goarch.ArchFamily == goarch.ARM64 && frame.argp-frame.varp == 3*goarch.PtrSize { + if (goarch.ArchFamily == goarch.AMD64 || goarch.ArchFamily == goarch.ARM64) && frame.argp-frame.varp == 2*goarch.PtrSize { if stackDebug >= 3 { print(" saved bp\n") } @@ -728,7 +723,10 @@ func adjustframe(frame *stkframe, adjinfo *adjustinfo) { throw("bad frame pointer") } } - // This is the caller's frame pointer saved in the current frame. + // On AMD64, this is the caller's frame pointer saved in the current + // frame. + // On ARM64, this is the frame pointer of the caller's caller saved + // by the caller in its frame (one word below its SP). adjustpointer(adjinfo, unsafe.Pointer(frame.varp)) } diff --git a/src/runtime/testdata/testprog/badtraceback.go b/src/runtime/testdata/testprog/badtraceback.go index 36575f765d..455118a543 100644 --- a/src/runtime/testdata/testprog/badtraceback.go +++ b/src/runtime/testdata/testprog/badtraceback.go @@ -41,11 +41,6 @@ func badLR2(arg int) { if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" { lrOff = 32 // FIXED_FRAME or sys.MinFrameSize } - if runtime.GOARCH == "arm64" { - // skip 8 bytes at bottom of parent frame, then point - // to the 8 bytes of the saved PC at the top of the frame. - lrOff = 16 - } lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff)) *lrPtr = 0xbad diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 1c3e679a02..8882c306ed 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -175,11 +175,6 @@ func (u *unwinder) initAt(pc0, sp0, lr0 uintptr, gp *g, flags unwindFlags) { // Start in the caller's frame. if frame.pc == 0 { if usesLR { - // TODO: this isn't right on arm64. But also, this should - // ~never happen. Calling a nil function will panic - // when loading the PC out of the closure, not when - // branching to that PC. 
(Closures should always have - // valid PCs in their first word.) frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp)) frame.lr = 0 } else { @@ -374,11 +369,7 @@ func (u *unwinder) resolveInternal(innermost, isSyscall bool) { var lrPtr uintptr if usesLR { if innermost && frame.sp < frame.fp || frame.lr == 0 { - if GOARCH == "arm64" { - lrPtr = frame.fp - goarch.PtrSize - } else { - lrPtr = frame.sp - } + lrPtr = frame.sp frame.lr = *(*uintptr)(unsafe.Pointer(lrPtr)) } } else { @@ -394,17 +385,24 @@ func (u *unwinder) resolveInternal(innermost, isSyscall bool) { // On x86, call instruction pushes return PC before entering new function. frame.varp -= goarch.PtrSize } - if GOARCH == "arm64" && frame.varp > frame.sp { - frame.varp -= goarch.PtrSize // LR have been saved, skip over it. - } // For architectures with frame pointers, if there's // a frame, then there's a saved frame pointer here. // // NOTE: This code is not as general as it looks. - // On x86 and arm64, the ABI is to save the frame pointer word at the + // On x86, the ABI is to save the frame pointer word at the // top of the stack frame, so we have to back down over it. - // No other architectures are framepointer-enabled at the moment. + // On arm64, the frame pointer should be at the bottom of + // the stack (with R29 (aka FP) = RSP), in which case we would + // not want to do the subtraction here. But we started out without + // any frame pointer, and when we wanted to add it, we didn't + // want to break all the assembly doing direct writes to 8(RSP) + // to set the first parameter to a called function. + // So we decided to write the FP link *below* the stack pointer + // (with R29 = RSP - 8 in Go functions). + // This is technically ABI-compatible but not standard. + // And it happens to end up mimicking the x86 layout. + // Other architectures may make different decisions. 
if frame.varp > frame.sp && framepointer_enabled { frame.varp -= goarch.PtrSize } @@ -564,7 +562,7 @@ func (u *unwinder) finishInternal() { gp := u.g.ptr() if u.flags&(unwindPrintErrors|unwindSilentErrors) == 0 && u.frame.sp != gp.stktopsp { print("runtime: g", gp.goid, ": frame.sp=", hex(u.frame.sp), " top=", hex(gp.stktopsp), "\n") - print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "]\n") + print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "\n") throw("traceback did not unwind completely") } } diff --git a/test/nosplit.go b/test/nosplit.go index 1f943fa18c..4b4c93b1d0 100644 --- a/test/nosplit.go +++ b/test/nosplit.go @@ -142,7 +142,7 @@ start 136 # (CallSize is 32 on ppc64, 8 on amd64 for frame pointer.) start 96 nosplit start 100 nosplit; REJECT ppc64 ppc64le -start 104 nosplit; REJECT ppc64 ppc64le +start 104 nosplit; REJECT ppc64 ppc64le arm64 start 108 nosplit; REJECT ppc64 ppc64le start 112 nosplit; REJECT ppc64 ppc64le arm64 start 116 nosplit; REJECT ppc64 ppc64le @@ -160,7 +160,7 @@ start 136 nosplit; REJECT # Because AMD64 uses frame pointer, it has 8 fewer bytes. start 96 nosplit call f; f 0 nosplit start 100 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le -start 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le +start 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le arm64 start 108 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le start 112 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 arm64 start 116 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 @@ -176,7 +176,7 @@ start 136 nosplit call f; f 0 nosplit; REJECT # Architectures differ in the same way as before. 
start 96 nosplit call f; f 0 call f start 100 nosplit call f; f 0 call f; REJECT ppc64 ppc64le -start 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 +start 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64 start 108 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 start 112 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64 start 116 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 @@ -189,7 +189,7 @@ start 136 nosplit call f; f 0 call f; REJECT # Indirect calls are assumed to be splitting functions. start 96 nosplit callind start 100 nosplit callind; REJECT ppc64 ppc64le -start 104 nosplit callind; REJECT ppc64 ppc64le amd64 +start 104 nosplit callind; REJECT ppc64 ppc64le amd64 arm64 start 108 nosplit callind; REJECT ppc64 ppc64le amd64 start 112 nosplit callind; REJECT ppc64 ppc64le amd64 arm64 start 116 nosplit callind; REJECT ppc64 ppc64le amd64 -- cgit v1.3-6-g1900 From e8a53538b473f1a7a92602675eda2d34f3887611 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 9 Oct 2025 20:58:34 +0000 Subject: runtime: fail TestGoroutineLeakProfile on data race Some of the programs in testdata/testgoroutineleakprofile have data races because they were taken from a corpus that showcases general Go concurrency bugs, not just leaked goroutines. This causes some flakiness as tests might fail due to, for example, a concurrent map access, even outside of race mode. Let's just call data races a failure and fix them in the examples. As far as I can tell, there are only two that show up consistently. Fixes #75732. 
Change-Id: I160b3a1cdce4c2de3f2320b68b4083292e02b557 Reviewed-on: https://go-review.googlesource.com/c/go/+/710756 LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee Auto-Submit: Michael Knyszek Reviewed-by: Cherry Mui --- src/runtime/goroutineleakprofile_test.go | 10 ++++---- .../testgoroutineleakprofile/goker/README.md | 30 ++++++++++++---------- .../goker/cockroach1055.go | 4 +-- .../testgoroutineleakprofile/goker/moby27782.go | 7 +++++ 4 files changed, 31 insertions(+), 20 deletions(-) (limited to 'src/runtime/testdata') diff --git a/src/runtime/goroutineleakprofile_test.go b/src/runtime/goroutineleakprofile_test.go index 6e26bcab13..f5d2dd6372 100644 --- a/src/runtime/goroutineleakprofile_test.go +++ b/src/runtime/goroutineleakprofile_test.go @@ -487,7 +487,7 @@ func TestGoroutineLeakProfile(t *testing.T) { testCases = append(testCases, patternTestCases...) // Test cases must not panic or cause fatal exceptions. - failStates := regexp.MustCompile(`fatal|panic`) + failStates := regexp.MustCompile(`fatal|panic|DATA RACE`) testApp := func(exepath string, testCases []testCase) { @@ -520,9 +520,9 @@ func TestGoroutineLeakProfile(t *testing.T) { t.Errorf("Test %s produced no output. Is the goroutine leak profile collected?", tcase.name) } - // Zero tolerance policy for fatal exceptions or panics. + // Zero tolerance policy for fatal exceptions, panics, or data races. if failStates.MatchString(runOutput) { - t.Errorf("unexpected fatal exception or panic!\noutput:\n%s\n\n", runOutput) + t.Errorf("unexpected fatal exception or panic\noutput:\n%s\n\n", runOutput) } output += runOutput + "\n\n" @@ -540,7 +540,7 @@ func TestGoroutineLeakProfile(t *testing.T) { unexpectedLeaks := make([]string, 0, len(foundLeaks)) // Parse every leak and check if it is expected (maybe as a flaky leak). - LEAKS: + leaks: for _, leak := range foundLeaks { // Check if the leak is expected. // If it is, check whether it has been encountered before. 
@@ -569,7 +569,7 @@ func TestGoroutineLeakProfile(t *testing.T) { for flakyLeak := range tcase.flakyLeaks { if flakyLeak.MatchString(leak) { // The leak is flaky. Carry on to the next line. - continue LEAKS + continue leaks } } diff --git a/src/runtime/testdata/testgoroutineleakprofile/goker/README.md b/src/runtime/testdata/testgoroutineleakprofile/goker/README.md index 88c50e1e48..e6f8fe23f2 100644 --- a/src/runtime/testdata/testgoroutineleakprofile/goker/README.md +++ b/src/runtime/testdata/testgoroutineleakprofile/goker/README.md @@ -24,18 +24,22 @@ Jingling Xue (jingling@cse.unsw.edu.au): White paper: https://lujie.ac.cn/files/papers/GoBench.pdf -The examples have been modified in order to run the goroutine leak -profiler. Buggy snippets are moved from within a unit test to separate -applications. Each is then independently executed, possibly as multiple -copies within the same application in order to exercise more interleavings. -Concurrently, the main program sets up a waiting period (typically 1ms), followed -by a goroutine leak profile request. Other modifications may involve injecting calls -to `runtime.Gosched()`, to more reliably exercise buggy interleavings, or reductions -in waiting periods when calling `time.Sleep`, in order to reduce overall testing time. - -The resulting goroutine leak profile is analyzed to ensure that no unexpected leaks occurred, -and that the expected leaks did occur. If the leak is flaky, the only purpose of the expected -leak list is to protect against unexpected leaks. +The examples have been modified in order to run the goroutine leak profiler. +Buggy snippets are moved from within a unit test to separate applications. +Each is then independently executed, possibly as multiple copies within the +same application in order to exercise more interleavings. Concurrently, the +main program sets up a waiting period (typically 1ms), followed by a goroutine +leak profile request. 
Other modifications may involve injecting calls to +`runtime.Gosched()`, to more reliably exercise buggy interleavings, or reductions +in waiting periods when calling `time.Sleep`, in order to reduce overall testing +time. + +The resulting goroutine leak profile is analyzed to ensure that no unexpecte +leaks occurred, and that the expected leaks did occur. If the leak is flaky, the +only purpose of the expected leak list is to protect against unexpected leaks. + +The examples have also been modified to remove data races, since those create flaky +test failures, when really all we care about are leaked goroutines. The entries below document each of the corresponding leaks. @@ -1844,4 +1848,4 @@ c.inbox <- <================> [<-c.inbox] . close(c.closed) . <-c.dispatcherLoopStopped ---------------------G1,G2 leak------------------------------- -``` \ No newline at end of file +``` diff --git a/src/runtime/testdata/testgoroutineleakprofile/goker/cockroach1055.go b/src/runtime/testdata/testgoroutineleakprofile/goker/cockroach1055.go index 687baed25a..87cf157996 100644 --- a/src/runtime/testdata/testgoroutineleakprofile/goker/cockroach1055.go +++ b/src/runtime/testdata/testgoroutineleakprofile/goker/cockroach1055.go @@ -44,9 +44,9 @@ func (s *Stopper_cockroach1055) SetStopped() { func (s *Stopper_cockroach1055) Quiesce() { s.mu.Lock() defer s.mu.Unlock() - s.draining = 1 + atomic.StoreInt32(&s.draining, 1) s.drain.Wait() - s.draining = 0 + atomic.StoreInt32(&s.draining, 0) } func (s *Stopper_cockroach1055) Stop() { diff --git a/src/runtime/testdata/testgoroutineleakprofile/goker/moby27782.go b/src/runtime/testdata/testgoroutineleakprofile/goker/moby27782.go index 7b3398fd38..9b53d9035c 100644 --- a/src/runtime/testdata/testgoroutineleakprofile/goker/moby27782.go +++ b/src/runtime/testdata/testgoroutineleakprofile/goker/moby27782.go @@ -208,6 +208,7 @@ func (container *Container_moby27782) Reset() { } type JSONFileLogger_moby27782 struct { + mu sync.Mutex readers 
map[*LogWatcher_moby27782]struct{} } @@ -218,11 +219,17 @@ func (l *JSONFileLogger_moby27782) ReadLogs() *LogWatcher_moby27782 { } func (l *JSONFileLogger_moby27782) readLogs(logWatcher *LogWatcher_moby27782) { + l.mu.Lock() + defer l.mu.Unlock() + l.readers[logWatcher] = struct{}{} followLogs_moby27782(logWatcher) } func (l *JSONFileLogger_moby27782) Close() { + l.mu.Lock() + defer l.mu.Unlock() + for r := range l.readers { r.Close() delete(l.readers, r) -- cgit v1.3-6-g1900 From 5cd1b73772e339e3b460d53ba37630704a323ca7 Mon Sep 17 00:00:00 2001 From: qiulaidongfeng <2645477756@qq.com> Date: Tue, 17 Sep 2024 20:10:20 +0800 Subject: runtime: correctly print panics before fatal-ing on defer Fixes #67792 Change-Id: I93d16580cb31e54cee7c8490212404e4d0dec446 Reviewed-on: https://go-review.googlesource.com/c/go/+/613757 Reviewed-by: Keith Randall Reviewed-by: Keith Randall Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI Auto-Submit: Keith Randall --- src/runtime/panic.go | 23 +++++++++++++++++++++++ src/runtime/panic_test.go | 2 ++ src/runtime/testdata/testprog/panicprint.go | 21 +++++++++++++++++++++ 3 files changed, 46 insertions(+) (limited to 'src/runtime/testdata') diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 325bf74aa8..3c967a2999 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -1235,10 +1235,12 @@ func throw(s string) { // //go:nosplit func fatal(s string) { + p := getg()._panic // Everything fatal does should be recursively nosplit so it // can be called even when it's unsafe to grow the stack. printlock() // Prevent multiple interleaved fatal reports. See issue 69447. systemstack(func() { + printPreFatalDeferPanic(p) print("fatal error: ") printindented(s) // logically printpanicval(s), but avoids convTstring write barrier print("\n") @@ -1248,6 +1250,27 @@ func fatal(s string) { printunlock() } +// printPreFatalDeferPanic prints the panic +// when fatal occurs in panics while running defer. 
+func printPreFatalDeferPanic(p *_panic) { + // Don`t call preprintpanics, because + // don't want to call String/Error on the panicked values. + // When we fatal we really want to just print and exit, + // no more executing user Go code. + for x := p; x != nil; x = x.link { + if x.link != nil && *efaceOf(&x.link.arg) == *efaceOf(&x.arg) { + // This panic contains the same value as the next one in the chain. + // Mark it as repanicked. We will skip printing it twice in a row. + x.link.repanicked = true + } + } + if p != nil { + printpanics(p) + // make fatal have the same indentation as non-first panics. + print("\t") + } +} + // runningPanicDefers is non-zero while running deferred functions for panic. // This is used to try hard to get a panic stack trace out when exiting. var runningPanicDefers atomic.Uint32 diff --git a/src/runtime/panic_test.go b/src/runtime/panic_test.go index 4a30c30db6..2b06bce45d 100644 --- a/src/runtime/panic_test.go +++ b/src/runtime/panic_test.go @@ -34,6 +34,8 @@ func TestPanicWithDirectlyPrintableCustomTypes(t *testing.T) { {"panicCustomUint32", `panic: main.MyUint32(93)`}, {"panicCustomUint64", `panic: main.MyUint64(93)`}, {"panicCustomUintptr", `panic: main.MyUintptr(93)`}, + {"panicDeferFatal", "panic: runtime.errorString(\"invalid memory address or nil pointer dereference\")\n\tfatal error: sync: unlock of unlocked mutex"}, + {"panicDoublieDeferFatal", "panic: runtime.errorString(\"invalid memory address or nil pointer dereference\") [recovered, repanicked]\n\tfatal error: sync: unlock of unlocked mutex"}, } for _, tt := range tests { diff --git a/src/runtime/testdata/testprog/panicprint.go b/src/runtime/testdata/testprog/panicprint.go index 4ce958ba3d..eaf3ba2c33 100644 --- a/src/runtime/testdata/testprog/panicprint.go +++ b/src/runtime/testdata/testprog/panicprint.go @@ -4,6 +4,8 @@ package main +import "sync" + type MyBool bool type MyComplex128 complex128 type MyComplex64 complex64 @@ -90,6 +92,23 @@ func panicCustomFloat32() 
{ panic(MyFloat32(-93.70)) } +func panicDeferFatal() { + var mu sync.Mutex + defer mu.Unlock() + var i *int + *i = 0 +} + +func panicDoublieDeferFatal() { + var mu sync.Mutex + defer mu.Unlock() + defer func() { + panic(recover()) + }() + var i *int + *i = 0 +} + func init() { register("panicCustomComplex64", panicCustomComplex64) register("panicCustomComplex128", panicCustomComplex128) @@ -108,4 +127,6 @@ func init() { register("panicCustomUint32", panicCustomUint32) register("panicCustomUint64", panicCustomUint64) register("panicCustomUintptr", panicCustomUintptr) + register("panicDeferFatal", panicDeferFatal) + register("panicDoublieDeferFatal", panicDoublieDeferFatal) } -- cgit v1.3-6-g1900 From 4ebf295b0b1740caac6302cc824ebd0f6175c1d5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 24 Oct 2025 15:14:59 -0400 Subject: runtime: prefer to restart Ps on the same M after STW Today, Ps jump around arbitrarily across STW. Instead, try to keep the P on the previous M it ran on. In the future, we'll likely want to try to expand this beyond STW to create a more general affinity for specific Ms. For this to be useful, the Ps need to have runnable Gs. Today, STW preemption goes through goschedImpl, which places the G on the global run queue. If that was the only G then the P won't have runnable goroutines anymore. It makes more sense to keep the G with its P across STW anyway, so add a special case to goschedImpl for that. On my machine, this CL reduces the error rate in TestTraceSTW from 99.8% to 1.9%. As a nearly 2% error rate shows, there are still cases where this best effort scheduling doesn't work. The most obvious is that while procresize assigns Ps back to their original M, startTheWorldWithSema calls wakep to start a spinning M. The spinning M may steal a goroutine from another P if that P is too slow to start. For #65694. 
Change-Id: I6a6a636c0969c587d039b68bc68ea16c74ff1fc9 Reviewed-on: https://go-review.googlesource.com/c/go/+/714801 Reviewed-by: Michael Knyszek Auto-Submit: Michael Pratt LUCI-TryBot-Result: Go LUCI --- src/internal/trace/testtrace/helpers.go | 3 +- src/runtime/proc.go | 83 ++++++- src/runtime/proc_test.go | 366 +++++++++++++++++++++++++++++ src/runtime/runtime2.go | 45 ++++ src/runtime/testdata/testprog/stw_mexit.go | 69 ++++++ src/runtime/testdata/testprog/stw_trace.go | 99 ++++++++ 6 files changed, 656 insertions(+), 9 deletions(-) create mode 100644 src/runtime/testdata/testprog/stw_mexit.go create mode 100644 src/runtime/testdata/testprog/stw_trace.go (limited to 'src/runtime/testdata') diff --git a/src/internal/trace/testtrace/helpers.go b/src/internal/trace/testtrace/helpers.go index c4c404a304..8a64d5c2ee 100644 --- a/src/internal/trace/testtrace/helpers.go +++ b/src/internal/trace/testtrace/helpers.go @@ -16,8 +16,7 @@ import ( "testing" ) -// MustHaveSyscallEvents skips the current test if the current -// platform does not support true system call events. +// Dump saves the trace to a file or the test log. func Dump(t *testing.T, testName string, traceBytes []byte, forceToFile bool) { onBuilder := testenv.Builder() != "" onOldBuilder := !strings.Contains(testenv.Builder(), "gotip") && !strings.Contains(testenv.Builder(), "go1") diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 30d2a68626..44b64913a5 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1009,6 +1009,8 @@ func mcommoninit(mp *m, id int64) { mp.id = mReserveID() } + mp.self = newMWeakPointer(mp) + mrandinit(mp) mpreinit(mp) @@ -2018,6 +2020,10 @@ func mexit(osStack bool) { // Free vgetrandom state. vgetrandomDestroy(mp) + // Clear the self pointer so Ps don't access this M after it is freed, + // or keep it alive. + mp.self.clear() + // Remove m from allm. 
lock(&sched.lock) for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink { @@ -4259,6 +4265,7 @@ func park_m(gp *g) { } func goschedImpl(gp *g, preempted bool) { + pp := gp.m.p.ptr() trace := traceAcquire() status := readgstatus(gp) if status&^_Gscan != _Grunning { @@ -4281,9 +4288,15 @@ func goschedImpl(gp *g, preempted bool) { } dropg() - lock(&sched.lock) - globrunqput(gp) - unlock(&sched.lock) + if preempted && sched.gcwaiting.Load() { + // If preempted for STW, keep the G on the local P in runnext + // so it can keep running immediately after the STW. + runqput(pp, gp, true) + } else { + lock(&sched.lock) + globrunqput(gp) + unlock(&sched.lock) + } if mainStarted { wakep() @@ -6013,6 +6026,7 @@ func procresize(nprocs int32) *p { } var runnablePs *p + var runnablePsNeedM *p for i := nprocs - 1; i >= 0; i-- { pp := allp[i] if gp.m.p.ptr() == pp { @@ -6021,12 +6035,41 @@ func procresize(nprocs int32) *p { pp.status = _Pidle if runqempty(pp) { pidleput(pp, now) - } else { - pp.m.set(mget()) - pp.link.set(runnablePs) - runnablePs = pp + continue } + + // Prefer to run on the most recent M if it is + // available. + // + // Ps with no oldm (or for which oldm is already taken + // by an earlier P), we delay until all oldm Ps are + // handled. Otherwise, mget may return an M that a + // later P has in oldm. + var mp *m + if oldm := pp.oldm.get(); oldm != nil { + // Returns nil if oldm is not idle. + mp = mgetSpecific(oldm) + } + if mp == nil { + // Call mget later. 
+ pp.link.set(runnablePsNeedM) + runnablePsNeedM = pp + continue + } + pp.m.set(mp) + pp.link.set(runnablePs) + runnablePs = pp } + for runnablePsNeedM != nil { + pp := runnablePsNeedM + runnablePsNeedM = pp.link.ptr() + + mp := mget() + pp.m.set(mp) + pp.link.set(runnablePs) + runnablePs = pp + } + stealOrder.reset(uint32(nprocs)) var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs)) @@ -6064,6 +6107,11 @@ func acquirepNoTrace(pp *p) { // Have p; write barriers now allowed. + // The M we're associating with will be the old M after the next + // releasep. We must set this here because write barriers are not + // allowed in releasep. + pp.oldm = pp.m.ptr().self + // Perform deferred mcache flush before this P can allocate // from a potentially stale mcache. pp.mcache.prepareForSweep() @@ -6998,6 +7046,27 @@ func mget() *m { return mp } +// Try to get a specific m from midle list. Returns nil if it isn't on the +// midle list. +// +// sched.lock must be held. +// May run during STW, so write barriers are not allowed. +// +//go:nowritebarrierrec +func mgetSpecific(mp *m) *m { + assertLockHeld(&sched.lock) + + if mp.idleNode.prev == 0 && mp.idleNode.next == 0 { + // Not on the list. + return nil + } + + sched.midle.remove(unsafe.Pointer(mp)) + sched.nmidle-- + + return mp +} + // Put gp on the global runnable queue. // sched.lock must be held. // May run during STW, so write barriers are not allowed. 
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index d10d4a1fc9..b3084f4895 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -5,13 +5,18 @@ package runtime_test import ( + "bytes" "fmt" "internal/race" "internal/testenv" + "internal/trace" + "internal/trace/testtrace" + "io" "math" "net" "runtime" "runtime/debug" + "slices" "strings" "sync" "sync/atomic" @@ -1168,3 +1173,364 @@ func TestBigGOMAXPROCS(t *testing.T) { t.Errorf("output:\n%s\nwanted:\nunknown function: NonexistentTest", output) } } + +type goroutineState struct { + G trace.GoID // This goroutine. + P trace.ProcID // Most recent P this goroutine ran on. + M trace.ThreadID // Most recent M this goroutine ran on. +} + +func newGoroutineState(g trace.GoID) *goroutineState { + return &goroutineState{ + G: g, + P: trace.NoProc, + M: trace.NoThread, + } +} + +// TestTraceSTW verifies that goroutines continue running on the same M and P +// after a STW. +func TestTraceSTW(t *testing.T) { + // Across STW, the runtime attempts to keep goroutines running on the + // same P and the P running on the same M. It does this by keeping + // goroutines in the P's local runq, and remembering which M the P ran + // on before STW and preferring that M when restarting. + // + // This test verifies that affinity by analyzing a trace of testprog + // TraceSTW. + // + // The affinity across STW is best-effort, so we have to allow some + // failure rate, thus we test many times and ensure the error rate is + // low. + // + // The expected affinity can fail for a variety of reasons. The most + // obvious is that while procresize assigns Ps back to their original + // M, startTheWorldWithSema calls wakep to start a spinning M. The + // spinning M may steal a goroutine from another P if that P is too + // slow to start.
+ + if testing.Short() { + t.Skip("skipping in -short mode") + } + + if runtime.NumCPU() < 4 { + t.Skip("This test sets GOMAXPROCS=4 and wants to avoid thread descheduling as much as possible. Skip on machines with less than 4 CPUs") + } + + const runs = 50 + + var errors int + for i := range runs { + err := runTestTracesSTW(t, i) + if err != nil { + t.Logf("Run %d failed: %v", i, err) + errors++ + } + } + + pct := float64(errors)/float64(runs) + t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct) + if pct > 0.25 { + t.Errorf("Error rate too high") + } +} + +func runTestTracesSTW(t *testing.T, run int) (err error) { + t.Logf("Run %d", run) + + // By default, TSAN sleeps for 1s at exit to allow background + // goroutines to race. This slows down execution for this test far too + // much, since we are running 50 iterations, so disable the sleep. + // + // Outside of race mode, GORACE does nothing. + buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0")) + + // We locally "fail" the run (return an error) if the trace exhibits + // unwanted scheduling. i.e., the target goroutines did not remain on + // the same P/M. + // + // We fail the entire test (t.Fatal) for other cases that should never + // occur, such as a trace parse error. + defer func() { + if err != nil || t.Failed() { + testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false) + } + }() + + br, err := trace.NewReader(bytes.NewReader(buf)) + if err != nil { + t.Fatalf("NewReader got err %v want nil", err) + } + + var targetGoroutines []*goroutineState + findGoroutine := func(goid trace.GoID) *goroutineState { + for _, gs := range targetGoroutines { + if gs.G == goid { + return gs + } + } + return nil + } + findProc := func(pid trace.ProcID) *goroutineState { + for _, gs := range targetGoroutines { + if gs.P == pid { + return gs + } + } + return nil + } + + // 1. Find the goroutine IDs for the target goroutines. 
This will be in + // the StateTransition from NotExist. + // + // 2. Once found, track which M and P the target goroutines run on until... + // + // 3. Look for the "TraceSTW" "start" log message, where we commit the + // target goroutines' "before" M and P. + // + // N.B. We must do (1) and (2) together because the first target + // goroutine may start running before the second is created. +findStart: + for { + ev, err := br.ReadEvent() + if err == io.EOF { + // Reached the end of the trace without finding case (3). + t.Fatalf("Trace missing start log message") + } + if err != nil { + t.Fatalf("ReadEvent got err %v want nil", err) + } + t.Logf("Event: %s", ev.String()) + + switch ev.Kind() { + case trace.EventStateTransition: + st := ev.StateTransition() + if st.Resource.Kind != trace.ResourceGoroutine { + continue + } + + goid := st.Resource.Goroutine() + from, to := st.Goroutine() + + // Potentially case (1): Goroutine creation. + if from == trace.GoNotExist { + for sf := range st.Stack.Frames() { + if sf.Func == "main.traceSTWTarget" { + targetGoroutines = append(targetGoroutines, newGoroutineState(goid)) + t.Logf("Identified target goroutine id %d", goid) + } + + // Always break, the goroutine entrypoint is always the + // first frame. + break + } + } + + // Potentially case (2): Goroutine running. + if to == trace.GoRunning { + gs := findGoroutine(goid) + if gs == nil { + continue + } + gs.P = ev.Proc() + gs.M = ev.Thread() + t.Logf("G %d running on P %d M %d", gs.G, gs.P, gs.M) + } + case trace.EventLog: + // Potentially case (3): Start log event. + log := ev.Log() + if log.Category != "TraceSTW" { + continue + } + if log.Message != "start" { + t.Fatalf("Log message got %s want start", log.Message) + } + + // Found start point, move on to next stage. 
+ t.Logf("Found start message") + break findStart + } + } + + t.Log("Target goroutines:") + for _, gs := range targetGoroutines { + t.Logf("%+v", gs) + } + + if len(targetGoroutines) != 2 { + t.Fatalf("len(targetGoroutines) got %d want 2", len(targetGoroutines)) + } + + for _, gs := range targetGoroutines { + if gs.P == trace.NoProc { + t.Fatalf("Goroutine %+v not running on a P", gs) + } + if gs.M == trace.NoThread { + t.Fatalf("Goroutine %+v not running on an M", gs) + } + } + + // The test continues until we see the "end" log message. + // + // What we want to observe is that the target goroutines run only on + // the original P and M. + // + // They will be stopped by STW [1], but should resume on the original P + // and M. + // + // However, this is best effort. For example, startTheWorld wakep's a + // spinning M. If the original M is slow to restart (e.g., due to poor + // kernel scheduling), the spinning M may legally steal the goroutine + // and run it instead. + // + // In practice, we see this occur frequently on builders, likely + // because they are overcommitted on CPU. Thus, we instead check + // slightly more constrained properties: + // - The original P must run on the original M (if it runs at all). + // - The original P must run the original G before anything else, + // unless that G has already run elsewhere. + // + // This allows a spinning M to steal the G from a slow-to-start M, but + // does not allow the original P to just flat out run something + // completely different from expected. + // + // Note this is still somewhat racy: the spinning M may steal the + // target G, but before it marks the target G as running, the original + // P runs an alternative G. This test will fail that case, even though + // it is legitimate. We allow that failure because such a race should + // be very rare, particularly because the test process usually has no + // other runnable goroutines. 
+ // + // [1] This is slightly fragile because there is a small window between + // the "start" log and actual STW during which the target goroutines + // could legitimately migrate. + var stwSeen bool + var pRunning []trace.ProcID + var gRunning []trace.GoID +findEnd: + for { + ev, err := br.ReadEvent() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("ReadEvent got err %v want nil", err) + } + t.Logf("Event: %s", ev.String()) + + switch ev.Kind() { + case trace.EventStateTransition: + st := ev.StateTransition() + switch st.Resource.Kind { + case trace.ResourceProc: + p := st.Resource.Proc() + _, to := st.Proc() + + // Proc running. Ensure it didn't migrate. + if to == trace.ProcRunning { + gs := findProc(p) + if gs == nil { + continue + } + + if slices.Contains(pRunning, p) { + // Only check the first + // transition to running. + // Afterwards it is free to + // migrate anywhere. + continue + } + pRunning = append(pRunning, p) + + m := ev.Thread() + if m != gs.M { + t.Logf("Proc %d running on M %d want M %d", p, m, gs.M) + return fmt.Errorf("P did not remain on M") + } + } + case trace.ResourceGoroutine: + goid := st.Resource.Goroutine() + _, to := st.Goroutine() + + // Goroutine running. Ensure it didn't migrate. + if to == trace.GoRunning { + p := ev.Proc() + m := ev.Thread() + + gs := findGoroutine(goid) + if gs == nil { + // This isn't a target + // goroutine. Is it a target P? + // That shouldn't run anything + // other than the target G. + gs = findProc(p) + if gs == nil { + continue + } + + if slices.Contains(gRunning, gs.G) { + // This P's target G ran elsewhere. This probably + // means that this P was slow to start, so + // another P stole it. That isn't ideal, but + // we'll allow it. 
+ continue + } + + t.Logf("Goroutine %d running on P %d M %d want this P to run G %d", goid, p, m, gs.G) + return fmt.Errorf("P ran incorrect goroutine") + } + + if !slices.Contains(gRunning, goid) { + gRunning = append(gRunning, goid) + } + + if p != gs.P || m != gs.M { + t.Logf("Goroutine %d running on P %d M %d want P %d M %d", goid, p, m, gs.P, gs.M) + // We don't want this to occur, + // but allow it for cases of + // bad kernel scheduling. See + // "The test continues" comment + // above. + } + } + } + case trace.EventLog: + // Potentially end log event. + log := ev.Log() + if log.Category != "TraceSTW" { + continue + } + if log.Message != "end" { + t.Fatalf("Log message got %s want end", log.Message) + } + + // Found end point. + t.Logf("Found end message") + break findEnd + case trace.EventRangeBegin: + r := ev.Range() + if r.Name == "stop-the-world (read mem stats)" { + // Note when we see the STW begin. This is not + // load bearing; its purpose is simply to fail + // the test if we manage to remove the STW from + // ReadMemStat, so we remember to change this + // test to add some new source of STW. + stwSeen = true + } + } + } + + if !stwSeen { + t.Fatal("No STW in the test trace") + } + + return nil +} + +func TestMexitSTW(t *testing.T) { + got := runTestProg(t, "testprog", "mexitSTW") + want := "OK\n" + if got != want { + t.Fatalf("expected %q, but got:\n%s", want, got) + } +} diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 85a9693ace..6c955460d4 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -716,6 +716,9 @@ type m struct { // Up to 10 locks held by this m, maintained by the lock ranking code. locksHeldLen int locksHeld [10]heldLockInfo + + // self points to this M until mexit clears it to return nil.
+ self mWeakPointer } const mRedZoneSize = (16 << 3) * asanenabledBit // redZoneSize(2048) @@ -730,6 +733,37 @@ type mPadded struct { _ [(1 - goarch.IsWasm) * (2048 - mallocHeaderSize - mRedZoneSize - unsafe.Sizeof(m{}))]byte } +// mWeakPointer is a "weak" pointer to an M. A weak pointer for each M is +// available as m.self. Users may copy mWeakPointer arbitrarily, and get will +// return the M if it is still live, or nil after mexit. +// +// The zero value is treated as a nil pointer. +// +// Note that get may race with M exit. A successful get will keep the m object +// alive, but the M itself may be exited and thus not actually usable. +type mWeakPointer struct { + m *atomic.Pointer[m] +} + +func newMWeakPointer(mp *m) mWeakPointer { + w := mWeakPointer{m: new(atomic.Pointer[m])} + w.m.Store(mp) + return w +} + +func (w mWeakPointer) get() *m { + if w.m == nil { + return nil + } + return w.m.Load() +} + +// clear sets the weak pointer to nil. It cannot be used on zero value +// mWeakPointers. +func (w mWeakPointer) clear() { + w.m.Store(nil) +} + type p struct { id int32 status uint32 // one of pidle/prunning/... @@ -742,6 +776,17 @@ type p struct { pcache pageCache raceprocctx uintptr + // oldm is the previous m this p ran on. + // + // We are not associated with this m, so we have no control over its + // lifecycle. This value is an m.self object which points to the m + // until the m exits. + // + // Note that this m may be idle, running, or exiting. It should only be + // used with mgetSpecific, which will take ownership of the m only if + // it is idle. + oldm mWeakPointer + deferpool []*_defer // pool of available defer structs (see panic.go) deferpoolbuf [32]*_defer diff --git a/src/runtime/testdata/testprog/stw_mexit.go b/src/runtime/testdata/testprog/stw_mexit.go new file mode 100644 index 0000000000..b022ef4777 --- /dev/null +++ b/src/runtime/testdata/testprog/stw_mexit.go @@ -0,0 +1,69 @@ +// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "runtime" +) + +func init() { + register("mexitSTW", mexitSTW) +} + +// Stress test for pp.oldm pointing to an exited M. +// +// If pp.oldm points to an exited M it should be ignored and another M used +// instead. To stress: +// +// 1. Start and exit many threads (thus setting oldm on some P). +// 2. Meanwhile, frequently stop the world. +// +// If procresize incorrectly attempts to assign a P to an exited M, likely +// failure modes are: +// +// 1. Crash in startTheWorldWithSema attempting to access the M, if it is nil. +// +// 2. Memory corruption elsewhere after startTheWorldWithSema writes to the M, +// if it is not nil, but is freed and reused for another allocation. +// +// 3. Hang on a subsequent stop the world waiting for the P to stop, if the M +// object is valid, but the M is exited, because startTheWorldWithSema didn't +// actually wake anything to run the P. The P is _Pidle, but not in the pidle +// list, thus startTheWorldWithSema will wake for it to actively stop. +// +// For this to go wrong, an exited M must fail to clear mp.self and must leave +// the M on the sched.midle list. +// +// Similar to TraceSTW. +func mexitSTW() { + // Ensure we have multiple Ps, but not too many, as we want the + // runnable goroutines likely to run on Ps with oldm set. + runtime.GOMAXPROCS(4) + + // Background busy work so there is always something runnable. + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. + ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + for range 100 { + // Exit a thread. The last P to run this will have it in oldm.
+ go func() { + runtime.LockOSThread() + }() + + // STW + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + } + + stop.Store(true) + + println("OK") +} diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go new file mode 100644 index 0000000000..0fed55b875 --- /dev/null +++ b/src/runtime/testdata/testprog/stw_trace.go @@ -0,0 +1,99 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "context" + "log" + "os" + "runtime" + "runtime/debug" + "runtime/trace" + "sync/atomic" +) + +func init() { + register("TraceSTW", TraceSTW) +} + +// The parent writes to ping and waits for the children to write back +// via pong to show that they are running. +var ping atomic.Uint32 +var pong [2]atomic.Uint32 + +// Tell runners to stop. +var stop atomic.Bool + +func traceSTWTarget(i int) { + for !stop.Load() { + // Async preemption often takes 100ms+ to preempt this loop on + // windows-386. This makes the test flaky, as the traceReadCPU + // timer often fires by the time STW finishes, jumbling the + // goroutine scheduling. As a workaround, ensure we have a + // morestack call for prompt preemption. + ensureMorestack() + + pong[i].Store(ping.Load()) + } +} + +func TraceSTW() { + ctx := context.Background() + + // The idea here is to have 2 target goroutines that are constantly + // running. When the world restarts after STW, we expect these + // goroutines to continue execution on the same M and P. + // + // Set GOMAXPROCS=4 to make room for the 2 target goroutines, 1 parent, + // and 1 slack for potential misscheduling. + // + // Disable the GC because GC STW generally moves goroutines (see + // https://go.dev/issue/65694). Alternatively, we could just ignore the + // trace if the GC runs. 
+ runtime.GOMAXPROCS(4) + debug.SetGCPercent(0) + + if err := trace.Start(os.Stdout); err != nil { + log.Fatalf("failed to start tracing: %v", err) + } + defer trace.Stop() + + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. + ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + trace.Log(ctx, "TraceSTW", "start") + + // STW + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + + // Make sure to run long enough for the children to schedule again + // after STW. + ping.Store(2) + for pong[0].Load() != 2 {} + for pong[1].Load() != 2 {} + + trace.Log(ctx, "TraceSTW", "end") + + stop.Store(true) +} + +// Manually insert a morestack call. Leaf functions can omit morestack, but +// non-leaf functions should include them. + +//go:noinline +func ensureMorestack() { + ensureMorestack1() +} + +//go:noinline +func ensureMorestack1() { +} -- cgit v1.3-6-g1900