diff options
| author | Keith Randall <khr@golang.org> | 2025-05-17 15:05:56 -0700 |
|---|---|---|
| committer | Keith Randall <khr@golang.org> | 2025-10-06 14:11:41 -0700 |
| commit | 719dfcf8a8478d70360bf3c34c0e920be7b32994 (patch) | |
| tree | d58aaf3289de3bb18901e34b336da46b425f8075 /src/runtime | |
| parent | f3312124c2370c2f64a7f9ad29732ec30209647a (diff) | |
| download | go-719dfcf8a8478d70360bf3c34c0e920be7b32994.tar.xz | |
cmd/compile: redo arm64 LR/FP save and restore
Instead of storing LR (the return address) at 0(SP) and the FP
(parent's frame pointer) at -8(SP), store them at framesize-8(SP)
and framesize-16(SP), respectively.
We push data onto and pop data off the stack such that we never access
anything below SP.
The prolog/epilog lengths are unchanged (3 instructions for a typical
prolog, 2 for a typical epilog, not counting the RET).
We use 8 bytes more per frame.
Typical prologue:
STP.W (FP, LR), -16(SP)
MOVD SP, FP
SUB $C, SP
Typical epilogue:
ADD $C, SP
LDP.P 16(SP), (FP, LR)
RET
The previous word where we stored LR, at 0(SP), is now unused.
We could repurpose that slot for storing a local variable.
The new prolog and epilog instructions are recognized by libunwind,
so pc-sampling tools like perf should now be accurate. (TODO: except
maybe after the first RET instruction? Have to look into that.)
Update #73753 (fixes, for arm64)
Update #57302 (Quim thinks this will help on that issue)
Change-Id: I4800036a9a9a08aaaf35d9f99de79a36cf37ebb8
Reviewed-on: https://go-review.googlesource.com/c/go/+/674615
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Diffstat (limited to 'src/runtime')
| -rw-r--r-- | src/runtime/asm_arm64.s | 81 | ||||
| -rw-r--r-- | src/runtime/mkpreempt.go | 20 | ||||
| -rw-r--r-- | src/runtime/panic.go | 8 | ||||
| -rw-r--r-- | src/runtime/preempt_arm64.s | 15 | ||||
| -rw-r--r-- | src/runtime/race_arm64.s | 17 | ||||
| -rw-r--r-- | src/runtime/signal_arm64.go | 16 | ||||
| -rw-r--r-- | src/runtime/stack.go | 20 | ||||
| -rw-r--r-- | src/runtime/testdata/testprog/badtraceback.go | 5 | ||||
| -rw-r--r-- | src/runtime/traceback.go | 30 |
9 files changed, 118 insertions, 94 deletions
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index a0e82ec830..aa49a27a75 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -50,9 +50,7 @@ TEXT _rt0_arm64_lib(SB),NOSPLIT,$184 CBZ R4, nocgo MOVD $_rt0_arm64_lib_go(SB), R0 MOVD $0, R1 - SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL (R4) - ADD $16, RSP B restore nocgo: @@ -371,7 +369,6 @@ switch: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP - MOVD (g_sched+gobuf_bp)(g), R29 MOVD $0, (g_sched+gobuf_sp)(g) MOVD $0, (g_sched+gobuf_bp)(g) RET @@ -381,8 +378,8 @@ noswitch: // Using a tail call here cleans up tracebacks since we won't stop // at an intermediate systemstack. MOVD 0(R26), R3 // code pointer - MOVD.P 16(RSP), R30 // restore LR - SUB $8, RSP, R29 // restore FP + ADD $16, RSP + LDP.P 16(RSP), (R29,R30) // restore FP, LR B (R3) // func switchToCrashStack0(fn func()) @@ -1051,7 +1048,7 @@ again: // Smashes R0. TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0 MOVD $runtime·systemstack_switch(SB), R0 - ADD $8, R0 // get past prologue + ADD $12, R0 // get past prologue MOVD R0, (g_sched+gobuf_pc)(g) MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) @@ -1069,9 +1066,7 @@ TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0 TEXT ·asmcgocall_no_g(SB),NOSPLIT,$0-16 MOVD fn+0(FP), R1 MOVD arg+8(FP), R0 - SUB $16, RSP // skip over saved frame pointer below RSP BL (R1) - ADD $16, RSP // skip over saved frame pointer below RSP RET // func asmcgocall(fn, arg unsafe.Pointer) int32 @@ -1236,9 +1231,9 @@ havem: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -48(R4) + MOVD R5, -8(R4) MOVD (g_sched+gobuf_bp)(g), R5 - MOVD R5, -56(R4) + MOVD R5, -16(R4) // Gather our arguments into registers. MOVD fn+0(FP), R1 MOVD frame+8(FP), R2 @@ -1252,7 +1247,7 @@ havem: CALL (R0) // indirect call to bypass nosplit check. We're on a different stack now. 
// Restore g->sched (== m->curg->sched) from saved values. - MOVD 0(RSP), R5 + MOVD 40(RSP), R5 MOVD R5, (g_sched+gobuf_pc)(g) MOVD RSP, R4 ADD $48, R4, R4 @@ -1490,10 +1485,57 @@ GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below // // This is ABIInternal because Go code injects its PC directly into new // goroutine stacks. +// +// State before debugger starts doing anything: +// | current | +// | stack | +// +-------------+ <- SP = origSP +// stopped executing at PC = origPC +// some values are in LR (origLR) and FP (origFP) +// +// After debugger has done steps 1-6 above: +// | current | +// | stack | +// +-------------+ <- origSP +// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.) +// +-------------+ +// | origLR | +// +-------------+ <- SP +// | ----- | +// +-------------+ +// | argsize | +// +-------------+ +// LR = origPC, PC = debugCallV2 +// +// debugCallV2 then modifies the stack up to the "good" label: +// | current | +// | stack | +// +-------------+ <- origSP +// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.) +// +-------------+ +// | origLR | +// +-------------+ <- where debugger left SP +// | origPC | +// +-------------+ +// | origFP | +// +-------------+ <- FP = SP + 256 +// | saved | +// | registers | +// | (224 bytes) | +// +-------------+ <- SP + 32 +// | space for | +// | outargs | +// +-------------+ <- SP + 8 +// | argsize | +// +-------------+ <- SP + TEXT runtime·debugCallV2<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 - STP (R29, R30), -280(RSP) - SUB $272, RSP, RSP - SUB $8, RSP, R29 + MOVD R30, -8(RSP) // save origPC + MOVD -16(RSP), R30 // save argsize in R30 temporarily + MOVD.W R29, -16(RSP) // push origFP + MOVD RSP, R29 // frame pointer chain now set up + SUB $256, RSP, RSP // allocate frame + MOVD R30, (RSP) // Save argsize on the stack // Save all registers that may contain pointers so they can be // conservatively scanned. 
// @@ -1515,7 +1557,8 @@ TEXT runtime·debugCallV2<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 STP (R0, R1), (4*8)(RSP) // Perform a safe-point check. - MOVD R30, 8(RSP) // Caller's PC + MOVD 264(RSP), R0 // origPC + MOVD R0, 8(RSP) CALL runtime·debugCallCheck(SB) MOVD 16(RSP), R0 CBZ R0, good @@ -1559,7 +1602,7 @@ good: CALL runtime·debugCallWrap(SB); \ JMP restore - MOVD 256(RSP), R0 // the argument frame size + MOVD (RSP), R0 // the argument frame size DEBUG_CALL_DISPATCH(debugCall32<>, 32) DEBUG_CALL_DISPATCH(debugCall64<>, 64) DEBUG_CALL_DISPATCH(debugCall128<>, 128) @@ -1607,9 +1650,9 @@ restore: LDP (6*8)(RSP), (R2, R3) LDP (4*8)(RSP), (R0, R1) - LDP -8(RSP), (R29, R27) - ADD $288, RSP, RSP // Add 16 more bytes, see saveSigContext - MOVD -16(RSP), R30 // restore old lr + MOVD 272(RSP), R30 // restore old lr (saved by (*sigctxt).pushCall) + LDP 256(RSP), (R29, R27) // restore old fp, set up resumption address + ADD $288, RSP, RSP // Pop frame, LR+FP, and block pushed by (*sigctxt).pushCall JMP (R27) // runtime.debugCallCheck assumes that functions defined with the diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 769c4ffc5c..9064cae039 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -488,26 +488,18 @@ func genARM64(g *gen) { l.stack += 8 // SP needs 16-byte alignment } - // allocate frame, save PC of interrupted instruction (in LR) - p("MOVD R30, %d(RSP)", -l.stack) + // allocate frame, save PC (in R30), FP (in R29) of interrupted instruction + p("STP.W (R29, R30), -16(RSP)") + p("MOVD RSP, R29") // set up new frame pointer p("SUB $%d, RSP", l.stack) - p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux) - p("SUB $8, RSP, R29") // set up new frame pointer - // On iOS, save the LR again after decrementing SP. We run the - // signal handler on the G stack (as it doesn't support sigaltstack), - // so any writes below SP may be clobbered. 
- p("#ifdef GOOS_ios") - p("MOVD R30, (RSP)") - p("#endif") l.save(g) p("CALL ·asyncPreempt2(SB)") l.restore(g) - p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it - p("MOVD -8(RSP), R29") // restore frame pointer - p("MOVD (RSP), R27") // load PC to REGTMP - p("ADD $%d, RSP", l.stack+16) // pop frame (including the space pushed by sigctxt.pushCall) + p("MOVD %d(RSP), R30", l.stack+16) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it + p("LDP %d(RSP), (R29, R27)", l.stack) // Restore frame pointer. Load PC into regtmp. + p("ADD $%d, RSP", l.stack+32) // pop frame (including the space pushed by sigctxt.pushCall) p("RET (R27)") } diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 8c91c9435a..04b3afe168 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -1379,10 +1379,10 @@ func recovery(gp *g) { // the caller gp.sched.bp = fp - 2*goarch.PtrSize case goarch.IsArm64 != 0: - // on arm64, the architectural bp points one word higher - // than the sp. fp is totally useless to us here, because it - // only gets us to the caller's fp. - gp.sched.bp = sp - goarch.PtrSize + // on arm64, the first two words of the frame are caller's PC + // (the saved LR register) and the caller's BP. + // Coincidentally, the same as amd64. 
+ gp.sched.bp = fp - 2*goarch.PtrSize } gogo(&gp.sched) } diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s index 31ec9d940f..f4248cac25 100644 --- a/src/runtime/preempt_arm64.s +++ b/src/runtime/preempt_arm64.s @@ -4,13 +4,9 @@ #include "textflag.h" TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 - MOVD R30, -496(RSP) + STP.W (R29, R30), -16(RSP) + MOVD RSP, R29 SUB $496, RSP - MOVD R29, -8(RSP) - SUB $8, RSP, R29 - #ifdef GOOS_ios - MOVD R30, (RSP) - #endif STP (R0, R1), 8(RSP) STP (R2, R3), 24(RSP) STP (R4, R5), 40(RSP) @@ -78,8 +74,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 LDP 40(RSP), (R4, R5) LDP 24(RSP), (R2, R3) LDP 8(RSP), (R0, R1) - MOVD 496(RSP), R30 - MOVD -8(RSP), R29 - MOVD (RSP), R27 - ADD $512, RSP + MOVD 512(RSP), R30 + LDP 496(RSP), (R29, R27) + ADD $528, RSP RET (R27) diff --git a/src/runtime/race_arm64.s b/src/runtime/race_arm64.s index 5df650105b..feaa328d4c 100644 --- a/src/runtime/race_arm64.s +++ b/src/runtime/race_arm64.s @@ -397,7 +397,7 @@ TEXT racecallatomic<>(SB), NOSPLIT, $0 // R3 = addr of incoming arg list // Trigger SIGSEGV early. - MOVD 40(RSP), R3 // 1st arg is addr. after two times BL, get it at 40(RSP) + MOVD 72(RSP), R3 // 1st arg is addr. after two small frames (32 bytes each), get it at 72(RSP) MOVB (R3), R13 // segv here if addr is bad // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). MOVD runtime·racearenastart(SB), R10 @@ -417,10 +417,11 @@ racecallatomic_ok: // Addr is within the good range, call the atomic function. load_g MOVD g_racectx(g), R0 // goroutine context - MOVD 16(RSP), R1 // caller pc + MOVD 56(RSP), R1 // caller pc MOVD R9, R2 // pc - ADD $40, RSP, R3 - JMP racecall<>(SB) // does not return + ADD $72, RSP, R3 + BL racecall<>(SB) + RET racecallatomic_ignore: // Addr is outside the good range. // Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op. 
@@ -435,9 +436,9 @@ racecallatomic_ignore: // racecall will call LLVM race code which might clobber R28 (g) load_g MOVD g_racectx(g), R0 // goroutine context - MOVD 16(RSP), R1 // caller pc + MOVD 56(RSP), R1 // caller pc MOVD R9, R2 // pc - ADD $40, RSP, R3 // arguments + ADD $72, RSP, R3 // arguments BL racecall<>(SB) // Call __tsan_go_ignore_sync_end. MOVD $__tsan_go_ignore_sync_end(SB), R9 @@ -476,10 +477,6 @@ TEXT racecall<>(SB), NOSPLIT|NOFRAME, $0-0 MOVD (g_sched+gobuf_sp)(R11), R12 MOVD R12, RSP call: - // Decrement SP past where the frame pointer is saved in the Go arm64 - // ABI (one word below the stack pointer) so the race detector library - // code doesn't clobber it - SUB $16, RSP BL R9 MOVD R19, RSP JMP (R20) diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go index af7d29f9de..61dad50721 100644 --- a/src/runtime/signal_arm64.go +++ b/src/runtime/signal_arm64.go @@ -8,7 +8,6 @@ package runtime import ( "internal/abi" - "internal/goarch" "internal/runtime/sys" "unsafe" ) @@ -63,18 +62,11 @@ func (c *sigctxt) preparePanic(sig uint32, gp *g) { // We arrange lr, and pc to pretend the panicking // function calls sigpanic directly. // Always save LR to stack so that panics in leaf - // functions are correctly handled. This smashes - // the stack frame but we're not going back there - // anyway. + // functions are correctly handled. + // This extra space is known to gentraceback. sp := c.sp() - sys.StackAlign // needs only sizeof uint64, but must align the stack c.set_sp(sp) *(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr() - // Make sure a valid frame pointer is saved on the stack so that the - // frame pointer checks in adjustframe are happy, if they're enabled. - // Frame pointer unwinding won't visit the sigpanic frame, since - // sigpanic will save the same frame pointer before calling into a panic - // function. 
- *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29() pc := gp.sigpc @@ -96,10 +88,6 @@ func (c *sigctxt) pushCall(targetPC, resumePC uintptr) { sp := c.sp() - 16 // SP needs 16-byte alignment c.set_sp(sp) *(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr() - // Make sure a valid frame pointer is saved on the stack so that the - // frame pointer checks in adjustframe are happy, if they're enabled. - // This is not actually used for unwinding. - *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29() // Set up PC and LR to pretend the function being signaled // calls targetPC at resumePC. c.set_lr(uint64(resumePC)) diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 55e97e77af..5eaceec6da 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -579,23 +579,27 @@ var ptrnames = []string{ // | args to callee | // +------------------+ <- frame->sp // -// (arm) +// (arm64) // +------------------+ // | args from caller | // +------------------+ <- frame->argp -// | caller's retaddr | +// | <unused> | +// +------------------+ <- frame->fp (aka caller's sp) +// | return address | // +------------------+ -// | caller's FP (*) | (*) on ARM64, if framepointer_enabled && varp > sp +// | caller's FP | (frame pointer always enabled: TODO) // +------------------+ <- frame->varp // | locals | // +------------------+ // | args to callee | // +------------------+ -// | return address | +// | <unused> | // +------------------+ <- frame->sp // // varp > sp means that the function has a frame; // varp == sp means frameless function. +// +// Alignment padding, if needed, will be between "locals" and "args to callee". type adjustinfo struct { old stack @@ -709,7 +713,8 @@ func adjustframe(frame *stkframe, adjinfo *adjustinfo) { } // Adjust saved frame pointer if there is one. 
- if (goarch.ArchFamily == goarch.AMD64 || goarch.ArchFamily == goarch.ARM64) && frame.argp-frame.varp == 2*goarch.PtrSize { + if goarch.ArchFamily == goarch.AMD64 && frame.argp-frame.varp == 2*goarch.PtrSize || + goarch.ArchFamily == goarch.ARM64 && frame.argp-frame.varp == 3*goarch.PtrSize { if stackDebug >= 3 { print(" saved bp\n") } @@ -723,10 +728,7 @@ func adjustframe(frame *stkframe, adjinfo *adjustinfo) { throw("bad frame pointer") } } - // On AMD64, this is the caller's frame pointer saved in the current - // frame. - // On ARM64, this is the frame pointer of the caller's caller saved - // by the caller in its frame (one word below its SP). + // This is the caller's frame pointer saved in the current frame. adjustpointer(adjinfo, unsafe.Pointer(frame.varp)) } diff --git a/src/runtime/testdata/testprog/badtraceback.go b/src/runtime/testdata/testprog/badtraceback.go index 455118a543..36575f765d 100644 --- a/src/runtime/testdata/testprog/badtraceback.go +++ b/src/runtime/testdata/testprog/badtraceback.go @@ -41,6 +41,11 @@ func badLR2(arg int) { if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" { lrOff = 32 // FIXED_FRAME or sys.MinFrameSize } + if runtime.GOARCH == "arm64" { + // skip 8 bytes at bottom of parent frame, then point + // to the 8 bytes of the saved PC at the top of the frame. + lrOff = 16 + } lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff)) *lrPtr = 0xbad diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 8882c306ed..1c3e679a02 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -175,6 +175,11 @@ func (u *unwinder) initAt(pc0, sp0, lr0 uintptr, gp *g, flags unwindFlags) { // Start in the caller's frame. if frame.pc == 0 { if usesLR { + // TODO: this isn't right on arm64. But also, this should + // ~never happen. Calling a nil function will panic + // when loading the PC out of the closure, not when + // branching to that PC. 
(Closures should always have + // valid PCs in their first word.) frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp)) frame.lr = 0 } else { @@ -369,7 +374,11 @@ func (u *unwinder) resolveInternal(innermost, isSyscall bool) { var lrPtr uintptr if usesLR { if innermost && frame.sp < frame.fp || frame.lr == 0 { - lrPtr = frame.sp + if GOARCH == "arm64" { + lrPtr = frame.fp - goarch.PtrSize + } else { + lrPtr = frame.sp + } frame.lr = *(*uintptr)(unsafe.Pointer(lrPtr)) } } else { @@ -385,24 +394,17 @@ func (u *unwinder) resolveInternal(innermost, isSyscall bool) { // On x86, call instruction pushes return PC before entering new function. frame.varp -= goarch.PtrSize } + if GOARCH == "arm64" && frame.varp > frame.sp { + frame.varp -= goarch.PtrSize // LR have been saved, skip over it. + } // For architectures with frame pointers, if there's // a frame, then there's a saved frame pointer here. // // NOTE: This code is not as general as it looks. - // On x86, the ABI is to save the frame pointer word at the + // On x86 and arm64, the ABI is to save the frame pointer word at the // top of the stack frame, so we have to back down over it. - // On arm64, the frame pointer should be at the bottom of - // the stack (with R29 (aka FP) = RSP), in which case we would - // not want to do the subtraction here. But we started out without - // any frame pointer, and when we wanted to add it, we didn't - // want to break all the assembly doing direct writes to 8(RSP) - // to set the first parameter to a called function. - // So we decided to write the FP link *below* the stack pointer - // (with R29 = RSP - 8 in Go functions). - // This is technically ABI-compatible but not standard. - // And it happens to end up mimicking the x86 layout. - // Other architectures may make different decisions. + // No other architectures are framepointer-enabled at the moment. 
if frame.varp > frame.sp && framepointer_enabled { frame.varp -= goarch.PtrSize } @@ -562,7 +564,7 @@ func (u *unwinder) finishInternal() { gp := u.g.ptr() if u.flags&(unwindPrintErrors|unwindSilentErrors) == 0 && u.frame.sp != gp.stktopsp { print("runtime: g", gp.goid, ": frame.sp=", hex(u.frame.sp), " top=", hex(gp.stktopsp), "\n") - print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "\n") + print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "]\n") throw("traceback did not unwind completely") } } |
