From dfa6c7426316fb81c5f29b260b2de7822680ffd3 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 12 Jun 2025 18:37:01 -0400 Subject: [dev.simd] runtime: eliminate global state in mkpreempt.go We're going to start writing two files, so having a single global file we're writing will be a problem. This has no effect on the generated code. Change-Id: I49897ea0c6500a29eac89b597d75c0eb3e9b6706 Reviewed-on: https://go-review.googlesource.com/c/go/+/680897 Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI --- src/runtime/mkpreempt.go | 166 +++++++++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 72 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 6a9cf77a43..ec900a23d2 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -73,16 +73,14 @@ var regNamesAMD64 = []string{ "X15", } -var out io.Writer - -var arches = map[string]func(){ +var arches = map[string]func(g *gen){ "386": gen386, "amd64": genAMD64, "arm": genARM, "arm64": genARM64, "loong64": genLoong64, - "mips64x": func() { genMIPS(true) }, - "mipsx": func() { genMIPS(false) }, + "mips64x": func(g *gen) { genMIPS(g, true) }, + "mipsx": func(g *gen) { genMIPS(g, false) }, "ppc64x": genPPC64, "riscv64": genRISCV64, "s390x": genS390X, @@ -93,53 +91,58 @@ var beLe = map[string]bool{"mips64x": true, "mipsx": true, "ppc64x": true} func main() { flag.Parse() if flag.NArg() > 0 { - out = os.Stdout for _, arch := range flag.Args() { - gen, ok := arches[arch] + genFn, ok := arches[arch] if !ok { log.Fatalf("unknown arch %s", arch) } - header(arch) - gen() + g := gen{os.Stdout, arch} + g.asmHeader() + genFn(&g) } return } - for arch, gen := range arches { + for arch, genFn := range arches { f, err := os.Create(fmt.Sprintf("preempt_%s.s", arch)) if err != nil { log.Fatal(err) } - out = f - header(arch) - gen() + g := gen{f, arch} + g.asmHeader() + genFn(&g) if err := f.Close(); err != nil { log.Fatal(err) } } } -func header(arch string) { - fmt.Fprintf(out, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n") - if beLe[arch] { - base := arch[:len(arch)-1] - fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base) +type gen struct { + w io.Writer + goarch string +} + +func (g *gen) asmHeader() { + fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n") + if beLe[g.goarch] { + base := g.goarch[:len(g.goarch)-1] + fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base) } - fmt.Fprintf(out, "#include \"go_asm.h\"\n") - if arch == "amd64" { - fmt.Fprintf(out, "#include \"asm_amd64.h\"\n") + fmt.Fprintf(g.w, "#include \"go_asm.h\"\n") + if g.goarch == "amd64" { + fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n") } - fmt.Fprintf(out, "#include \"textflag.h\"\n\n") - fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") + fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n") + fmt.Fprintf(g.w, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") } -func p(f string, args ...any) { +func (g *gen) p(f string, args ...any) { fmted := fmt.Sprintf(f, args...) 
- fmt.Fprintf(out, "\t%s\n", strings.ReplaceAll(fmted, "\n", "\n\t")) + fmt.Fprintf(g.w, "\t%s\n", strings.ReplaceAll(fmted, "\n", "\n\t")) } -func label(l string) { - fmt.Fprintf(out, "%s\n", l) +func (g *gen) label(l string) { + fmt.Fprintf(g.w, "%s\n", l) } type layout struct { @@ -176,28 +179,30 @@ func (l *layout) addSpecial(save, restore string, size int) { l.stack += size } -func (l *layout) save() { +func (l *layout) save(g *gen) { for _, reg := range l.regs { if reg.save != "" { - p(reg.save, reg.pos) + g.p(reg.save, reg.pos) } else { - p("%s %s, %d(%s)", reg.saveOp, reg.reg, reg.pos, l.sp) + g.p("%s %s, %d(%s)", reg.saveOp, reg.reg, reg.pos, l.sp) } } } -func (l *layout) restore() { +func (l *layout) restore(g *gen) { for i := len(l.regs) - 1; i >= 0; i-- { reg := l.regs[i] if reg.restore != "" { - p(reg.restore, reg.pos) + g.p(reg.restore, reg.pos) } else { - p("%s %d(%s), %s", reg.restoreOp, reg.pos, l.sp, reg.reg) + g.p("%s %d(%s), %s", reg.restoreOp, reg.pos, l.sp, reg.reg) } } } -func gen386() { +func gen386(g *gen) { + p := g.p + p("PUSHFL") // Save general purpose registers. var l = layout{sp: "SP"} @@ -218,22 +223,24 @@ func gen386() { p("ADJSP $%d", lSSE.stack) p("NOP SP") - l.save() + l.save(g) p("#ifndef %s", softfloat) - lSSE.save() + lSSE.save(g) p("#endif") p("CALL ·asyncPreempt2(SB)") p("#ifndef %s", softfloat) - lSSE.restore() + lSSE.restore(g) p("#endif") - l.restore() + l.restore(g) p("ADJSP $%d", -lSSE.stack) p("POPFL") p("RET") } -func genAMD64() { +func genAMD64(g *gen) { + p := g.p + // Assign stack offsets. var l = layout{sp: "SP"} for _, reg := range regNamesAMD64 { @@ -262,19 +269,21 @@ func genAMD64() { p("// But vet doesn't know ADJSP, so suppress vet stack checking") p("NOP SP") - l.save() + l.save(g) - lSSE.save() + lSSE.save(g) p("CALL ·asyncPreempt2(SB)") - lSSE.restore() - l.restore() + lSSE.restore(g) + l.restore(g) p("ADJSP $%d", -lSSE.stack) p("POPFQ") p("POPQ BP") p("RET") } -func genARM() { +func genARM(g *gen) { + p := g.p + // Add integer registers R0-R12. // R13 (SP), R14 (LR), R15 (PC) are special and not saved here. var l = layout{sp: "R13", stack: 4} // add LR slot @@ -303,22 +312,23 @@ func genARM() { } p("MOVW.W R14, -%d(R13)", lfp.stack) // allocate frame, save LR - l.save() + l.save(g) p("MOVB ·goarmsoftfp(SB), R0\nCMP $0, R0\nBNE nofp") // test goarmsoftfp, and skip FP registers if goarmsoftfp!=0. - lfp.save() - label("nofp:") + lfp.save(g) + g.label("nofp:") p("CALL ·asyncPreempt2(SB)") p("MOVB ·goarmsoftfp(SB), R0\nCMP $0, R0\nBNE nofp2") // test goarmsoftfp, and skip FP registers if goarmsoftfp!=0. - lfp.restore() - label("nofp2:") - l.restore() + lfp.restore(g) + g.label("nofp2:") + l.restore(g) p("MOVW %d(R13), R14", lfp.stack) // sigctxt.pushCall pushes LR on stack, restore it p("MOVW.P %d(R13), R15", lfp.stack+4) // load PC, pop frame (including the space pushed by sigctxt.pushCall) p("UNDEF") // shouldn't get here } -func genARM64() { +func genARM64(g *gen) { + p := g.p // Add integer registers R0-R26 // R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special // and not saved here. 
@@ -362,9 +372,9 @@ func genARM64() { p("MOVD R30, (RSP)") p("#endif") - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p("MOVD -8(RSP), R29") // restore frame pointer @@ -373,7 +383,9 @@ func genARM64() { p("RET (R27)") } -func genMIPS(_64bit bool) { +func genMIPS(g *gen, _64bit bool) { + p := g.p + mov := "MOVW" movf := "MOVF" add := "ADD" @@ -428,15 +440,15 @@ func genMIPS(_64bit bool) { p(mov+" R31, -%d(R29)", lfp.stack) p(sub+" $%d, R29", lfp.stack) - l.save() + l.save(g) p("#ifndef %s", softfloat) - lfp.save() + lfp.save(g) p("#endif") p("CALL ·asyncPreempt2(SB)") p("#ifndef %s", softfloat) - lfp.restore() + lfp.restore(g) p("#endif") - l.restore() + l.restore(g) p(mov+" %d(R29), R31", lfp.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p(mov + " (R29), R23") // load PC to REGTMP @@ -444,7 +456,9 @@ func genMIPS(_64bit bool) { p("JMP (R23)") } -func genLoong64() { +func genLoong64(g *gen) { + p := g.p + mov := "MOVV" movf := "MOVD" add := "ADDV" @@ -478,9 +492,9 @@ func genLoong64() { p(mov+" R1, -%d(R3)", l.stack) p(sub+" $%d, R3", l.stack) - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p(mov+" %d(R3), R1", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p(mov + " (R3), R30") // load PC to REGTMP @@ -488,7 +502,9 @@ func genLoong64() { p("JMP (R30)") } -func genPPC64() { +func genPPC64(g *gen) { + p := g.p + // Add integer registers R3-R29 // R0 (zero), R1 (SP), R30 (g) are special and not saved here. // R2 (TOC pointer in PIC mode), R12 (function entry address in PIC mode) have been saved in sigctxt.pushCall. @@ -528,9 +544,9 @@ func genPPC64() { p("MOVD LR, R31") p("MOVDU R31, -%d(R1)", l.stack) // allocate frame, save PC of interrupted instruction (in LR) - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOVD %d(R1), R31", l.stack) // sigctxt.pushCall has pushed LR, R2, R12 (at interrupt) on stack, restore them p("MOVD R31, LR") @@ -543,7 +559,9 @@ func genPPC64() { p("JMP (CTR)") } -func genRISCV64() { +func genRISCV64(g *gen) { + p := g.p + // X0 (zero), X1 (LR), X2 (SP), X3 (GP), X4 (TP), X27 (g), X31 (TMP) are special. var l = layout{sp: "X2", stack: 8} @@ -564,16 +582,18 @@ func genRISCV64() { p("MOV X1, -%d(X2)", l.stack) p("SUB $%d, X2", l.stack) - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOV %d(X2), X1", l.stack) p("MOV (X2), X31") p("ADD $%d, X2", l.stack+8) p("JMP (X31)") } -func genS390X() { +func genS390X(g *gen) { + p := g.p + // Add integer registers R0-R12 // R13 (g), R14 (LR), R15 (SP) are special, and not saved here. // Saving R10 (REGTMP) is not necessary, but it is saved anyway. 
@@ -594,9 +614,9 @@ func genS390X() { p("ADD $-%d, R15", l.stack) p("MOVW R10, 8(R15)") // save flags - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOVD %d(R15), R14", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p("ADD $%d, R15", l.stack+8) // pop frame (including the space pushed by sigctxt.pushCall) @@ -606,12 +626,14 @@ func genS390X() { p("JMP (R10)") } -func genWasm() { +func genWasm(g *gen) { + p := g.p p("// No async preemption on wasm") p("UNDEF") } -func notImplemented() { +func notImplemented(g *gen) { + p := g.p p("// Not implemented yet") p("JMP ·abort(SB)") } -- cgit v1.3 From 426cf36b4d0c672dc88fc5cef9b0d5db0d2f4fe5 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Tue, 29 Apr 2025 22:55:40 -0400 Subject: [dev.simd] runtime: save scalar registers off stack in amd64 async preemption Asynchronous preemption must save all registers that could be in use by Go code. Currently, it saves all of these to the goroutine stack. As a result, the stack frame requirements of asynchronous preemption can be rather high. On amd64, this requires 368 bytes of stack space, most of which is the XMM registers. Several RISC architectures are around 0.5 KiB. As we add support for SIMD instructions, this is going to become a problem. The AVX-512 register state is 2.5 KiB. This well exceeds the nosplit limit, and even if it didn't, could constrain when we can asynchronously preempt goroutines on small stacks. This CL fixes this by moving pure scalar state stored in non-GP registers off the stack and into an allocated "extended register state" object. To reduce space overhead, we only allocate these objects as needed. While in the theoretical limit, every G could need this register state, in practice very few do at a time. However, we can't allocate when we're in the middle of saving the register state during an asynchronous preemption, so we reserve scratch space on every P to temporarily store the register state, which can then be copied out to an allocated state object later by Go code. This commit only implements this for amd64, since that's where we're about to add much more vector state, but it lays the groundwork for doing this on any architecture that could benefit. 
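In outline, the new save/restore hand-off works like the minimal sketch below. The types gState and pState and the explicit gp/pp parameters are simplified stand-ins for the runtime's g and p structures; the real implementation (preempt_xreg.go later in this patch) allocates from a lock-protected fixalloc rather than the GC heap and runs the save on the system stack.

	// Sketch of the per-P scratch -> per-G state -> per-P cache flow.
	package main

	import "fmt"

	type xRegState struct{ regs [16][16]byte } // stand-in for the saved vector state

	type gState struct {
		xRegs *xRegState // non-nil only while this G is asynchronously preempted
	}

	type pState struct {
		scratch xRegState  // the assembly stub spills registers here before calling Go
		cache   *xRegState // 1-element allocation cache, also the restore slot
	}

	// xRegSave moves the scratch state into storage owned by the G.
	func xRegSave(gp *gState, pp *pState) {
		dest := pp.cache
		if dest == nil {
			dest = new(xRegState) // real code: fixalloc under xRegAlloc.lock
		} else {
			pp.cache = nil // reuse the cached block
		}
		*dest = pp.scratch
		gp.xRegs = dest
	}

	// xRegRestore hands the state back to the P; the assembly epilogue then
	// reloads registers from pp.cache, which stays cached for later reuse.
	func xRegRestore(gp *gState, pp *pState) {
		pp.cache = gp.xRegs
		gp.xRegs = nil
	}

	func main() {
		var gp gState
		var pp pState
		pp.scratch.regs[0][0] = 42 // pretend the stub spilled X0 here
		xRegSave(&gp, &pp)
		xRegRestore(&gp, &pp)
		fmt.Println(pp.cache.regs[0][0]) // 42, ready for the register reload
	}
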
Change-Id: I123a95e21c11d5c10942d70e27f84d2d99bbf735 Reviewed-on: https://go-review.googlesource.com/c/go/+/680898 Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Auto-Submit: Austin Clements --- src/runtime/export_test.go | 2 + src/runtime/lockrank.go | 5 +- src/runtime/mheap.go | 2 + src/runtime/mklockrank.go | 6 +- src/runtime/mkpreempt.go | 92 ++++++++++++++++++++++++++---- src/runtime/preempt.go | 50 +++++++++++------ src/runtime/preempt_amd64.go | 22 ++++++++ src/runtime/preempt_amd64.s | 82 ++++++++++++++++----------- src/runtime/preempt_noxreg.go | 27 +++++++++ src/runtime/preempt_xreg.go | 127 ++++++++++++++++++++++++++++++++++++++++++ src/runtime/proc.go | 1 + src/runtime/runtime2.go | 9 +++ src/runtime/sizeof_test.go | 9 ++- 13 files changed, 368 insertions(+), 66 deletions(-) create mode 100644 src/runtime/preempt_amd64.go create mode 100644 src/runtime/preempt_noxreg.go create mode 100644 src/runtime/preempt_xreg.go (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 83cf301be4..b3bb5d2c58 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -555,6 +555,8 @@ type G = g type Sudog = sudog +type XRegPerG = xRegPerG + func Getg() *G { return getg() } diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go index 44015ce862..9821e49998 100644 --- a/src/runtime/lockrank.go +++ b/src/runtime/lockrank.go @@ -70,6 +70,7 @@ const ( lockRankHchanLeaf // WB lockRankWbufSpans + lockRankXRegAlloc lockRankMheap lockRankMheapSpecial lockRankGlobalAlloc @@ -143,6 +144,7 @@ var lockNames = []string{ lockRankStackLarge: "stackLarge", lockRankHchanLeaf: "hchanLeaf", lockRankWbufSpans: "wbufSpans", + lockRankXRegAlloc: "xRegAlloc", lockRankMheap: "mheap", lockRankMheapSpecial: "mheapSpecial", lockRankGlobalAlloc: "globalAlloc", @@ -228,9 +230,10 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankStackLarge: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan}, lockRankHchanLeaf: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, 
lockRankGscan, lockRankHchanLeaf}, lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan}, + lockRankXRegAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankTimerSend, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched}, lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans}, lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap}, - lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, 
lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankMheapSpecial}, + lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankXRegAlloc, lockRankMheap, lockRankMheapSpecial}, lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap}, lockRankTraceStackTab: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace}, lockRankPanic: 
{}, diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index f25dbb429d..358de2f376 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -821,6 +821,8 @@ func (h *mheap) init() { } h.pages.init(&h.lock, &memstats.gcMiscSys, false) + + xRegInitAlloc() } // reclaim sweeps and reclaims at least npage pages into the heap. diff --git a/src/runtime/mklockrank.go b/src/runtime/mklockrank.go index 46a063fdce..9c503369a3 100644 --- a/src/runtime/mklockrank.go +++ b/src/runtime/mklockrank.go @@ -193,6 +193,9 @@ defer, # Below WB is the write barrier implementation. < wbufSpans; +# xRegState allocator +sched < xRegAlloc; + # Span allocator stackLarge, stackpool, @@ -205,7 +208,8 @@ stackLarge, # an mspanSpecial lock, and they're part of the malloc implementation. # Pinner bits might be freed by the span allocator. mheap, mspanSpecial < mheapSpecial; -mheap, mheapSpecial < globalAlloc; +# Fixallocs +mheap, mheapSpecial, xRegAlloc < globalAlloc; # Execution tracer events (with a P) hchan, diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index ec900a23d2..e3dd5046f3 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -9,8 +9,10 @@ package main import ( + "bytes" "flag" "fmt" + "go/format" "io" "log" "os" @@ -122,14 +124,19 @@ type gen struct { goarch string } -func (g *gen) asmHeader() { +func (g *gen) commonHeader() { fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n") if beLe[g.goarch] { base := g.goarch[:len(g.goarch)-1] fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base) } +} + +func (g *gen) asmHeader() { + g.commonHeader() fmt.Fprintf(g.w, "#include \"go_asm.h\"\n") if g.goarch == "amd64" { + fmt.Fprintf(g.w, "#include \"go_tls.h\"\n") fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n") } fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n") @@ -145,6 +152,43 @@ func (g *gen) label(l string) { fmt.Fprintf(g.w, "%s\n", l) } +// writeXRegs writes an architecture xregs file. 
+func writeXRegs(arch string, l *layout) { + var code bytes.Buffer + g := gen{&code, arch} + g.commonHeader() + fmt.Fprintf(g.w, ` +package runtime + +type xRegState struct { +`) + pos := 0 + for _, reg := range l.regs { + if reg.pos != pos { + log.Fatalf("padding not implemented") + } + typ := fmt.Sprintf("[%d]byte", reg.size) + switch { + case reg.size == 4 && reg.pos%4 == 0: + typ = "uint32" + case reg.size == 8 && reg.pos%8 == 0: + typ = "uint64" + } + fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ) + pos += reg.size + } + fmt.Fprintf(g.w, "}\n") + + path := fmt.Sprintf("preempt_%s.go", arch) + b, err := format.Source(code.Bytes()) + if err != nil { + log.Fatalf("formatting %s: %s", path, err) + } + if err := os.WriteFile(path, b, 0666); err != nil { + log.Fatal(err) + } +} + type layout struct { stack int regs []regPos @@ -152,7 +196,7 @@ type layout struct { } type regPos struct { - pos int + pos, size int saveOp string restoreOp string @@ -165,17 +209,17 @@ type regPos struct { } func (l *layout) add(op, reg string, size int) { - l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack}) + l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size}) l.stack += size } func (l *layout) add2(sop, rop, reg string, size int) { - l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack}) + l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size}) l.stack += size } func (l *layout) addSpecial(save, restore string, size int) { - l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack}) + l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size}) l.stack += size } @@ -239,6 +283,8 @@ func gen386(g *gen) { } func genAMD64(g *gen) { + const xReg = "AX" // *xRegState + p := g.p // Assign stack offsets. @@ -251,12 +297,13 @@ func genAMD64(g *gen) { l.add("MOVQ", reg, 8) } } - lSSE := layout{stack: l.stack, sp: "SP"} + lXRegs := layout{sp: xReg} // Non-GP registers for _, reg := range regNamesAMD64 { if strings.HasPrefix(reg, "X") { - lSSE.add("MOVUPS", reg, 16) + lXRegs.add("MOVUPS", reg, 16) } } + writeXRegs(g.goarch, &lXRegs) // TODO: MXCSR register? @@ -265,17 +312,40 @@ func genAMD64(g *gen) { p("// Save flags before clobbering them") p("PUSHFQ") p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP") - p("ADJSP $%d", lSSE.stack) + p("ADJSP $%d", l.stack) p("// But vet doesn't know ADJSP, so suppress vet stack checking") p("NOP SP") + p("// Save GPs") l.save(g) - lSSE.save(g) + // In general, the limitations on asynchronous preemption mean we only + // preempt in ABIInternal code. However, there's at least one exception to + // this: when we're in an open-coded transition between an ABIInternal + // function and an ABI0 call. We could more carefully arrange unsafe points + // to avoid ever landing in ABI0, but it's easy to just make this code not + // sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will + // ensure we're in ABIInternal register state. + p("// Save extended register state to p.xRegs.scratch") + p("// Don't make assumptions about ABI register state. 
See mkpreempt.go") + p("get_tls(CX)") + p("MOVQ g(CX), R14") + p("MOVQ g_m(R14), %s", xReg) + p("MOVQ m_p(%s), %s", xReg, xReg) + p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg) + lXRegs.save(g) + p("CALL ·asyncPreempt2(SB)") - lSSE.restore(g) + + p("// Restore non-GPs from *p.xRegs.cache") + p("MOVQ g_m(R14), %s", xReg) + p("MOVQ m_p(%s), %s", xReg, xReg) + p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg) + lXRegs.restore(g) + + p("// Restore GPs") l.restore(g) - p("ADJSP $%d", -lSSE.stack) + p("ADJSP $%d", -l.stack) p("POPFQ") p("POPQ BP") p("RET") diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index c41c355835..d053747d3a 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -292,21 +292,43 @@ func canPreemptM(mp *m) bool { // asyncPreempt saves all user registers and calls asyncPreempt2. // -// When stack scanning encounters an asyncPreempt frame, it scans that +// It saves GP registers (anything that might contain a pointer) to the G stack. +// Hence, when stack scanning encounters an asyncPreempt frame, it scans that // frame and its parent frame conservatively. // +// On some platforms, it saves large additional scalar-only register state such +// as vector registers to an "extended register state" on the P. +// // asyncPreempt is implemented in assembly. func asyncPreempt() //go:nosplit func asyncPreempt2() { + // We can't grow the stack with untyped data from asyncPreempt, so switch to + // the system stack right away. + mcall(func(gp *g) { + gp.asyncSafePoint = true + + // Move the extended register state from the P to the G. We do this now that + // we're on the system stack to avoid stack splits. + xRegSave(gp) + + if gp.preemptStop { + preemptPark(gp) + } else { + gopreempt_m(gp) + } + // The above functions never return. + }) + + // Do not grow the stack below here! + gp := getg() - gp.asyncSafePoint = true - if gp.preemptStop { - mcall(preemptPark) - } else { - mcall(gopreempt_m) - } + + // Put the extended register state back on the M so resumption can find it. + // We can't do this in asyncPreemptM because the park calls never return. + xRegRestore(gp) + gp.asyncSafePoint = false } @@ -319,19 +341,13 @@ func init() { total := funcMaxSPDelta(f) f = findfunc(abi.FuncPCABIInternal(asyncPreempt2)) total += funcMaxSPDelta(f) + f = findfunc(abi.FuncPCABIInternal(xRegRestore)) + total += funcMaxSPDelta(f) // Add some overhead for return PCs, etc. asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize if asyncPreemptStack > stackNosplit { - // We need more than the nosplit limit. This isn't - // unsafe, but it may limit asynchronous preemption. - // - // This may be a problem if we start using more - // registers. In that case, we should store registers - // in a context object. If we pre-allocate one per P, - // asyncPreempt can spill just a few registers to the - // stack, then grab its context object and spill into - // it. When it enters the runtime, it would allocate a - // new context for the P. + // We need more than the nosplit limit. This isn't unsafe, but it may + // limit asynchronous preemption. Consider moving state into xRegState. print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n") throw("async stack too large") } diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go new file mode 100644 index 0000000000..904defac33 --- /dev/null +++ b/src/runtime/preempt_amd64.go @@ -0,0 +1,22 @@ +// Code generated by mkpreempt.go; DO NOT EDIT. 
+ +package runtime + +type xRegState struct { + X0 [16]byte + X1 [16]byte + X2 [16]byte + X3 [16]byte + X4 [16]byte + X5 [16]byte + X6 [16]byte + X7 [16]byte + X8 [16]byte + X9 [16]byte + X10 [16]byte + X11 [16]byte + X12 [16]byte + X13 [16]byte + X14 [16]byte + X15 [16]byte +} diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index 8e3ed0d7c5..0a33ce7f3e 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -1,6 +1,7 @@ // Code generated by mkpreempt.go; DO NOT EDIT. #include "go_asm.h" +#include "go_tls.h" #include "asm_amd64.h" #include "textflag.h" @@ -10,9 +11,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 // Save flags before clobbering them PUSHFQ // obj doesn't understand ADD/SUB on SP, but does understand ADJSP - ADJSP $368 + ADJSP $112 // But vet doesn't know ADJSP, so suppress vet stack checking NOP SP + // Save GPs MOVQ AX, 0(SP) MOVQ CX, 8(SP) MOVQ DX, 16(SP) @@ -27,39 +29,51 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ R13, 88(SP) MOVQ R14, 96(SP) MOVQ R15, 104(SP) - MOVUPS X0, 112(SP) - MOVUPS X1, 128(SP) - MOVUPS X2, 144(SP) - MOVUPS X3, 160(SP) - MOVUPS X4, 176(SP) - MOVUPS X5, 192(SP) - MOVUPS X6, 208(SP) - MOVUPS X7, 224(SP) - MOVUPS X8, 240(SP) - MOVUPS X9, 256(SP) - MOVUPS X10, 272(SP) - MOVUPS X11, 288(SP) - MOVUPS X12, 304(SP) - MOVUPS X13, 320(SP) - MOVUPS X14, 336(SP) - MOVUPS X15, 352(SP) + // Save extended register state to p.xRegs.scratch + // Don't make assumptions about ABI register state. See mkpreempt.go + get_tls(CX) + MOVQ g(CX), R14 + MOVQ g_m(R14), AX + MOVQ m_p(AX), AX + LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX + MOVUPS X0, 0(AX) + MOVUPS X1, 16(AX) + MOVUPS X2, 32(AX) + MOVUPS X3, 48(AX) + MOVUPS X4, 64(AX) + MOVUPS X5, 80(AX) + MOVUPS X6, 96(AX) + MOVUPS X7, 112(AX) + MOVUPS X8, 128(AX) + MOVUPS X9, 144(AX) + MOVUPS X10, 160(AX) + MOVUPS X11, 176(AX) + MOVUPS X12, 192(AX) + MOVUPS X13, 208(AX) + MOVUPS X14, 224(AX) + MOVUPS X15, 240(AX) CALL ·asyncPreempt2(SB) - MOVUPS 352(SP), X15 - MOVUPS 336(SP), X14 - MOVUPS 320(SP), X13 - MOVUPS 304(SP), X12 - MOVUPS 288(SP), X11 - MOVUPS 272(SP), X10 - MOVUPS 256(SP), X9 - MOVUPS 240(SP), X8 - MOVUPS 224(SP), X7 - MOVUPS 208(SP), X6 - MOVUPS 192(SP), X5 - MOVUPS 176(SP), X4 - MOVUPS 160(SP), X3 - MOVUPS 144(SP), X2 - MOVUPS 128(SP), X1 - MOVUPS 112(SP), X0 + // Restore non-GPs from *p.xRegs.cache + MOVQ g_m(R14), AX + MOVQ m_p(AX), AX + MOVQ (p_xRegs+xRegPerP_cache)(AX), AX + MOVUPS 240(AX), X15 + MOVUPS 224(AX), X14 + MOVUPS 208(AX), X13 + MOVUPS 192(AX), X12 + MOVUPS 176(AX), X11 + MOVUPS 160(AX), X10 + MOVUPS 144(AX), X9 + MOVUPS 128(AX), X8 + MOVUPS 112(AX), X7 + MOVUPS 96(AX), X6 + MOVUPS 80(AX), X5 + MOVUPS 64(AX), X4 + MOVUPS 48(AX), X3 + MOVUPS 32(AX), X2 + MOVUPS 16(AX), X1 + MOVUPS 0(AX), X0 + // Restore GPs MOVQ 104(SP), R15 MOVQ 96(SP), R14 MOVQ 88(SP), R13 @@ -74,7 +88,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ 16(SP), DX MOVQ 8(SP), CX MOVQ 0(SP), AX - ADJSP $-368 + ADJSP $-112 POPFQ POPQ BP RET diff --git a/src/runtime/preempt_noxreg.go b/src/runtime/preempt_noxreg.go new file mode 100644 index 0000000000..dfe46559b5 --- /dev/null +++ b/src/runtime/preempt_noxreg.go @@ -0,0 +1,27 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 + +// This provides common support for architectures that DO NOT use extended +// register state in asynchronous preemption. 
+ +package runtime + +type xRegPerG struct{} + +type xRegPerP struct{} + +// xRegState is defined only so the build fails if we try to define a real +// xRegState on a noxreg architecture. +type xRegState struct{} + +func xRegInitAlloc() {} + +func xRegSave(gp *g) {} + +//go:nosplit +func xRegRestore(gp *g) {} + +func (*xRegPerP) free() {} diff --git a/src/runtime/preempt_xreg.go b/src/runtime/preempt_xreg.go new file mode 100644 index 0000000000..f0a47c15d9 --- /dev/null +++ b/src/runtime/preempt_xreg.go @@ -0,0 +1,127 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 + +// This provides common support for architectures that use extended register +// state in asynchronous preemption. +// +// While asynchronous preemption stores general-purpose (GP) registers on the +// preempted goroutine's own stack, extended register state can be used to save +// non-GP state off the stack. In particular, this is meant for large vector +// register files. Currently, we assume this contains only scalar data, though +// we could change this constraint by conservatively scanning this memory. +// +// For an architecture to support extended register state, it must provide a Go +// definition of an xRegState type for storing the state, and its asyncPreempt +// implementation must write this register state to p.xRegs.scratch. + +package runtime + +import "unsafe" + +// xRegPerG stores extended register state while a goroutine is asynchronously +// preempted. This is nil otherwise, so we can reuse a (likely small) pool of +// xRegState objects. +type xRegPerG struct { + state *xRegState +} + +type xRegPerP struct { + // scratch temporary per-P space where [asyncPreempt] saves the register + // state before entering Go. It's quickly copied to per-G state. + scratch xRegState + + // cache is a 1-element allocation cache of extended register state used by + // asynchronous preemption. On entry to preemption, this is used as a simple + // allocation cache. On exit from preemption, the G's xRegState is always + // stored here where it can be restored, and later either freed or reused + // for another preemption. On exit, this serves the dual purpose of + // delay-freeing the allocated xRegState until after we've definitely + // restored it. + cache *xRegState +} + +// xRegAlloc allocates xRegState objects. +var xRegAlloc struct { + lock mutex + alloc fixalloc +} + +func xRegInitAlloc() { + lockInit(&xRegAlloc.lock, lockRankXRegAlloc) + xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys) +} + +// xRegSave saves the extended register state on this P to gp. +// +// This must run on the system stack because it assumes the P won't change. +// +//go:systemstack +func xRegSave(gp *g) { + if gp.xRegs.state != nil { + // Double preempt? + throw("gp.xRegState.p != nil on async preempt") + } + + // Get the place to save the register state. + var dest *xRegState + pp := gp.m.p.ptr() + if pp.xRegs.cache != nil { + // Use the cached allocation. + dest = pp.xRegs.cache + pp.xRegs.cache = nil + } else { + // Allocate a new save block. + lock(&xRegAlloc.lock) + dest = (*xRegState)(xRegAlloc.alloc.alloc()) + unlock(&xRegAlloc.lock) + } + + // Copy state saved in the scratchpad to dest. 
+ // + // If we ever need to save less state (e.g., avoid saving vector registers + // that aren't in use), we could have multiple allocation pools for + // different size states and copy only the registers we need. + *dest = pp.xRegs.scratch + + // Save on the G. + gp.xRegs.state = dest +} + +// xRegRestore prepares the extended register state on gp to be restored. +// +// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find +// it. This means nothing else may use the cache between this call and the +// return to asyncPreempt. This is not quite symmetric with [xRegSave], which +// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy. +// +// This is called with asyncPreempt on the stack and thus must not grow the +// stack. +// +//go:nosplit +func xRegRestore(gp *g) { + if gp.xRegs.state == nil { + throw("gp.xRegState.p == nil on return from async preempt") + } + // If the P has a block cached on it, free that so we can replace it. + pp := gp.m.p.ptr() + if pp.xRegs.cache != nil { + // Don't grow the G stack. + systemstack(func() { + pp.xRegs.free() + }) + } + pp.xRegs.cache = gp.xRegs.state + gp.xRegs.state = nil +} + +func (xRegs *xRegPerP) free() { + if xRegs.cache != nil { + lock(&xRegAlloc.lock) + xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache)) + xRegs.cache = nil + unlock(&xRegAlloc.lock) + } +} diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 9817308430..b2ae46e0e4 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -5799,6 +5799,7 @@ func (pp *p) destroy() { pp.gcAssistTime = 0 gcCleanups.queued += pp.cleanupsQueued pp.cleanupsQueued = 0 + pp.xRegs.free() pp.status = _Pdead } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 96720846b2..789b68e54e 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -491,6 +491,10 @@ type g struct { coroarg *coro // argument during coroutine transfers bubble *synctestBubble + // xRegs stores the extended register state if this G has been + // asynchronously preempted. + xRegs xRegPerG + // Per-G tracer state. trace gTraceState @@ -760,6 +764,11 @@ type p struct { // gcStopTime is the nanotime timestamp that this P last entered _Pgcstop. gcStopTime int64 + // xRegs is the per-P extended register state used by asynchronous + // preemption. This is an empty struct on platforms that don't use extended + // register state. + xRegs xRegPerP + // Padding is no longer needed. False sharing is now not a worry because p is large enough // that its size class is an integer multiple of the cache line size (for any of our architectures). 
} diff --git a/src/runtime/sizeof_test.go b/src/runtime/sizeof_test.go index a5dc8aed34..de859866a5 100644 --- a/src/runtime/sizeof_test.go +++ b/src/runtime/sizeof_test.go @@ -15,13 +15,18 @@ import ( func TestSizeof(t *testing.T) { const _64bit = unsafe.Sizeof(uintptr(0)) == 8 + const xreg = unsafe.Sizeof(runtime.XRegPerG{}) // Varies per architecture var tests = []struct { val any // type as a value _32bit uintptr // size on 32bit platforms _64bit uintptr // size on 64bit platforms }{ - {runtime.G{}, 280, 440}, // g, but exported for testing - {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing + {runtime.G{}, 280 + xreg, 440 + xreg}, // g, but exported for testing + {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing + } + + if xreg > runtime.PtrSize { + t.Errorf("unsafe.Sizeof(xRegPerG) = %d, want <= %d", xreg, runtime.PtrSize) } for _, tt := range tests { -- cgit v1.3 From 9eeb1e7a9afb992e899d3917fce92c01b3fa50c1 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 12 Jun 2025 15:33:41 -0400 Subject: [dev.simd] runtime: save AVX2 and AVX-512 state on asynchronous preemption Based on CL 669415 by shaojunyang@google.com. Change-Id: I574f15c3b18a7179a1573aaf567caf18d8602ef1 Reviewed-on: https://go-review.googlesource.com/c/go/+/680900 LUCI-TryBot-Result: Go LUCI Auto-Submit: Austin Clements Reviewed-by: Cherry Mui --- src/runtime/cpuflags.go | 1 + src/runtime/mkpreempt.go | 74 ++++++++++++++++--- src/runtime/preempt_amd64.go | 40 ++++++----- src/runtime/preempt_amd64.s | 166 +++++++++++++++++++++++++++++++++++-------- 4 files changed, 227 insertions(+), 54 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go index bd1cb328d3..6452364b68 100644 --- a/src/runtime/cpuflags.go +++ b/src/runtime/cpuflags.go @@ -13,6 +13,7 @@ import ( const ( offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX) offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + offsetX86HasAVX512 = unsafe.Offsetof(cpu.X86.HasAVX512) // F+CD+BW+DQ+VL offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS) offsetX86HasRDTSCP = unsafe.Offsetof(cpu.X86.HasRDTSCP) diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index e3dd5046f3..29e8288129 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -285,7 +285,7 @@ func gen386(g *gen) { func genAMD64(g *gen) { const xReg = "AX" // *xRegState - p := g.p + p, label := g.p, g.label // Assign stack offsets. var l = layout{sp: "SP"} @@ -297,15 +297,33 @@ func genAMD64(g *gen) { l.add("MOVQ", reg, 8) } } - lXRegs := layout{sp: xReg} // Non-GP registers - for _, reg := range regNamesAMD64 { - if strings.HasPrefix(reg, "X") { - lXRegs.add("MOVUPS", reg, 16) + // Create layouts for X, Y, and Z registers. + const ( + numXRegs = 16 + numZRegs = 16 // TODO: If we start using upper registers, change to 32 + numKRegs = 8 + ) + lZRegs := layout{sp: xReg} // Non-GP registers + lXRegs, lYRegs := lZRegs, lZRegs + for i := range numZRegs { + lZRegs.add("VMOVDQU64", fmt.Sprintf("Z%d", i), 512/8) + if i < numXRegs { + // Use SSE-only instructions for X registers. + lXRegs.add("MOVUPS", fmt.Sprintf("X%d", i), 128/8) + lYRegs.add("VMOVDQU", fmt.Sprintf("Y%d", i), 256/8) } } - writeXRegs(g.goarch, &lXRegs) - - // TODO: MXCSR register? + for i := range numKRegs { + lZRegs.add("KMOVQ", fmt.Sprintf("K%d", i), 8) + } + // The Z layout is the most general, so we line up the others with that one. + // We don't have to do this, but it results in a nice Go type. 
If we split + // this into multiple types, we probably should stop doing this. + for i := range lXRegs.regs { + lXRegs.regs[i].pos = lZRegs.regs[i].pos + lYRegs.regs[i].pos = lZRegs.regs[i].pos + } + writeXRegs(g.goarch, &lZRegs) p("PUSHQ BP") p("MOVQ SP, BP") @@ -333,16 +351,56 @@ func genAMD64(g *gen) { p("MOVQ g_m(R14), %s", xReg) p("MOVQ m_p(%s), %s", xReg, xReg) p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg) + + // Which registers do we need to save? + p("#ifdef GOEXPERIMENT_simd") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1") + p("JE saveAVX512") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1") + p("JE saveAVX2") + p("#endif") + + // No features. Assume only SSE. + label("saveSSE:") lXRegs.save(g) + p("JMP preempt") + label("saveAVX2:") + lYRegs.save(g) + p("JMP preempt") + + label("saveAVX512:") + lZRegs.save(g) + p("JMP preempt") + + label("preempt:") p("CALL ·asyncPreempt2(SB)") p("// Restore non-GPs from *p.xRegs.cache") p("MOVQ g_m(R14), %s", xReg) p("MOVQ m_p(%s), %s", xReg, xReg) p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg) + + p("#ifdef GOEXPERIMENT_simd") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1") + p("JE restoreAVX512") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1") + p("JE restoreAVX2") + p("#endif") + + label("restoreSSE:") lXRegs.restore(g) + p("JMP restoreGPs") + + label("restoreAVX2:") + lYRegs.restore(g) + p("JMP restoreGPs") + + label("restoreAVX512:") + lZRegs.restore(g) + p("JMP restoreGPs") + label("restoreGPs:") p("// Restore GPs") l.restore(g) p("ADJSP $%d", -l.stack) diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go index 904defac33..44838a1df2 100644 --- a/src/runtime/preempt_amd64.go +++ b/src/runtime/preempt_amd64.go @@ -3,20 +3,28 @@ package runtime type xRegState struct { - X0 [16]byte - X1 [16]byte - X2 [16]byte - X3 [16]byte - X4 [16]byte - X5 [16]byte - X6 [16]byte - X7 [16]byte - X8 [16]byte - X9 [16]byte - X10 [16]byte - X11 [16]byte - X12 [16]byte - X13 [16]byte - X14 [16]byte - X15 [16]byte + Z0 [64]byte + Z1 [64]byte + Z2 [64]byte + Z3 [64]byte + Z4 [64]byte + Z5 [64]byte + Z6 [64]byte + Z7 [64]byte + Z8 [64]byte + Z9 [64]byte + Z10 [64]byte + Z11 [64]byte + Z12 [64]byte + Z13 [64]byte + Z14 [64]byte + Z15 [64]byte + K0 uint64 + K1 uint64 + K2 uint64 + K3 uint64 + K4 uint64 + K5 uint64 + K6 uint64 + K7 uint64 } diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index 0a33ce7f3e..c35de7f3b7 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -36,43 +36,149 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ g_m(R14), AX MOVQ m_p(AX), AX LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX + #ifdef GOEXPERIMENT_simd + CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1 + JE saveAVX512 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JE saveAVX2 + #endif +saveSSE: MOVUPS X0, 0(AX) - MOVUPS X1, 16(AX) - MOVUPS X2, 32(AX) - MOVUPS X3, 48(AX) - MOVUPS X4, 64(AX) - MOVUPS X5, 80(AX) - MOVUPS X6, 96(AX) - MOVUPS X7, 112(AX) - MOVUPS X8, 128(AX) - MOVUPS X9, 144(AX) - MOVUPS X10, 160(AX) - MOVUPS X11, 176(AX) - MOVUPS X12, 192(AX) - MOVUPS X13, 208(AX) - MOVUPS X14, 224(AX) - MOVUPS X15, 240(AX) + MOVUPS X1, 64(AX) + MOVUPS X2, 128(AX) + MOVUPS X3, 192(AX) + MOVUPS X4, 256(AX) + MOVUPS X5, 320(AX) + MOVUPS X6, 384(AX) + MOVUPS X7, 448(AX) + MOVUPS X8, 512(AX) + MOVUPS X9, 576(AX) + MOVUPS X10, 640(AX) + MOVUPS X11, 704(AX) + MOVUPS X12, 768(AX) + MOVUPS X13, 832(AX) + MOVUPS X14, 896(AX) + MOVUPS X15, 960(AX) + 
JMP preempt +saveAVX2: + VMOVDQU Y0, 0(AX) + VMOVDQU Y1, 64(AX) + VMOVDQU Y2, 128(AX) + VMOVDQU Y3, 192(AX) + VMOVDQU Y4, 256(AX) + VMOVDQU Y5, 320(AX) + VMOVDQU Y6, 384(AX) + VMOVDQU Y7, 448(AX) + VMOVDQU Y8, 512(AX) + VMOVDQU Y9, 576(AX) + VMOVDQU Y10, 640(AX) + VMOVDQU Y11, 704(AX) + VMOVDQU Y12, 768(AX) + VMOVDQU Y13, 832(AX) + VMOVDQU Y14, 896(AX) + VMOVDQU Y15, 960(AX) + JMP preempt +saveAVX512: + VMOVDQU64 Z0, 0(AX) + VMOVDQU64 Z1, 64(AX) + VMOVDQU64 Z2, 128(AX) + VMOVDQU64 Z3, 192(AX) + VMOVDQU64 Z4, 256(AX) + VMOVDQU64 Z5, 320(AX) + VMOVDQU64 Z6, 384(AX) + VMOVDQU64 Z7, 448(AX) + VMOVDQU64 Z8, 512(AX) + VMOVDQU64 Z9, 576(AX) + VMOVDQU64 Z10, 640(AX) + VMOVDQU64 Z11, 704(AX) + VMOVDQU64 Z12, 768(AX) + VMOVDQU64 Z13, 832(AX) + VMOVDQU64 Z14, 896(AX) + VMOVDQU64 Z15, 960(AX) + KMOVQ K0, 1024(AX) + KMOVQ K1, 1032(AX) + KMOVQ K2, 1040(AX) + KMOVQ K3, 1048(AX) + KMOVQ K4, 1056(AX) + KMOVQ K5, 1064(AX) + KMOVQ K6, 1072(AX) + KMOVQ K7, 1080(AX) + JMP preempt +preempt: CALL ·asyncPreempt2(SB) // Restore non-GPs from *p.xRegs.cache MOVQ g_m(R14), AX MOVQ m_p(AX), AX MOVQ (p_xRegs+xRegPerP_cache)(AX), AX - MOVUPS 240(AX), X15 - MOVUPS 224(AX), X14 - MOVUPS 208(AX), X13 - MOVUPS 192(AX), X12 - MOVUPS 176(AX), X11 - MOVUPS 160(AX), X10 - MOVUPS 144(AX), X9 - MOVUPS 128(AX), X8 - MOVUPS 112(AX), X7 - MOVUPS 96(AX), X6 - MOVUPS 80(AX), X5 - MOVUPS 64(AX), X4 - MOVUPS 48(AX), X3 - MOVUPS 32(AX), X2 - MOVUPS 16(AX), X1 + #ifdef GOEXPERIMENT_simd + CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1 + JE restoreAVX512 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JE restoreAVX2 + #endif +restoreSSE: + MOVUPS 960(AX), X15 + MOVUPS 896(AX), X14 + MOVUPS 832(AX), X13 + MOVUPS 768(AX), X12 + MOVUPS 704(AX), X11 + MOVUPS 640(AX), X10 + MOVUPS 576(AX), X9 + MOVUPS 512(AX), X8 + MOVUPS 448(AX), X7 + MOVUPS 384(AX), X6 + MOVUPS 320(AX), X5 + MOVUPS 256(AX), X4 + MOVUPS 192(AX), X3 + MOVUPS 128(AX), X2 + MOVUPS 64(AX), X1 MOVUPS 0(AX), X0 + JMP restoreGPs +restoreAVX2: + VMOVDQU 960(AX), Y15 + VMOVDQU 896(AX), Y14 + VMOVDQU 832(AX), Y13 + VMOVDQU 768(AX), Y12 + VMOVDQU 704(AX), Y11 + VMOVDQU 640(AX), Y10 + VMOVDQU 576(AX), Y9 + VMOVDQU 512(AX), Y8 + VMOVDQU 448(AX), Y7 + VMOVDQU 384(AX), Y6 + VMOVDQU 320(AX), Y5 + VMOVDQU 256(AX), Y4 + VMOVDQU 192(AX), Y3 + VMOVDQU 128(AX), Y2 + VMOVDQU 64(AX), Y1 + VMOVDQU 0(AX), Y0 + JMP restoreGPs +restoreAVX512: + KMOVQ 1080(AX), K7 + KMOVQ 1072(AX), K6 + KMOVQ 1064(AX), K5 + KMOVQ 1056(AX), K4 + KMOVQ 1048(AX), K3 + KMOVQ 1040(AX), K2 + KMOVQ 1032(AX), K1 + KMOVQ 1024(AX), K0 + VMOVDQU64 960(AX), Z15 + VMOVDQU64 896(AX), Z14 + VMOVDQU64 832(AX), Z13 + VMOVDQU64 768(AX), Z12 + VMOVDQU64 704(AX), Z11 + VMOVDQU64 640(AX), Z10 + VMOVDQU64 576(AX), Z9 + VMOVDQU64 512(AX), Z8 + VMOVDQU64 448(AX), Z7 + VMOVDQU64 384(AX), Z6 + VMOVDQU64 320(AX), Z5 + VMOVDQU64 256(AX), Z4 + VMOVDQU64 192(AX), Z3 + VMOVDQU64 128(AX), Z2 + VMOVDQU64 64(AX), Z1 + VMOVDQU64 0(AX), Z0 + JMP restoreGPs +restoreGPs: // Restore GPs MOVQ 104(SP), R15 MOVQ 96(SP), R14 -- cgit v1.3 From 0710cce6eb0d75db1fc6c45807773f40edb14d73 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Mon, 30 Jun 2025 16:42:19 -0400 Subject: [dev.simd] runtime: remove write barrier in xRegRestore Currently, there's a write barrier in xRegRestore when it assigns pp.xRegs.cache = gp.xRegs.state. This is bad because that gets called on the asyncPreempt return path, where we have really limited stack space, and we don't currently account for this write barrier. 
We can't simply mark xRegState as sys.NotInHeap because it's also embedded in runtime.p as register scratch space, and runtime.p is heap allocated. Hence, to fix this, we rename xRegState to just "xRegs" and introduce a wrapper "xRegState" type that embeds xRegs and is itself marked sys.NotInHeap. Then, anywhere we need a manually-managed pointer to register state, we use the new type. To ensure this doesn't happen again in the future, we also mark asyncPreempt2 as go:nowritebarrierrec. Change-Id: I5ff4841e55ff20047ff7d253ab659ab77aeb3391 Reviewed-on: https://go-review.googlesource.com/c/go/+/684836 Auto-Submit: Austin Clements Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI --- src/runtime/mkpreempt.go | 2 +- src/runtime/preempt.go | 9 +++++++++ src/runtime/preempt_amd64.go | 2 +- src/runtime/preempt_xreg.go | 16 +++++++++++++--- 4 files changed, 24 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 29e8288129..2bd2ef07fa 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -160,7 +160,7 @@ func writeXRegs(arch string, l *layout) { fmt.Fprintf(g.w, ` package runtime -type xRegState struct { +type xRegs struct { `) pos := 0 for _, reg := range l.regs { diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index d053747d3a..22727df74e 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -302,7 +302,16 @@ func canPreemptM(mp *m) bool { // asyncPreempt is implemented in assembly. func asyncPreempt() +// asyncPreempt2 is the Go continuation of asyncPreempt. +// +// It must be deeply nosplit because there's untyped data on the stack from +// asyncPreempt. +// +// It must not have any write barriers because we need to limit the amount of +// stack it uses. +// //go:nosplit +//go:nowritebarrierrec func asyncPreempt2() { // We can't grow the stack with untyped data from asyncPreempt, so switch to // the system stack right away. diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go index 44838a1df2..88c0ddd34a 100644 --- a/src/runtime/preempt_amd64.go +++ b/src/runtime/preempt_amd64.go @@ -2,7 +2,7 @@ package runtime -type xRegState struct { +type xRegs struct { Z0 [64]byte Z1 [64]byte Z2 [64]byte diff --git a/src/runtime/preempt_xreg.go b/src/runtime/preempt_xreg.go index f0a47c15d9..9e05455ddb 100644 --- a/src/runtime/preempt_xreg.go +++ b/src/runtime/preempt_xreg.go @@ -19,7 +19,17 @@ package runtime -import "unsafe" +import ( + "internal/runtime/sys" + "unsafe" +) + +// xRegState is long-lived extended register state. It is allocated off-heap and +// manually managed. +type xRegState struct { + _ sys.NotInHeap // Allocated from xRegAlloc + regs xRegs +} // xRegPerG stores extended register state while a goroutine is asynchronously // preempted. This is nil otherwise, so we can reuse a (likely small) pool of @@ -31,7 +41,7 @@ type xRegPerG struct { type xRegPerP struct { // scratch temporary per-P space where [asyncPreempt] saves the register // state before entering Go. It's quickly copied to per-G state. - scratch xRegState + scratch xRegs // cache is a 1-element allocation cache of extended register state used by // asynchronous preemption. On entry to preemption, this is used as a simple @@ -84,7 +94,7 @@ func xRegSave(gp *g) { // If we ever need to save less state (e.g., avoid saving vector registers // that aren't in use), we could have multiple allocation pools for // different size states and copy only the registers we need. 
- *dest = pp.xRegs.scratch + dest.regs = pp.xRegs.scratch // Save on the G. gp.xRegs.state = dest -- cgit v1.3 From 574854fd863377a9467625c45ec842fd7d5fc341 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Tue, 8 Jul 2025 19:24:30 +0000 Subject: [dev.simd] runtime: save Z16-Z31 registers in async preempt The register allocation will use the upper register soon, this CL is to enable that. Change-Id: I4d7285e08b95f4e6ebee72594dfbe8d1199f09ed Reviewed-on: https://go-review.googlesource.com/c/go/+/686498 TryBot-Bypass: David Chase Reviewed-by: Cherry Mui Commit-Queue: David Chase --- src/runtime/mkpreempt.go | 2 +- src/runtime/preempt_amd64.go | 16 +++++++++++ src/runtime/preempt_amd64.s | 64 +++++++++++++++++++++++++++++++++----------- 3 files changed, 65 insertions(+), 17 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 2bd2ef07fa..7786f342b5 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -300,7 +300,7 @@ func genAMD64(g *gen) { // Create layouts for X, Y, and Z registers. const ( numXRegs = 16 - numZRegs = 16 // TODO: If we start using upper registers, change to 32 + numZRegs = 32 numKRegs = 8 ) lZRegs := layout{sp: xReg} // Non-GP registers diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go index 88c0ddd34a..78dec40e1f 100644 --- a/src/runtime/preempt_amd64.go +++ b/src/runtime/preempt_amd64.go @@ -19,6 +19,22 @@ type xRegs struct { Z13 [64]byte Z14 [64]byte Z15 [64]byte + Z16 [64]byte + Z17 [64]byte + Z18 [64]byte + Z19 [64]byte + Z20 [64]byte + Z21 [64]byte + Z22 [64]byte + Z23 [64]byte + Z24 [64]byte + Z25 [64]byte + Z26 [64]byte + Z27 [64]byte + Z28 [64]byte + Z29 [64]byte + Z30 [64]byte + Z31 [64]byte K0 uint64 K1 uint64 K2 uint64 diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index c35de7f3b7..a5b949a242 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -95,14 +95,30 @@ saveAVX512: VMOVDQU64 Z13, 832(AX) VMOVDQU64 Z14, 896(AX) VMOVDQU64 Z15, 960(AX) - KMOVQ K0, 1024(AX) - KMOVQ K1, 1032(AX) - KMOVQ K2, 1040(AX) - KMOVQ K3, 1048(AX) - KMOVQ K4, 1056(AX) - KMOVQ K5, 1064(AX) - KMOVQ K6, 1072(AX) - KMOVQ K7, 1080(AX) + VMOVDQU64 Z16, 1024(AX) + VMOVDQU64 Z17, 1088(AX) + VMOVDQU64 Z18, 1152(AX) + VMOVDQU64 Z19, 1216(AX) + VMOVDQU64 Z20, 1280(AX) + VMOVDQU64 Z21, 1344(AX) + VMOVDQU64 Z22, 1408(AX) + VMOVDQU64 Z23, 1472(AX) + VMOVDQU64 Z24, 1536(AX) + VMOVDQU64 Z25, 1600(AX) + VMOVDQU64 Z26, 1664(AX) + VMOVDQU64 Z27, 1728(AX) + VMOVDQU64 Z28, 1792(AX) + VMOVDQU64 Z29, 1856(AX) + VMOVDQU64 Z30, 1920(AX) + VMOVDQU64 Z31, 1984(AX) + KMOVQ K0, 2048(AX) + KMOVQ K1, 2056(AX) + KMOVQ K2, 2064(AX) + KMOVQ K3, 2072(AX) + KMOVQ K4, 2080(AX) + KMOVQ K5, 2088(AX) + KMOVQ K6, 2096(AX) + KMOVQ K7, 2104(AX) JMP preempt preempt: CALL ·asyncPreempt2(SB) @@ -153,14 +169,30 @@ restoreAVX2: VMOVDQU 0(AX), Y0 JMP restoreGPs restoreAVX512: - KMOVQ 1080(AX), K7 - KMOVQ 1072(AX), K6 - KMOVQ 1064(AX), K5 - KMOVQ 1056(AX), K4 - KMOVQ 1048(AX), K3 - KMOVQ 1040(AX), K2 - KMOVQ 1032(AX), K1 - KMOVQ 1024(AX), K0 + KMOVQ 2104(AX), K7 + KMOVQ 2096(AX), K6 + KMOVQ 2088(AX), K5 + KMOVQ 2080(AX), K4 + KMOVQ 2072(AX), K3 + KMOVQ 2064(AX), K2 + KMOVQ 2056(AX), K1 + KMOVQ 2048(AX), K0 + VMOVDQU64 1984(AX), Z31 + VMOVDQU64 1920(AX), Z30 + VMOVDQU64 1856(AX), Z29 + VMOVDQU64 1792(AX), Z28 + VMOVDQU64 1728(AX), Z27 + VMOVDQU64 1664(AX), Z26 + VMOVDQU64 1600(AX), Z25 + VMOVDQU64 1536(AX), Z24 + VMOVDQU64 1472(AX), Z23 + VMOVDQU64 1408(AX), Z22 + VMOVDQU64 1344(AX), Z21 + 
VMOVDQU64 1280(AX), Z20 + VMOVDQU64 1216(AX), Z19 + VMOVDQU64 1152(AX), Z18 + VMOVDQU64 1088(AX), Z17 + VMOVDQU64 1024(AX), Z16 VMOVDQU64 960(AX), Z15 VMOVDQU64 896(AX), Z14 VMOVDQU64 832(AX), Z13 -- cgit v1.3 From 4c311aa38f6e354ec4d9f5882a16c36a2e4b0f36 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Thu, 21 Aug 2025 14:37:18 -0400 Subject: [dev.simd] cmd/compile: ensure the whole X15 register is zeroed On AMD64, we reserve the X15 register as the zero register. Currently we use an SSE instruction to zero it, and we only use it in SSE contexts. When the machine supports AVX, the high bits of the register is not necessarily zeroed. Now that the compiler generates AVX code for SIMD, it would be great to have a zero register in the AVX context. This CL zeroes the whole X15 register if AVX is supported. Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7 Reviewed-on: https://go-review.googlesource.com/c/go/+/698237 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/ssa.go | 33 ++++++++++++++++++++-- src/cmd/compile/internal/ir/symtab.go | 1 + src/cmd/compile/internal/ssagen/ssa.go | 4 +-- .../compile/internal/typecheck/_builtin/runtime.go | 3 +- src/cmd/compile/internal/typecheck/builtin.go | 3 +- src/runtime/asm_amd64.s | 6 ++++ src/runtime/cpuflags.go | 3 +- src/runtime/proc.go | 3 +- src/runtime/race_amd64.s | 3 ++ src/runtime/sys_darwin_amd64.s | 3 ++ src/runtime/sys_dragonfly_amd64.s | 3 ++ src/runtime/sys_freebsd_amd64.s | 6 ++++ src/runtime/sys_linux_amd64.s | 6 ++++ src/runtime/sys_netbsd_amd64.s | 3 ++ src/runtime/sys_openbsd_amd64.s | 3 ++ src/runtime/sys_windows_amd64.s | 3 ++ 16 files changed, 78 insertions(+), 8 deletions(-) (limited to 'src/runtime') diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 3ae3c61764..f511e75e97 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -18,6 +18,7 @@ import ( "cmd/internal/obj" "cmd/internal/obj/x86" "internal/abi" + "internal/buildcfg" ) // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags. @@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail: if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal { // zeroing X15 when entering ABIInternal from ABI0 - opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + zeroX15(s) // set G register from TLS getgFromTLS(s, x86.REG_R14) } @@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { s.Call(v) if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 { // zeroing X15 when entering ABIInternal from ABI0 - opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + zeroX15(s) // set G register from TLS getgFromTLS(s, x86.REG_R14) } @@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { } } +// zeroX15 zeroes the X15 register. +func zeroX15(s *ssagen.State) { + vxorps := func(s *ssagen.State) { + p := s.Prog(x86.AVXORPS) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_X15 + p.AddRestSourceReg(x86.REG_X15) + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_X15 + } + if buildcfg.GOAMD64 >= 3 { + vxorps(s) + return + } + // AVX may not be available, check before zeroing the high bits. 
+ p := s.Prog(x86.ACMPB) + p.From.Type = obj.TYPE_MEM + p.From.Name = obj.NAME_EXTERN + p.From.Sym = ir.Syms.X86HasAVX + p.To.Type = obj.TYPE_CONST + p.To.Offset = 1 + jmp := s.Prog(x86.AJNE) + jmp.To.Type = obj.TYPE_BRANCH + vxorps(s) + sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + jmp.To.SetTarget(sse) +} + // Example instruction: VRSQRTPS X1, X1 func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog { p := s.Prog(v.Op.Asm()) diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index ee0f52fbf3..2222a5444a 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -68,6 +68,7 @@ type symsStruct struct { Loong64HasLAM_BH *obj.LSym Loong64HasLSX *obj.LSym RISCV64HasZbb *obj.LSym + X86HasAVX *obj.LSym X86HasFMA *obj.LSym X86HasPOPCNT *obj.LSym X86HasSSE41 *obj.LSym diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index abb6370a15..57129817f6 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -150,9 +150,10 @@ func InitConfig() { ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert") ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero") ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove") + ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool + ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool - ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool @@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool { } var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym - diff --git a/src/cmd/compile/internal/typecheck/_builtin/runtime.go b/src/cmd/compile/internal/typecheck/_builtin/runtime.go index 296bfdc281..1e4d0b7db6 100644 --- a/src/cmd/compile/internal/typecheck/_builtin/runtime.go +++ b/src/cmd/compile/internal/typecheck/_builtin/runtime.go @@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint) func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32 // architecture variants +var x86HasAVX bool +var x86HasFMA bool var x86HasPOPCNT bool var x86HasSSE41 bool -var x86HasFMA bool var armHasVFPv4 bool var arm64HasATOMICS bool var loong64HasLAMCAS bool diff --git a/src/cmd/compile/internal/typecheck/builtin.go b/src/cmd/compile/internal/typecheck/builtin.go index 535f0fb7e8..6b8c6d7bad 100644 --- a/src/cmd/compile/internal/typecheck/builtin.go +++ b/src/cmd/compile/internal/typecheck/builtin.go @@ -232,9 +232,10 @@ var runtimeDecls = [...]struct { {"libfuzzerHookStrCmp", funcTag, 155}, {"libfuzzerHookEqualFold", funcTag, 155}, {"addCovMeta", funcTag, 157}, + {"x86HasAVX", varTag, 6}, + {"x86HasFMA", varTag, 6}, {"x86HasPOPCNT", varTag, 6}, {"x86HasSSE41", varTag, 6}, - {"x86HasFMA", varTag, 6}, {"armHasVFPv4", varTag, 6}, {"arm64HasATOMICS", varTag, 6}, {"loong64HasLAMCAS", varTag, 6}, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index cf1d49a4ad..f8ebd030b6 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1015,6 +1015,9 @@ needm: // there's no need to handle that. 
Clear R14 so that there's // a bad value in there, in case needm tries to use it. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 MOVQ $runtime·needAndBindM(SB), AX CALL AX @@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0 get_tls(R14) MOVQ g(R14), R14 XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 JMP ·sigpanic(SB) // gcWriteBarrier informs the GC about heap pointer writes. diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go index 6452364b68..67ed081ef6 100644 --- a/src/runtime/cpuflags.go +++ b/src/runtime/cpuflags.go @@ -28,9 +28,10 @@ const ( var ( // Set in runtime.cpuinit. // TODO: deprecate these; use internal/cpu directly. + x86HasAVX bool + x86HasFMA bool x86HasPOPCNT bool x86HasSSE41 bool - x86HasFMA bool armHasVFPv4 bool diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 68647d771f..1d597d59c2 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -766,9 +766,10 @@ func cpuinit(env string) { // to guard execution of instructions that can not be assumed to be always supported. switch GOARCH { case "386", "amd64": + x86HasAVX = cpu.X86.HasAVX + x86HasFMA = cpu.X86.HasFMA x86HasPOPCNT = cpu.X86.HasPOPCNT x86HasSSE41 = cpu.X86.HasSSE41 - x86HasFMA = cpu.X86.HasFMA case "arm": armHasVFPv4 = cpu.ARM.HasVFPv4 diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index e19118bd54..23f2e59e3d 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -456,6 +456,9 @@ call: // Back to Go world, set special registers. // The g register (R14) is preserved in C. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 RET // C->Go callback thunk that allows to call runtime·racesymbolize from C code. diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index cc4e52d305..0091546f20 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s index a223c2cf76..84bf326aad 100644 --- a/src/runtime/sys_dragonfly_amd64.s +++ b/src/runtime/sys_dragonfly_amd64.s @@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s index 977ea093d2..a1fa3a6fa2 100644 --- a/src/runtime/sys_freebsd_amd64.s +++ b/src/runtime/sys_freebsd_amd64.s @@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. 
NOP SP // disable vet stack checking diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 941f70b0e8..02505c2fb0 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s index 2f1ddcdc89..edc7f3d6ee 100644 --- a/src/runtime/sys_netbsd_amd64.s +++ b/src/runtime/sys_netbsd_amd64.s @@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s index ff0bc2416a..734dfe6478 100644 --- a/src/runtime/sys_openbsd_amd64.s +++ b/src/runtime/sys_openbsd_amd64.s @@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index e438599910..b0b4d3cce6 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0 // R14 is cleared in case there's a non-zero value in there // if called from a non-go thread. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 get_tls(AX) -- cgit v1.3 From 91253515831d1d51f9a998a743309c94e1fc4e1e Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Fri, 29 Aug 2025 20:33:19 -0400 Subject: [dev.simd] internal/cpu: report AVX1 and 2 as supported on macOS 15 Rosetta 2 Apparently, on macOS 15 or newer, Rosetta 2 supports AVX1 and 2. However, neither CPUID nor the Apple-recommended sysctl says it has AVX. If AVX is used without checking the CPU feature, it may run fine without SIGILL, but the runtime doesn't know AVX is available therefore save and restore its states. This may lead to value corruption. Check if we are running under Rosetta 2 on macOS 15 or newer. If so, report AVX1 and 2 as supported. 
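For illustration only, the same two probes can be made from ordinary Go code on darwin/amd64; this is a hedged sketch of the detection described above (the sysctl names and the macOS 15 == Darwin kernel 24 mapping are taken from this change), not the runtime's implementation:

    package main

    import (
        "fmt"
        "strconv"
        "strings"
        "syscall"
    )

    func main() {
        // sysctl.proc_translated is 1 when this process runs under Rosetta 2.
        translated, err := syscall.SysctlUint32("sysctl.proc_translated")
        rosetta := err == nil && translated == 1

        // kern.osrelease is the Darwin kernel version, e.g. "24.x.y" on macOS 15.
        release, _ := syscall.Sysctl("kern.osrelease")
        major, _ := strconv.Atoi(strings.SplitN(release, ".", 2)[0])

        fmt.Printf("rosetta=%v darwin=%d assumeAVX=%v\n", rosetta, major, rosetta && major >= 24)
    }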
Change-Id: Ib981379405b1ae28faa378f051096827d760a4cc Reviewed-on: https://go-review.googlesource.com/c/go/+/700055 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao --- src/internal/cpu/cpu_arm64_darwin.go | 23 -------- src/internal/cpu/cpu_darwin.go | 72 +++++++++++++++++++++++++ src/internal/cpu/cpu_x86.go | 5 ++ src/internal/cpu/cpu_x86_darwin.go | 23 ++++++++ src/internal/cpu/cpu_x86_other.go | 9 ++++ src/runtime/cpuflags_amd64_test.go | 19 +++++++ src/runtime/export_test.go | 2 + src/runtime/os_darwin.go | 15 +++++- src/runtime/testdata/testprog/cpuflags_amd64.go | 18 +++++++ src/runtime/testdata/testprog/cpuflags_amd64.s | 9 ++++ 10 files changed, 170 insertions(+), 25 deletions(-) create mode 100644 src/internal/cpu/cpu_darwin.go create mode 100644 src/internal/cpu/cpu_x86_darwin.go create mode 100644 src/internal/cpu/cpu_x86_other.go create mode 100644 src/runtime/cpuflags_amd64_test.go create mode 100644 src/runtime/testdata/testprog/cpuflags_amd64.go create mode 100644 src/runtime/testdata/testprog/cpuflags_amd64.s (limited to 'src/runtime') diff --git a/src/internal/cpu/cpu_arm64_darwin.go b/src/internal/cpu/cpu_arm64_darwin.go index 28b47d60e8..bd89cd4e80 100644 --- a/src/internal/cpu/cpu_arm64_darwin.go +++ b/src/internal/cpu/cpu_arm64_darwin.go @@ -6,8 +6,6 @@ package cpu -import _ "unsafe" // for linkname - func osInit() { // macOS 12 moved these to the hw.optional.arm tree, but as of Go 1.24 we // still support macOS 11. See [Determine Encryption Capabilities]. @@ -29,24 +27,3 @@ func osInit() { ARM64.HasSHA1 = true ARM64.HasSHA2 = true } - -//go:noescape -func getsysctlbyname(name []byte) (int32, int32) - -// sysctlEnabled should be an internal detail, -// but widely used packages access it using linkname. -// Notable members of the hall of shame include: -// - github.com/bytedance/gopkg -// - github.com/songzhibin97/gkit -// -// Do not remove or change the type signature. -// See go.dev/issue/67401. -// -//go:linkname sysctlEnabled -func sysctlEnabled(name []byte) bool { - ret, value := getsysctlbyname(name) - if ret < 0 { - return false - } - return value > 0 -} diff --git a/src/internal/cpu/cpu_darwin.go b/src/internal/cpu/cpu_darwin.go new file mode 100644 index 0000000000..2d4ac54fc2 --- /dev/null +++ b/src/internal/cpu/cpu_darwin.go @@ -0,0 +1,72 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build darwin && !ios + +package cpu + +import _ "unsafe" // for linkname + +// Pushed from runtime. +// +//go:noescape +func sysctlbynameInt32(name []byte) (int32, int32) + +// Pushed from runtime. +// +//go:noescape +func sysctlbynameBytes(name, out []byte) int32 + +// sysctlEnabled should be an internal detail, +// but widely used packages access it using linkname. +// Notable members of the hall of shame include: +// - github.com/bytedance/gopkg +// - github.com/songzhibin97/gkit +// +// Do not remove or change the type signature. +// See go.dev/issue/67401. +// +//go:linkname sysctlEnabled +func sysctlEnabled(name []byte) bool { + ret, value := sysctlbynameInt32(name) + if ret < 0 { + return false + } + return value > 0 +} + +// darwinKernelVersionCheck reports if Darwin kernel version is at +// least major.minor.patch. +// +// Code borrowed from x/sys/cpu. 
+func darwinKernelVersionCheck(major, minor, patch int) bool { + var release [256]byte + ret := sysctlbynameBytes([]byte("kern.osrelease\x00"), release[:]) + if ret < 0 { + return false + } + + var mmp [3]int + c := 0 +Loop: + for _, b := range release[:] { + switch { + case b >= '0' && b <= '9': + mmp[c] = 10*mmp[c] + int(b-'0') + case b == '.': + c++ + if c > 2 { + return false + } + case b == 0: + break Loop + default: + return false + } + } + if c != 2 { + return false + } + return mmp[0] > major || mmp[0] == major && (mmp[1] > minor || mmp[1] == minor && mmp[2] >= patch) +} diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index f07fc82df1..ef1874ad68 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go @@ -114,6 +114,7 @@ func doinit() { maxID, _, _, _ := cpuid(0, 0) if maxID < 1 { + osInit() return } @@ -158,6 +159,7 @@ func doinit() { X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX if maxID < 7 { + osInit() return } @@ -194,6 +196,7 @@ func doinit() { maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0) if maxExtendedInformation < 0x80000001 { + osInit() return } @@ -217,6 +220,8 @@ func doinit() { X86.HasAVXVNNI = isSet(4, eax71) } } + + osInit() } func isSet(hwc uint32, value uint32) bool { diff --git a/src/internal/cpu/cpu_x86_darwin.go b/src/internal/cpu/cpu_x86_darwin.go new file mode 100644 index 0000000000..12380a7802 --- /dev/null +++ b/src/internal/cpu/cpu_x86_darwin.go @@ -0,0 +1,23 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (386 || amd64) && darwin && !ios + +package cpu + +func osInit() { + if isRosetta() && darwinKernelVersionCheck(24, 0, 0) { + // Apparently, on macOS 15 (Darwin kernel version 24) or newer, + // Rosetta 2 supports AVX1 and 2. However, neither CPUID nor + // sysctl says it has AVX. Detect this situation here and report + // AVX1 and 2 as supported. + // TODO: check if any other feature is actually supported. + X86.HasAVX = true + X86.HasAVX2 = true + } +} + +func isRosetta() bool { + return sysctlEnabled([]byte("sysctl.proc_translated\x00")) +} diff --git a/src/internal/cpu/cpu_x86_other.go b/src/internal/cpu/cpu_x86_other.go new file mode 100644 index 0000000000..824131226c --- /dev/null +++ b/src/internal/cpu/cpu_x86_other.go @@ -0,0 +1,9 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (386 || amd64) && (!darwin || ios) + +package cpu + +func osInit() {} diff --git a/src/runtime/cpuflags_amd64_test.go b/src/runtime/cpuflags_amd64_test.go new file mode 100644 index 0000000000..f238e7fdf2 --- /dev/null +++ b/src/runtime/cpuflags_amd64_test.go @@ -0,0 +1,19 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "runtime" + "testing" +) + +func TestHasAVX(t *testing.T) { + t.Parallel() + output := runTestProg(t, "testprog", "CheckAVX") + ok := output == "OK\n" + if *runtime.X86HasAVX != ok { + t.Fatalf("x86HasAVX: %v, CheckAVX got:\n%s", *runtime.X86HasAVX, output) + } +} diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 1f55717f0a..fc77b535da 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -1940,3 +1940,5 @@ func (t *TraceStackTable) Reset() { func TraceStack(gp *G, tab *TraceStackTable) { traceStack(0, gp, (*traceStackTable)(tab)) } + +var X86HasAVX = &x86HasAVX diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index 0c7144e9d0..ab8aa8037b 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -157,11 +157,22 @@ func sysctlbynameInt32(name []byte) (int32, int32) { return ret, out } -//go:linkname internal_cpu_getsysctlbyname internal/cpu.getsysctlbyname -func internal_cpu_getsysctlbyname(name []byte) (int32, int32) { +func sysctlbynameBytes(name, out []byte) int32 { + nout := uintptr(len(out)) + ret := sysctlbyname(&name[0], &out[0], &nout, nil, 0) + return ret +} + +//go:linkname internal_cpu_sysctlbynameInt32 internal/cpu.sysctlbynameInt32 +func internal_cpu_sysctlbynameInt32(name []byte) (int32, int32) { return sysctlbynameInt32(name) } +//go:linkname internal_cpu_sysctlbynameBytes internal/cpu.sysctlbynameBytes +func internal_cpu_sysctlbynameBytes(name, out []byte) int32 { + return sysctlbynameBytes(name, out) +} + const ( _CTL_HW = 6 _HW_NCPU = 3 diff --git a/src/runtime/testdata/testprog/cpuflags_amd64.go b/src/runtime/testdata/testprog/cpuflags_amd64.go new file mode 100644 index 0000000000..d53eacbe99 --- /dev/null +++ b/src/runtime/testdata/testprog/cpuflags_amd64.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "fmt" + +func init() { + register("CheckAVX", CheckAVX) +} + +func CheckAVX() { + checkAVX() + fmt.Println("OK") +} + +func checkAVX() diff --git a/src/runtime/testdata/testprog/cpuflags_amd64.s b/src/runtime/testdata/testprog/cpuflags_amd64.s new file mode 100644 index 0000000000..1610c5729a --- /dev/null +++ b/src/runtime/testdata/testprog/cpuflags_amd64.s @@ -0,0 +1,9 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT ·checkAVX(SB), NOSPLIT|NOFRAME, $0-0 + VXORPS X1, X2, X3 + RET -- cgit v1.3 From 2b50ffe172ee638a88e2750481eaeeac7d3bedfa Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Mon, 22 Sep 2025 10:57:29 -0400 Subject: [dev.simd] cmd/compile: remove stores to unread parameters Currently, we remove stores to local variables that are not read. We don't do that for arguments. But arguments and locals are essentially the same. Arguments are passed by value, and are not expected to be read in the caller's frame. So we can remove the writes to them as well. One exception is the cgo_unsafe_arg directive, which makes all the arguments effectively address-taken. cgo_unsafe_arg implies ABI0, so we just skip ABI0 functions' arguments. 
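As a hedged illustration (hypothetical functions, not part of this change): a store to a
by-value parameter that is never read again and never address-taken can now be removed,
while an address-taken parameter keeps its stores:

    var sink *int

    // The write to a is dead: a is not read afterwards and its address never escapes.
    func dead(a [2]int) {
        a[1] = 123
    }

    // The write must stay: a's address is taken and escapes, so the store is observable.
    func live(a [2]int) {
        a[1] = 123
        sink = &a[1]
    }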
Change-Id: I8999fc50da6a87f22c1ec23e9a0c15483b6f7df8 Reviewed-on: https://go-review.googlesource.com/c/go/+/705815 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase Reviewed-by: Junyang Shao --- src/cmd/compile/internal/ssa/deadstore.go | 22 ++++++++++++++++++---- src/runtime/testdata/testprog/badtraceback.go | 2 ++ test/codegen/stack.go | 6 ++++++ 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'src/runtime') diff --git a/src/cmd/compile/internal/ssa/deadstore.go b/src/cmd/compile/internal/ssa/deadstore.go index 9e67e83399..d0adff788c 100644 --- a/src/cmd/compile/internal/ssa/deadstore.go +++ b/src/cmd/compile/internal/ssa/deadstore.go @@ -7,6 +7,7 @@ package ssa import ( "cmd/compile/internal/ir" "cmd/compile/internal/types" + "cmd/internal/obj" ) // dse does dead-store elimination on the Function. @@ -213,7 +214,7 @@ func elimDeadAutosGeneric(f *Func) { case OpAddr, OpLocalAddr: // Propagate the address if it points to an auto. n, ok := v.Aux.(*ir.Name) - if !ok || n.Class != ir.PAUTO { + if !ok || (n.Class != ir.PAUTO && !isABIInternalParam(f, n)) { return } if addr[v] == nil { @@ -224,7 +225,7 @@ func elimDeadAutosGeneric(f *Func) { case OpVarDef: // v should be eliminated if we eliminate the auto. n, ok := v.Aux.(*ir.Name) - if !ok || n.Class != ir.PAUTO { + if !ok || (n.Class != ir.PAUTO && !isABIInternalParam(f, n)) { return } if elim[v] == nil { @@ -240,7 +241,7 @@ func elimDeadAutosGeneric(f *Func) { // may not be used by the inline code, but will be used by // panic processing). n, ok := v.Aux.(*ir.Name) - if !ok || n.Class != ir.PAUTO { + if !ok || (n.Class != ir.PAUTO && !isABIInternalParam(f, n)) { return } if !used.Has(n) { @@ -373,7 +374,7 @@ func elimUnreadAutos(f *Func) { if !ok { continue } - if n.Class != ir.PAUTO { + if n.Class != ir.PAUTO && !isABIInternalParam(f, n) { continue } @@ -413,3 +414,16 @@ func elimUnreadAutos(f *Func) { store.Op = OpCopy } } + +// isABIInternalParam returns whether n is a parameter of an ABIInternal +// function. For dead store elimination, we can treat parameters the same +// way as autos. Storing to a parameter can be removed if it is not read +// or address-taken. +// +// We check ABI here because for a cgo_unsafe_arg function (which is ABI0), +// all the args are effectively address-taken, but not necessarily have +// an Addr or LocalAddr op. We could probably just check for cgo_unsafe_arg, +// but ABIInternal is mostly what matters. +func isABIInternalParam(f *Func, n *ir.Name) bool { + return n.Class == ir.PPARAM && f.ABISelf.Which() == obj.ABIInternal +} diff --git a/src/runtime/testdata/testprog/badtraceback.go b/src/runtime/testdata/testprog/badtraceback.go index 09aa2b877e..455118a543 100644 --- a/src/runtime/testdata/testprog/badtraceback.go +++ b/src/runtime/testdata/testprog/badtraceback.go @@ -44,6 +44,8 @@ func badLR2(arg int) { lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff)) *lrPtr = 0xbad + runtime.KeepAlive(lrPtr) // prevent dead store elimination + // Print a backtrace. This should include diagnostics for the // bad return PC and a hex dump. panic("backtrace") diff --git a/test/codegen/stack.go b/test/codegen/stack.go index 4e45d68f38..59284ae888 100644 --- a/test/codegen/stack.go +++ b/test/codegen/stack.go @@ -168,3 +168,9 @@ func getp1() *[4]int { func getp2() *[4]int { return nil } + +// Store to an argument without read can be removed. 
+func storeArg(a [2]int) { + // amd64:-`MOVQ\t\$123,.*\.a\+\d+\(SP\)` + a[1] = 123 +} -- cgit v1.3 From 25c36b95d1523f22d4c46ec237acc03e00540e0a Mon Sep 17 00:00:00 2001 From: David Chase Date: Fri, 19 Sep 2025 13:07:59 -0400 Subject: [dev.simd] simd, cmd/compile: add 128 bit select-from-pair Using this name until a better one appears: x.Select128FromPair(3, 2, y) Includes test for constant and variable case. Checks for unexpected immediates (using the zeroing flag, which is not supported for this intrinsic) and panics. Change-Id: I9249475d6572968c127b4ee9e00328d717c07578 Reviewed-on: https://go-review.googlesource.com/c/go/+/705496 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 2 + src/cmd/compile/internal/ir/symtab.go | 1 + src/cmd/compile/internal/ssa/_gen/simdAMD64.rules | 6 ++ src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go | 2 + .../compile/internal/ssa/_gen/simdgenericOps.go | 6 ++ src/cmd/compile/internal/ssa/opGen.go | 74 ++++++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 18 ++++++ src/cmd/compile/internal/ssagen/intrinsics.go | 26 +++++++- src/cmd/compile/internal/ssagen/simdintrinsics.go | 6 ++ src/cmd/compile/internal/ssagen/ssa.go | 1 + src/runtime/panic.go | 7 ++ src/simd/_gen/simdgen/gen_simdIntrinsics.go | 2 + src/simd/_gen/simdgen/gen_simdTypes.go | 9 +++ src/simd/_gen/simdgen/ops/Moves/categories.yaml | 8 ++- src/simd/_gen/simdgen/ops/Moves/go.yaml | 72 ++++++++++++++++++++- src/simd/_gen/unify/domain.go | 4 +- src/simd/internal/simd_test/simd_test.go | 74 ++++++++++++++++++++++ src/simd/ops_amd64.go | 56 ++++++++++++++++ 18 files changed, 369 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index d69740cd96..a4d2452435 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1053,6 +1053,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VGF2P8AFFINEINVQB128, ssa.OpAMD64VGF2P8AFFINEINVQB256, ssa.OpAMD64VGF2P8AFFINEINVQB512, + ssa.OpAMD64VPERM2F128256, + ssa.OpAMD64VPERM2I128256, ssa.OpAMD64VINSERTF128256, ssa.OpAMD64VINSERTF64X4512, ssa.OpAMD64VINSERTI128256, diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index 2222a5444a..0cfa2a2262 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -45,6 +45,7 @@ type symsStruct struct { PanicdottypeI *obj.LSym Panicnildottype *obj.LSym Panicoverflow *obj.LSym + PanicSimdImm *obj.LSym Racefuncenter *obj.LSym Racefuncexit *obj.LSym Raceread *obj.LSym diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 9db223c04f..1eab8b5e6d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -938,6 +938,12 @@ (ScaleFloat64x2 ...) => (VSCALEFPD128 ...) (ScaleFloat64x4 ...) => (VSCALEFPD256 ...) (ScaleFloat64x8 ...) => (VSCALEFPD512 ...) +(Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...) +(Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...) +(Select128FromPairInt32x8 ...) => (VPERM2I128256 ...) +(Select128FromPairInt64x4 ...) => (VPERM2I128256 ...) +(Select128FromPairUint32x8 ...) => (VPERM2I128256 ...) +(Select128FromPairUint64x4 ...) => (VPERM2I128256 ...) (SetElemFloat32x4 ...) => (VPINSRD128 ...) (SetElemFloat64x2 ...) => (VPINSRQ128 ...) (SetElemInt8x16 ...) 
=> (VPINSRB128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index ba91fb3fc9..5e1da3249f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -1212,6 +1212,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPERM2F128256", argLength: 2, reg: v21, asm: "VPERM2F128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPERM2I128256", argLength: 2, reg: v21, asm: "VPERM2I128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRQ128", argLength: 2, reg: vgpv, asm: "VPINSRQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 81a1dff137..aa088dbf0b 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1199,6 +1199,12 @@ func simdGenericOps() []opData { {name: "RoundToEvenScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RoundToEvenScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RoundToEvenScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint64x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemFloat64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 792a1ca08f..105d1a803c 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2444,6 +2444,8 @@ const ( OpAMD64VPRORQMasked128 OpAMD64VPRORQMasked256 OpAMD64VPRORQMasked512 + OpAMD64VPERM2F128256 + OpAMD64VPERM2I128256 OpAMD64VPINSRD128 OpAMD64VPINSRQ128 OpAMD64VPINSRB128 @@ -6594,6 +6596,12 @@ const ( OpRoundToEvenScaledResidueFloat64x2 OpRoundToEvenScaledResidueFloat64x4 OpRoundToEvenScaledResidueFloat64x8 + OpSelect128FromPairFloat32x8 + OpSelect128FromPairFloat64x4 + OpSelect128FromPairInt32x8 + OpSelect128FromPairInt64x4 + OpSelect128FromPairUint32x8 + 
OpSelect128FromPairUint64x4 OpSetElemFloat32x4 OpSetElemFloat64x2 OpSetElemInt8x16 @@ -37656,6 +37664,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPERM2F128256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPERM2F128, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPERM2I128256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPERM2I128, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPINSRD128", auxType: auxUInt8, @@ -82360,6 +82398,42 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "Select128FromPairFloat32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairFloat64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairInt32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairInt64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairUint32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairUint64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, { name: "SetElemFloat32x4", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index ca9f9ae17b..bc611fc44c 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -4991,6 +4991,24 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSelect0(v) case OpSelect1: return rewriteValueAMD64_OpSelect1(v) + case OpSelect128FromPairFloat32x8: + v.Op = OpAMD64VPERM2F128256 + return true + case OpSelect128FromPairFloat64x4: + v.Op = OpAMD64VPERM2F128256 + return true + case OpSelect128FromPairInt32x8: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairInt64x4: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairUint32x8: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairUint64x4: + v.Op = OpAMD64VPERM2I128256 + return true case OpSelectN: return rewriteValueAMD64_OpSelectN(v) case OpSetElemFloat32x4: diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 985d899a71..4c5cd9ef2c 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1842,7 +1842,9 @@ func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp fu for i, t := range targets { s.startBlock(t) genOp(s, i) - t.AddEdgeTo(bEnd) + if t.Kind != ssa.BlockExit { + t.AddEdgeTo(bEnd) + } s.endBlock() } @@ -1899,6 +1901,28 @@ func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.Ca } } +// Two immediates instead of just 1. Offset is ignored, so it is a _ parameter instead. 
+func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 { + i1, i2 := args[1].AuxInt, args[2].AuxInt + return s.newValue2I(op, t, i1+i2<<4, args[0], args[3]) + } + four := s.constInt64(types.Types[types.TUINT8], 4) + shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four) + combined := s.newValue2(ssa.OpAdd8, types.Types[types.TUINT8], args[1], shifted) + return immJumpTable(s, combined, n, func(sNew *state, idx int) { + // Encode as int8 due to requirement of AuxInt, check its comment for details. + // TODO for "zeroing" values, panic instead. + if idx & ^(3+3<<4) == 0 { + s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx)), args[0], args[3]) + } else { + sNew.rtcall(ir.Syms.PanicSimdImm, false, nil) + } + }) + } +} + func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if args[2].Op == ssa.OpConst8 { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 41858a7745..a62b3882c3 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -950,6 +950,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Float64x2.SetElem", opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64) diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 57129817f6..37aad360f2 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -141,6 +141,7 @@ func InitConfig() { ir.Syms.Panicnildottype = typecheck.LookupRuntimeFunc("panicnildottype") ir.Syms.Panicoverflow = typecheck.LookupRuntimeFunc("panicoverflow") ir.Syms.Panicshift = typecheck.LookupRuntimeFunc("panicshift") + ir.Syms.PanicSimdImm = typecheck.LookupRuntimeFunc("panicSimdImm") ir.Syms.Racefuncenter = 
typecheck.LookupRuntimeFunc("racefuncenter") ir.Syms.Racefuncexit = typecheck.LookupRuntimeFunc("racefuncexit") ir.Syms.Raceread = typecheck.LookupRuntimeFunc("raceread") diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 8c91c9435a..d7bce70fe5 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -341,6 +341,13 @@ func panicmemAddr(addr uintptr) { panic(errorAddressString{msg: "invalid memory address or nil pointer dereference", addr: addr}) } +var simdImmError = error(errorString("out-of-range immediate for simd intrinsic")) + +func panicSimdImm() { + panicCheck2("simd immediate error") + panic(simdImmError) +} + // Create a new deferred function fn, which has no arguments and results. // The compiler turns a defer statement into a call to this. func deferproc(fn func()) { diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go index 353bc46b31..4b27f7ce5f 100644 --- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go +++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go @@ -56,6 +56,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . {{end}} {{define "op2Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) {{end}} +{{define "op2Imm8_II"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) +{{end}} {{define "op3Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) {{end}} {{define "op3Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index 0d5d08b7ed..8944c35cad 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -354,6 +354,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y" func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8) {{.GoType}} {{end}} +{{define "op2Imm8_II"}} +{{if .Documentation}}{{.Documentation}} +//{{end}} +// {{.ImmName}} result in better performance when they are constants, non-constant values will be translated into a jump table. +// {{.ImmName}} should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}} +func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}} +{{end}} {{define "op3Imm8"}} {{if .Documentation}}{{.Documentation}} diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index e9a7fef202..0c733e12ee 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -174,4 +174,10 @@ // then 1, selecting element 1 from x's upper 128 bits (9), then 1, // selecting element 1 from y's upper 128 bits (11). // This differs from the same method applied to a 32x8 vector, where - // the 8-bit constant performs the same selection on both subvectors. \ No newline at end of file + // the 8-bit constant performs the same selection on both subvectors. 
+ +- go: Select128FromPair + commutative: false + documentation: !string |- + // NAME selects the low and high 128-bit halves from the 128-bit halves + // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 46599b7bd7..495b9ed6fa 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -721,7 +721,6 @@ out: - *v - - go: concatSelectedConstantGrouped asm: VSHUFPD in: @@ -771,3 +770,74 @@ inVariant: [] out: - *v + +- go: Select128FromPair + asm: VPERM2F128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: float + bits: 256 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2F128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: float + bits: 256 + OverwriteElementBits: 32 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 64 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 32 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v diff --git a/src/simd/_gen/unify/domain.go b/src/simd/_gen/unify/domain.go index 1e0f2be63d..8eb5deab2b 100644 --- a/src/simd/_gen/unify/domain.go +++ b/src/simd/_gen/unify/domain.go @@ -106,8 +106,8 @@ func (b *DefBuilder) Add(name string, v *Value) { if b.fields == nil { b.fields = make(map[string]*Value) } - if _, ok := b.fields[name]; ok { - panic(fmt.Sprintf("duplicate field %q", name)) + if old, ok := b.fields[name]; ok { + panic(fmt.Sprintf("duplicate field %q, added value is %v, old value is %v", name, v, old)) } b.fields[name] = v } diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 6deadde45e..e38f7eea01 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -815,3 +815,77 @@ func TestSelectFromPairConstGroupedUint32x16(t *testing.T) { foo(lhhl, 0, 4, 5, 1) foo(hllh, 4, 0, 1, 5) } + +func TestSelect128FromPair(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := x.Select128FromPair(0, 0, y) + ab := x.Select128FromPair(0, 1, y) + bc := x.Select128FromPair(1, 2, y) + cd := x.Select128FromPair(2, 3, y) + da := x.Select128FromPair(3, 0, y) + dc := x.Select128FromPair(3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 2) +} + +func TestSelect128FromPairError(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + defer func() { + if r := recover(); r != nil { + t.Logf("Saw expected panic %v", r) + } + }() + _ = x.Select128FromPair(0, 4, y) + + t.Errorf("Should have panicked") +} + +//go:noinline +func select128FromPair(x simd.Uint64x4, lo, hi uint8, y simd.Uint64x4) simd.Uint64x4 { + return x.Select128FromPair(lo, hi, y) +} + 
+func TestSelect128FromPairVar(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := select128FromPair(x, 0, 0, y) + ab := select128FromPair(x, 0, 1, y) + bc := select128FromPair(x, 1, 2, y) + cd := select128FromPair(x, 2, 3, y) + da := select128FromPair(x, 3, 0, y) + dc := select128FromPair(x, 3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 2) + +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index a104601ed7..91e7d91842 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -5576,6 +5576,62 @@ func (x Float64x4) Scale(y Float64x4) Float64x4 // Asm: VSCALEFPD, CPU Feature: AVX512 func (x Float64x8) Scale(y Float64x8) Float64x8 +/* Select128FromPair */ + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2F128, CPU Feature: AVX +func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2F128, CPU Feature: AVX +func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. 
+// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4 + /* SetElem */ // SetElem sets a single constant-indexed element's value. -- cgit v1.3
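As a usage sketch of the API added in the last change above (assuming an AVX2-capable
amd64 machine and that the experimental simd package is enabled, e.g. via GOEXPERIMENT=simd):

    package main

    import (
        "fmt"
        "simd"
    )

    func main() {
        x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
        y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})

        // The 128-bit halves are numbered 0..3: x's low, x's high, y's low, y's high.
        // Pick x's high half for the result's low 128 bits and y's low half for its high 128 bits.
        z := x.Select128FromPair(1, 2, y)

        out := make([]uint64, 4)
        z.StoreSlice(out)
        fmt.Println(out) // [2 3 4 5]
    }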