From dfa6c7426316fb81c5f29b260b2de7822680ffd3 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 12 Jun 2025 18:37:01 -0400 Subject: [dev.simd] runtime: eliminate global state in mkpreempt.go We're going to start writing two files, so having a single global file we're writing will be a problem. This has no effect on the generated code. Change-Id: I49897ea0c6500a29eac89b597d75c0eb3e9b6706 Reviewed-on: https://go-review.googlesource.com/c/go/+/680897 Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI --- src/runtime/mkpreempt.go | 166 +++++++++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 72 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 6a9cf77a43..ec900a23d2 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -73,16 +73,14 @@ var regNamesAMD64 = []string{ "X15", } -var out io.Writer - -var arches = map[string]func(){ +var arches = map[string]func(g *gen){ "386": gen386, "amd64": genAMD64, "arm": genARM, "arm64": genARM64, "loong64": genLoong64, - "mips64x": func() { genMIPS(true) }, - "mipsx": func() { genMIPS(false) }, + "mips64x": func(g *gen) { genMIPS(g, true) }, + "mipsx": func(g *gen) { genMIPS(g, false) }, "ppc64x": genPPC64, "riscv64": genRISCV64, "s390x": genS390X, @@ -93,53 +91,58 @@ var beLe = map[string]bool{"mips64x": true, "mipsx": true, "ppc64x": true} func main() { flag.Parse() if flag.NArg() > 0 { - out = os.Stdout for _, arch := range flag.Args() { - gen, ok := arches[arch] + genFn, ok := arches[arch] if !ok { log.Fatalf("unknown arch %s", arch) } - header(arch) - gen() + g := gen{os.Stdout, arch} + g.asmHeader() + genFn(&g) } return } - for arch, gen := range arches { + for arch, genFn := range arches { f, err := os.Create(fmt.Sprintf("preempt_%s.s", arch)) if err != nil { log.Fatal(err) } - out = f - header(arch) - gen() + g := gen{f, arch} + g.asmHeader() + genFn(&g) if err := f.Close(); err != nil { log.Fatal(err) } } } -func header(arch string) { - fmt.Fprintf(out, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n") - if beLe[arch] { - base := arch[:len(arch)-1] - fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base) +type gen struct { + w io.Writer + goarch string +} + +func (g *gen) asmHeader() { + fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n") + if beLe[g.goarch] { + base := g.goarch[:len(g.goarch)-1] + fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base) } - fmt.Fprintf(out, "#include \"go_asm.h\"\n") - if arch == "amd64" { - fmt.Fprintf(out, "#include \"asm_amd64.h\"\n") + fmt.Fprintf(g.w, "#include \"go_asm.h\"\n") + if g.goarch == "amd64" { + fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n") } - fmt.Fprintf(out, "#include \"textflag.h\"\n\n") - fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") + fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n") + fmt.Fprintf(g.w, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") } -func p(f string, args ...any) { +func (g *gen) p(f string, args ...any) { fmted := fmt.Sprintf(f, args...) 
- fmt.Fprintf(out, "\t%s\n", strings.ReplaceAll(fmted, "\n", "\n\t")) + fmt.Fprintf(g.w, "\t%s\n", strings.ReplaceAll(fmted, "\n", "\n\t")) } -func label(l string) { - fmt.Fprintf(out, "%s\n", l) +func (g *gen) label(l string) { + fmt.Fprintf(g.w, "%s\n", l) } type layout struct { @@ -176,28 +179,30 @@ func (l *layout) addSpecial(save, restore string, size int) { l.stack += size } -func (l *layout) save() { +func (l *layout) save(g *gen) { for _, reg := range l.regs { if reg.save != "" { - p(reg.save, reg.pos) + g.p(reg.save, reg.pos) } else { - p("%s %s, %d(%s)", reg.saveOp, reg.reg, reg.pos, l.sp) + g.p("%s %s, %d(%s)", reg.saveOp, reg.reg, reg.pos, l.sp) } } } -func (l *layout) restore() { +func (l *layout) restore(g *gen) { for i := len(l.regs) - 1; i >= 0; i-- { reg := l.regs[i] if reg.restore != "" { - p(reg.restore, reg.pos) + g.p(reg.restore, reg.pos) } else { - p("%s %d(%s), %s", reg.restoreOp, reg.pos, l.sp, reg.reg) + g.p("%s %d(%s), %s", reg.restoreOp, reg.pos, l.sp, reg.reg) } } } -func gen386() { +func gen386(g *gen) { + p := g.p + p("PUSHFL") // Save general purpose registers. var l = layout{sp: "SP"} @@ -218,22 +223,24 @@ func gen386() { p("ADJSP $%d", lSSE.stack) p("NOP SP") - l.save() + l.save(g) p("#ifndef %s", softfloat) - lSSE.save() + lSSE.save(g) p("#endif") p("CALL ·asyncPreempt2(SB)") p("#ifndef %s", softfloat) - lSSE.restore() + lSSE.restore(g) p("#endif") - l.restore() + l.restore(g) p("ADJSP $%d", -lSSE.stack) p("POPFL") p("RET") } -func genAMD64() { +func genAMD64(g *gen) { + p := g.p + // Assign stack offsets. var l = layout{sp: "SP"} for _, reg := range regNamesAMD64 { @@ -262,19 +269,21 @@ func genAMD64() { p("// But vet doesn't know ADJSP, so suppress vet stack checking") p("NOP SP") - l.save() + l.save(g) - lSSE.save() + lSSE.save(g) p("CALL ·asyncPreempt2(SB)") - lSSE.restore() - l.restore() + lSSE.restore(g) + l.restore(g) p("ADJSP $%d", -lSSE.stack) p("POPFQ") p("POPQ BP") p("RET") } -func genARM() { +func genARM(g *gen) { + p := g.p + // Add integer registers R0-R12. // R13 (SP), R14 (LR), R15 (PC) are special and not saved here. var l = layout{sp: "R13", stack: 4} // add LR slot @@ -303,22 +312,23 @@ func genARM() { } p("MOVW.W R14, -%d(R13)", lfp.stack) // allocate frame, save LR - l.save() + l.save(g) p("MOVB ·goarmsoftfp(SB), R0\nCMP $0, R0\nBNE nofp") // test goarmsoftfp, and skip FP registers if goarmsoftfp!=0. - lfp.save() - label("nofp:") + lfp.save(g) + g.label("nofp:") p("CALL ·asyncPreempt2(SB)") p("MOVB ·goarmsoftfp(SB), R0\nCMP $0, R0\nBNE nofp2") // test goarmsoftfp, and skip FP registers if goarmsoftfp!=0. - lfp.restore() - label("nofp2:") - l.restore() + lfp.restore(g) + g.label("nofp2:") + l.restore(g) p("MOVW %d(R13), R14", lfp.stack) // sigctxt.pushCall pushes LR on stack, restore it p("MOVW.P %d(R13), R15", lfp.stack+4) // load PC, pop frame (including the space pushed by sigctxt.pushCall) p("UNDEF") // shouldn't get here } -func genARM64() { +func genARM64(g *gen) { + p := g.p // Add integer registers R0-R26 // R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special // and not saved here. 
@@ -362,9 +372,9 @@ func genARM64() { p("MOVD R30, (RSP)") p("#endif") - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p("MOVD -8(RSP), R29") // restore frame pointer @@ -373,7 +383,9 @@ func genARM64() { p("RET (R27)") } -func genMIPS(_64bit bool) { +func genMIPS(g *gen, _64bit bool) { + p := g.p + mov := "MOVW" movf := "MOVF" add := "ADD" @@ -428,15 +440,15 @@ func genMIPS(_64bit bool) { p(mov+" R31, -%d(R29)", lfp.stack) p(sub+" $%d, R29", lfp.stack) - l.save() + l.save(g) p("#ifndef %s", softfloat) - lfp.save() + lfp.save(g) p("#endif") p("CALL ·asyncPreempt2(SB)") p("#ifndef %s", softfloat) - lfp.restore() + lfp.restore(g) p("#endif") - l.restore() + l.restore(g) p(mov+" %d(R29), R31", lfp.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p(mov + " (R29), R23") // load PC to REGTMP @@ -444,7 +456,9 @@ func genMIPS(_64bit bool) { p("JMP (R23)") } -func genLoong64() { +func genLoong64(g *gen) { + p := g.p + mov := "MOVV" movf := "MOVD" add := "ADDV" @@ -478,9 +492,9 @@ func genLoong64() { p(mov+" R1, -%d(R3)", l.stack) p(sub+" $%d, R3", l.stack) - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p(mov+" %d(R3), R1", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p(mov + " (R3), R30") // load PC to REGTMP @@ -488,7 +502,9 @@ func genLoong64() { p("JMP (R30)") } -func genPPC64() { +func genPPC64(g *gen) { + p := g.p + // Add integer registers R3-R29 // R0 (zero), R1 (SP), R30 (g) are special and not saved here. // R2 (TOC pointer in PIC mode), R12 (function entry address in PIC mode) have been saved in sigctxt.pushCall. @@ -528,9 +544,9 @@ func genPPC64() { p("MOVD LR, R31") p("MOVDU R31, -%d(R1)", l.stack) // allocate frame, save PC of interrupted instruction (in LR) - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOVD %d(R1), R31", l.stack) // sigctxt.pushCall has pushed LR, R2, R12 (at interrupt) on stack, restore them p("MOVD R31, LR") @@ -543,7 +559,9 @@ func genPPC64() { p("JMP (CTR)") } -func genRISCV64() { +func genRISCV64(g *gen) { + p := g.p + // X0 (zero), X1 (LR), X2 (SP), X3 (GP), X4 (TP), X27 (g), X31 (TMP) are special. var l = layout{sp: "X2", stack: 8} @@ -564,16 +582,18 @@ func genRISCV64() { p("MOV X1, -%d(X2)", l.stack) p("SUB $%d, X2", l.stack) - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOV %d(X2), X1", l.stack) p("MOV (X2), X31") p("ADD $%d, X2", l.stack+8) p("JMP (X31)") } -func genS390X() { +func genS390X(g *gen) { + p := g.p + // Add integer registers R0-R12 // R13 (g), R14 (LR), R15 (SP) are special, and not saved here. // Saving R10 (REGTMP) is not necessary, but it is saved anyway. 
@@ -594,9 +614,9 @@ func genS390X() { p("ADD $-%d, R15", l.stack) p("MOVW R10, 8(R15)") // save flags - l.save() + l.save(g) p("CALL ·asyncPreempt2(SB)") - l.restore() + l.restore(g) p("MOVD %d(R15), R14", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it p("ADD $%d, R15", l.stack+8) // pop frame (including the space pushed by sigctxt.pushCall) @@ -606,12 +626,14 @@ func genS390X() { p("JMP (R10)") } -func genWasm() { +func genWasm(g *gen) { + p := g.p p("// No async preemption on wasm") p("UNDEF") } -func notImplemented() { +func notImplemented(g *gen) { + p := g.p p("// Not implemented yet") p("JMP ·abort(SB)") } -- cgit v1.3 From 426cf36b4d0c672dc88fc5cef9b0d5db0d2f4fe5 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Tue, 29 Apr 2025 22:55:40 -0400 Subject: [dev.simd] runtime: save scalar registers off stack in amd64 async preemption Asynchronous preemption must save all registers that could be in use by Go code. Currently, it saves all of these to the goroutine stack. As a result, the stack frame requirements of asynchronous preemption can be rather high. On amd64, this requires 368 bytes of stack space, most of which is the XMM registers. Several RISC architectures are around 0.5 KiB. As we add support for SIMD instructions, this is going to become a problem. The AVX-512 register state is 2.5 KiB. This well exceeds the nosplit limit, and even if it didn't, could constrain when we can asynchronously preempt goroutines on small stacks. This CL fixes this by moving pure scalar state stored in non-GP registers off the stack and into an allocated "extended register state" object. To reduce space overhead, we only allocate these objects as needed. While in the theoretical limit, every G could need this register state, in practice very few do at a time. However, we can't allocate when we're in the middle of saving the register state during an asynchronous preemption, so we reserve scratch space on every P to temporarily store the register state, which can then be copied out to an allocated state object later by Go code. This commit only implements this for amd64, since that's where we're about to add much more vector state, but it lays the groundwork for doing this on any architecture that could benefit. 
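In outline, the new save/restore hand-off works like the minimal sketch below. The types gState and pState and the explicit gp/pp parameters are simplified stand-ins for the runtime's g and p structures; the real implementation (preempt_xreg.go later in this patch) allocates from a lock-protected fixalloc rather than the GC heap and runs the save on the system stack.

	// Sketch of the per-P scratch -> per-G state -> per-P cache flow.
	package main

	import "fmt"

	type xRegState struct{ regs [16][16]byte } // stand-in for the saved vector state

	type gState struct {
		xRegs *xRegState // non-nil only while this G is asynchronously preempted
	}

	type pState struct {
		scratch xRegState  // the assembly stub spills registers here before calling Go
		cache   *xRegState // 1-element allocation cache, also the restore slot
	}

	// xRegSave moves the scratch state into storage owned by the G.
	func xRegSave(gp *gState, pp *pState) {
		dest := pp.cache
		if dest == nil {
			dest = new(xRegState) // real code: fixalloc under xRegAlloc.lock
		} else {
			pp.cache = nil // reuse the cached block
		}
		*dest = pp.scratch
		gp.xRegs = dest
	}

	// xRegRestore hands the state back to the P; the assembly epilogue then
	// reloads registers from pp.cache, which stays cached for later reuse.
	func xRegRestore(gp *gState, pp *pState) {
		pp.cache = gp.xRegs
		gp.xRegs = nil
	}

	func main() {
		var gp gState
		var pp pState
		pp.scratch.regs[0][0] = 42 // pretend the stub spilled X0 here
		xRegSave(&gp, &pp)
		xRegRestore(&gp, &pp)
		fmt.Println(pp.cache.regs[0][0]) // 42, ready for the register reload
	}
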
Change-Id: I123a95e21c11d5c10942d70e27f84d2d99bbf735 Reviewed-on: https://go-review.googlesource.com/c/go/+/680898 Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Auto-Submit: Austin Clements --- src/runtime/export_test.go | 2 + src/runtime/lockrank.go | 5 +- src/runtime/mheap.go | 2 + src/runtime/mklockrank.go | 6 +- src/runtime/mkpreempt.go | 92 ++++++++++++++++++++++++++---- src/runtime/preempt.go | 50 +++++++++++------ src/runtime/preempt_amd64.go | 22 ++++++++ src/runtime/preempt_amd64.s | 82 ++++++++++++++++----------- src/runtime/preempt_noxreg.go | 27 +++++++++ src/runtime/preempt_xreg.go | 127 ++++++++++++++++++++++++++++++++++++++++++ src/runtime/proc.go | 1 + src/runtime/runtime2.go | 9 +++ src/runtime/sizeof_test.go | 9 ++- 13 files changed, 368 insertions(+), 66 deletions(-) create mode 100644 src/runtime/preempt_amd64.go create mode 100644 src/runtime/preempt_noxreg.go create mode 100644 src/runtime/preempt_xreg.go (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 83cf301be4..b3bb5d2c58 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -555,6 +555,8 @@ type G = g type Sudog = sudog +type XRegPerG = xRegPerG + func Getg() *G { return getg() } diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go index 44015ce862..9821e49998 100644 --- a/src/runtime/lockrank.go +++ b/src/runtime/lockrank.go @@ -70,6 +70,7 @@ const ( lockRankHchanLeaf // WB lockRankWbufSpans + lockRankXRegAlloc lockRankMheap lockRankMheapSpecial lockRankGlobalAlloc @@ -143,6 +144,7 @@ var lockNames = []string{ lockRankStackLarge: "stackLarge", lockRankHchanLeaf: "hchanLeaf", lockRankWbufSpans: "wbufSpans", + lockRankXRegAlloc: "xRegAlloc", lockRankMheap: "mheap", lockRankMheapSpecial: "mheapSpecial", lockRankGlobalAlloc: "globalAlloc", @@ -228,9 +230,10 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankStackLarge: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan}, lockRankHchanLeaf: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, 
lockRankGscan, lockRankHchanLeaf}, lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan}, + lockRankXRegAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankTimerSend, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched}, lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans}, lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap}, - lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, 
lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankMheapSpecial}, + lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankXRegAlloc, lockRankMheap, lockRankMheapSpecial}, lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap}, lockRankTraceStackTab: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace}, lockRankPanic: 
{}, diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index f25dbb429d..358de2f376 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -821,6 +821,8 @@ func (h *mheap) init() { } h.pages.init(&h.lock, &memstats.gcMiscSys, false) + + xRegInitAlloc() } // reclaim sweeps and reclaims at least npage pages into the heap. diff --git a/src/runtime/mklockrank.go b/src/runtime/mklockrank.go index 46a063fdce..9c503369a3 100644 --- a/src/runtime/mklockrank.go +++ b/src/runtime/mklockrank.go @@ -193,6 +193,9 @@ defer, # Below WB is the write barrier implementation. < wbufSpans; +# xRegState allocator +sched < xRegAlloc; + # Span allocator stackLarge, stackpool, @@ -205,7 +208,8 @@ stackLarge, # an mspanSpecial lock, and they're part of the malloc implementation. # Pinner bits might be freed by the span allocator. mheap, mspanSpecial < mheapSpecial; -mheap, mheapSpecial < globalAlloc; +# Fixallocs +mheap, mheapSpecial, xRegAlloc < globalAlloc; # Execution tracer events (with a P) hchan, diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index ec900a23d2..e3dd5046f3 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -9,8 +9,10 @@ package main import ( + "bytes" "flag" "fmt" + "go/format" "io" "log" "os" @@ -122,14 +124,19 @@ type gen struct { goarch string } -func (g *gen) asmHeader() { +func (g *gen) commonHeader() { fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n") if beLe[g.goarch] { base := g.goarch[:len(g.goarch)-1] fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base) } +} + +func (g *gen) asmHeader() { + g.commonHeader() fmt.Fprintf(g.w, "#include \"go_asm.h\"\n") if g.goarch == "amd64" { + fmt.Fprintf(g.w, "#include \"go_tls.h\"\n") fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n") } fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n") @@ -145,6 +152,43 @@ func (g *gen) label(l string) { fmt.Fprintf(g.w, "%s\n", l) } +// writeXRegs writes an architecture xregs file. 
+func writeXRegs(arch string, l *layout) { + var code bytes.Buffer + g := gen{&code, arch} + g.commonHeader() + fmt.Fprintf(g.w, ` +package runtime + +type xRegState struct { +`) + pos := 0 + for _, reg := range l.regs { + if reg.pos != pos { + log.Fatalf("padding not implemented") + } + typ := fmt.Sprintf("[%d]byte", reg.size) + switch { + case reg.size == 4 && reg.pos%4 == 0: + typ = "uint32" + case reg.size == 8 && reg.pos%8 == 0: + typ = "uint64" + } + fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ) + pos += reg.size + } + fmt.Fprintf(g.w, "}\n") + + path := fmt.Sprintf("preempt_%s.go", arch) + b, err := format.Source(code.Bytes()) + if err != nil { + log.Fatalf("formatting %s: %s", path, err) + } + if err := os.WriteFile(path, b, 0666); err != nil { + log.Fatal(err) + } +} + type layout struct { stack int regs []regPos @@ -152,7 +196,7 @@ type layout struct { } type regPos struct { - pos int + pos, size int saveOp string restoreOp string @@ -165,17 +209,17 @@ type regPos struct { } func (l *layout) add(op, reg string, size int) { - l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack}) + l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size}) l.stack += size } func (l *layout) add2(sop, rop, reg string, size int) { - l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack}) + l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size}) l.stack += size } func (l *layout) addSpecial(save, restore string, size int) { - l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack}) + l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size}) l.stack += size } @@ -239,6 +283,8 @@ func gen386(g *gen) { } func genAMD64(g *gen) { + const xReg = "AX" // *xRegState + p := g.p // Assign stack offsets. @@ -251,12 +297,13 @@ func genAMD64(g *gen) { l.add("MOVQ", reg, 8) } } - lSSE := layout{stack: l.stack, sp: "SP"} + lXRegs := layout{sp: xReg} // Non-GP registers for _, reg := range regNamesAMD64 { if strings.HasPrefix(reg, "X") { - lSSE.add("MOVUPS", reg, 16) + lXRegs.add("MOVUPS", reg, 16) } } + writeXRegs(g.goarch, &lXRegs) // TODO: MXCSR register? @@ -265,17 +312,40 @@ func genAMD64(g *gen) { p("// Save flags before clobbering them") p("PUSHFQ") p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP") - p("ADJSP $%d", lSSE.stack) + p("ADJSP $%d", l.stack) p("// But vet doesn't know ADJSP, so suppress vet stack checking") p("NOP SP") + p("// Save GPs") l.save(g) - lSSE.save(g) + // In general, the limitations on asynchronous preemption mean we only + // preempt in ABIInternal code. However, there's at least one exception to + // this: when we're in an open-coded transition between an ABIInternal + // function and an ABI0 call. We could more carefully arrange unsafe points + // to avoid ever landing in ABI0, but it's easy to just make this code not + // sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will + // ensure we're in ABIInternal register state. + p("// Save extended register state to p.xRegs.scratch") + p("// Don't make assumptions about ABI register state. 
See mkpreempt.go") + p("get_tls(CX)") + p("MOVQ g(CX), R14") + p("MOVQ g_m(R14), %s", xReg) + p("MOVQ m_p(%s), %s", xReg, xReg) + p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg) + lXRegs.save(g) + p("CALL ·asyncPreempt2(SB)") - lSSE.restore(g) + + p("// Restore non-GPs from *p.xRegs.cache") + p("MOVQ g_m(R14), %s", xReg) + p("MOVQ m_p(%s), %s", xReg, xReg) + p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg) + lXRegs.restore(g) + + p("// Restore GPs") l.restore(g) - p("ADJSP $%d", -lSSE.stack) + p("ADJSP $%d", -l.stack) p("POPFQ") p("POPQ BP") p("RET") diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index c41c355835..d053747d3a 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -292,21 +292,43 @@ func canPreemptM(mp *m) bool { // asyncPreempt saves all user registers and calls asyncPreempt2. // -// When stack scanning encounters an asyncPreempt frame, it scans that +// It saves GP registers (anything that might contain a pointer) to the G stack. +// Hence, when stack scanning encounters an asyncPreempt frame, it scans that // frame and its parent frame conservatively. // +// On some platforms, it saves large additional scalar-only register state such +// as vector registers to an "extended register state" on the P. +// // asyncPreempt is implemented in assembly. func asyncPreempt() //go:nosplit func asyncPreempt2() { + // We can't grow the stack with untyped data from asyncPreempt, so switch to + // the system stack right away. + mcall(func(gp *g) { + gp.asyncSafePoint = true + + // Move the extended register state from the P to the G. We do this now that + // we're on the system stack to avoid stack splits. + xRegSave(gp) + + if gp.preemptStop { + preemptPark(gp) + } else { + gopreempt_m(gp) + } + // The above functions never return. + }) + + // Do not grow the stack below here! + gp := getg() - gp.asyncSafePoint = true - if gp.preemptStop { - mcall(preemptPark) - } else { - mcall(gopreempt_m) - } + + // Put the extended register state back on the M so resumption can find it. + // We can't do this in asyncPreemptM because the park calls never return. + xRegRestore(gp) + gp.asyncSafePoint = false } @@ -319,19 +341,13 @@ func init() { total := funcMaxSPDelta(f) f = findfunc(abi.FuncPCABIInternal(asyncPreempt2)) total += funcMaxSPDelta(f) + f = findfunc(abi.FuncPCABIInternal(xRegRestore)) + total += funcMaxSPDelta(f) // Add some overhead for return PCs, etc. asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize if asyncPreemptStack > stackNosplit { - // We need more than the nosplit limit. This isn't - // unsafe, but it may limit asynchronous preemption. - // - // This may be a problem if we start using more - // registers. In that case, we should store registers - // in a context object. If we pre-allocate one per P, - // asyncPreempt can spill just a few registers to the - // stack, then grab its context object and spill into - // it. When it enters the runtime, it would allocate a - // new context for the P. + // We need more than the nosplit limit. This isn't unsafe, but it may + // limit asynchronous preemption. Consider moving state into xRegState. print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n") throw("async stack too large") } diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go new file mode 100644 index 0000000000..904defac33 --- /dev/null +++ b/src/runtime/preempt_amd64.go @@ -0,0 +1,22 @@ +// Code generated by mkpreempt.go; DO NOT EDIT. 
+ +package runtime + +type xRegState struct { + X0 [16]byte + X1 [16]byte + X2 [16]byte + X3 [16]byte + X4 [16]byte + X5 [16]byte + X6 [16]byte + X7 [16]byte + X8 [16]byte + X9 [16]byte + X10 [16]byte + X11 [16]byte + X12 [16]byte + X13 [16]byte + X14 [16]byte + X15 [16]byte +} diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index 8e3ed0d7c5..0a33ce7f3e 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -1,6 +1,7 @@ // Code generated by mkpreempt.go; DO NOT EDIT. #include "go_asm.h" +#include "go_tls.h" #include "asm_amd64.h" #include "textflag.h" @@ -10,9 +11,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 // Save flags before clobbering them PUSHFQ // obj doesn't understand ADD/SUB on SP, but does understand ADJSP - ADJSP $368 + ADJSP $112 // But vet doesn't know ADJSP, so suppress vet stack checking NOP SP + // Save GPs MOVQ AX, 0(SP) MOVQ CX, 8(SP) MOVQ DX, 16(SP) @@ -27,39 +29,51 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ R13, 88(SP) MOVQ R14, 96(SP) MOVQ R15, 104(SP) - MOVUPS X0, 112(SP) - MOVUPS X1, 128(SP) - MOVUPS X2, 144(SP) - MOVUPS X3, 160(SP) - MOVUPS X4, 176(SP) - MOVUPS X5, 192(SP) - MOVUPS X6, 208(SP) - MOVUPS X7, 224(SP) - MOVUPS X8, 240(SP) - MOVUPS X9, 256(SP) - MOVUPS X10, 272(SP) - MOVUPS X11, 288(SP) - MOVUPS X12, 304(SP) - MOVUPS X13, 320(SP) - MOVUPS X14, 336(SP) - MOVUPS X15, 352(SP) + // Save extended register state to p.xRegs.scratch + // Don't make assumptions about ABI register state. See mkpreempt.go + get_tls(CX) + MOVQ g(CX), R14 + MOVQ g_m(R14), AX + MOVQ m_p(AX), AX + LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX + MOVUPS X0, 0(AX) + MOVUPS X1, 16(AX) + MOVUPS X2, 32(AX) + MOVUPS X3, 48(AX) + MOVUPS X4, 64(AX) + MOVUPS X5, 80(AX) + MOVUPS X6, 96(AX) + MOVUPS X7, 112(AX) + MOVUPS X8, 128(AX) + MOVUPS X9, 144(AX) + MOVUPS X10, 160(AX) + MOVUPS X11, 176(AX) + MOVUPS X12, 192(AX) + MOVUPS X13, 208(AX) + MOVUPS X14, 224(AX) + MOVUPS X15, 240(AX) CALL ·asyncPreempt2(SB) - MOVUPS 352(SP), X15 - MOVUPS 336(SP), X14 - MOVUPS 320(SP), X13 - MOVUPS 304(SP), X12 - MOVUPS 288(SP), X11 - MOVUPS 272(SP), X10 - MOVUPS 256(SP), X9 - MOVUPS 240(SP), X8 - MOVUPS 224(SP), X7 - MOVUPS 208(SP), X6 - MOVUPS 192(SP), X5 - MOVUPS 176(SP), X4 - MOVUPS 160(SP), X3 - MOVUPS 144(SP), X2 - MOVUPS 128(SP), X1 - MOVUPS 112(SP), X0 + // Restore non-GPs from *p.xRegs.cache + MOVQ g_m(R14), AX + MOVQ m_p(AX), AX + MOVQ (p_xRegs+xRegPerP_cache)(AX), AX + MOVUPS 240(AX), X15 + MOVUPS 224(AX), X14 + MOVUPS 208(AX), X13 + MOVUPS 192(AX), X12 + MOVUPS 176(AX), X11 + MOVUPS 160(AX), X10 + MOVUPS 144(AX), X9 + MOVUPS 128(AX), X8 + MOVUPS 112(AX), X7 + MOVUPS 96(AX), X6 + MOVUPS 80(AX), X5 + MOVUPS 64(AX), X4 + MOVUPS 48(AX), X3 + MOVUPS 32(AX), X2 + MOVUPS 16(AX), X1 + MOVUPS 0(AX), X0 + // Restore GPs MOVQ 104(SP), R15 MOVQ 96(SP), R14 MOVQ 88(SP), R13 @@ -74,7 +88,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ 16(SP), DX MOVQ 8(SP), CX MOVQ 0(SP), AX - ADJSP $-368 + ADJSP $-112 POPFQ POPQ BP RET diff --git a/src/runtime/preempt_noxreg.go b/src/runtime/preempt_noxreg.go new file mode 100644 index 0000000000..dfe46559b5 --- /dev/null +++ b/src/runtime/preempt_noxreg.go @@ -0,0 +1,27 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 + +// This provides common support for architectures that DO NOT use extended +// register state in asynchronous preemption. 
+ +package runtime + +type xRegPerG struct{} + +type xRegPerP struct{} + +// xRegState is defined only so the build fails if we try to define a real +// xRegState on a noxreg architecture. +type xRegState struct{} + +func xRegInitAlloc() {} + +func xRegSave(gp *g) {} + +//go:nosplit +func xRegRestore(gp *g) {} + +func (*xRegPerP) free() {} diff --git a/src/runtime/preempt_xreg.go b/src/runtime/preempt_xreg.go new file mode 100644 index 0000000000..f0a47c15d9 --- /dev/null +++ b/src/runtime/preempt_xreg.go @@ -0,0 +1,127 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 + +// This provides common support for architectures that use extended register +// state in asynchronous preemption. +// +// While asynchronous preemption stores general-purpose (GP) registers on the +// preempted goroutine's own stack, extended register state can be used to save +// non-GP state off the stack. In particular, this is meant for large vector +// register files. Currently, we assume this contains only scalar data, though +// we could change this constraint by conservatively scanning this memory. +// +// For an architecture to support extended register state, it must provide a Go +// definition of an xRegState type for storing the state, and its asyncPreempt +// implementation must write this register state to p.xRegs.scratch. + +package runtime + +import "unsafe" + +// xRegPerG stores extended register state while a goroutine is asynchronously +// preempted. This is nil otherwise, so we can reuse a (likely small) pool of +// xRegState objects. +type xRegPerG struct { + state *xRegState +} + +type xRegPerP struct { + // scratch temporary per-P space where [asyncPreempt] saves the register + // state before entering Go. It's quickly copied to per-G state. + scratch xRegState + + // cache is a 1-element allocation cache of extended register state used by + // asynchronous preemption. On entry to preemption, this is used as a simple + // allocation cache. On exit from preemption, the G's xRegState is always + // stored here where it can be restored, and later either freed or reused + // for another preemption. On exit, this serves the dual purpose of + // delay-freeing the allocated xRegState until after we've definitely + // restored it. + cache *xRegState +} + +// xRegAlloc allocates xRegState objects. +var xRegAlloc struct { + lock mutex + alloc fixalloc +} + +func xRegInitAlloc() { + lockInit(&xRegAlloc.lock, lockRankXRegAlloc) + xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys) +} + +// xRegSave saves the extended register state on this P to gp. +// +// This must run on the system stack because it assumes the P won't change. +// +//go:systemstack +func xRegSave(gp *g) { + if gp.xRegs.state != nil { + // Double preempt? + throw("gp.xRegState.p != nil on async preempt") + } + + // Get the place to save the register state. + var dest *xRegState + pp := gp.m.p.ptr() + if pp.xRegs.cache != nil { + // Use the cached allocation. + dest = pp.xRegs.cache + pp.xRegs.cache = nil + } else { + // Allocate a new save block. + lock(&xRegAlloc.lock) + dest = (*xRegState)(xRegAlloc.alloc.alloc()) + unlock(&xRegAlloc.lock) + } + + // Copy state saved in the scratchpad to dest. 
+ // + // If we ever need to save less state (e.g., avoid saving vector registers + // that aren't in use), we could have multiple allocation pools for + // different size states and copy only the registers we need. + *dest = pp.xRegs.scratch + + // Save on the G. + gp.xRegs.state = dest +} + +// xRegRestore prepares the extended register state on gp to be restored. +// +// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find +// it. This means nothing else may use the cache between this call and the +// return to asyncPreempt. This is not quite symmetric with [xRegSave], which +// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy. +// +// This is called with asyncPreempt on the stack and thus must not grow the +// stack. +// +//go:nosplit +func xRegRestore(gp *g) { + if gp.xRegs.state == nil { + throw("gp.xRegState.p == nil on return from async preempt") + } + // If the P has a block cached on it, free that so we can replace it. + pp := gp.m.p.ptr() + if pp.xRegs.cache != nil { + // Don't grow the G stack. + systemstack(func() { + pp.xRegs.free() + }) + } + pp.xRegs.cache = gp.xRegs.state + gp.xRegs.state = nil +} + +func (xRegs *xRegPerP) free() { + if xRegs.cache != nil { + lock(&xRegAlloc.lock) + xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache)) + xRegs.cache = nil + unlock(&xRegAlloc.lock) + } +} diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 9817308430..b2ae46e0e4 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -5799,6 +5799,7 @@ func (pp *p) destroy() { pp.gcAssistTime = 0 gcCleanups.queued += pp.cleanupsQueued pp.cleanupsQueued = 0 + pp.xRegs.free() pp.status = _Pdead } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 96720846b2..789b68e54e 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -491,6 +491,10 @@ type g struct { coroarg *coro // argument during coroutine transfers bubble *synctestBubble + // xRegs stores the extended register state if this G has been + // asynchronously preempted. + xRegs xRegPerG + // Per-G tracer state. trace gTraceState @@ -760,6 +764,11 @@ type p struct { // gcStopTime is the nanotime timestamp that this P last entered _Pgcstop. gcStopTime int64 + // xRegs is the per-P extended register state used by asynchronous + // preemption. This is an empty struct on platforms that don't use extended + // register state. + xRegs xRegPerP + // Padding is no longer needed. False sharing is now not a worry because p is large enough // that its size class is an integer multiple of the cache line size (for any of our architectures). 
} diff --git a/src/runtime/sizeof_test.go b/src/runtime/sizeof_test.go index a5dc8aed34..de859866a5 100644 --- a/src/runtime/sizeof_test.go +++ b/src/runtime/sizeof_test.go @@ -15,13 +15,18 @@ import ( func TestSizeof(t *testing.T) { const _64bit = unsafe.Sizeof(uintptr(0)) == 8 + const xreg = unsafe.Sizeof(runtime.XRegPerG{}) // Varies per architecture var tests = []struct { val any // type as a value _32bit uintptr // size on 32bit platforms _64bit uintptr // size on 64bit platforms }{ - {runtime.G{}, 280, 440}, // g, but exported for testing - {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing + {runtime.G{}, 280 + xreg, 440 + xreg}, // g, but exported for testing + {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing + } + + if xreg > runtime.PtrSize { + t.Errorf("unsafe.Sizeof(xRegPerG) = %d, want <= %d", xreg, runtime.PtrSize) } for _, tt := range tests { -- cgit v1.3 From 9eeb1e7a9afb992e899d3917fce92c01b3fa50c1 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 12 Jun 2025 15:33:41 -0400 Subject: [dev.simd] runtime: save AVX2 and AVX-512 state on asynchronous preemption Based on CL 669415 by shaojunyang@google.com. Change-Id: I574f15c3b18a7179a1573aaf567caf18d8602ef1 Reviewed-on: https://go-review.googlesource.com/c/go/+/680900 LUCI-TryBot-Result: Go LUCI Auto-Submit: Austin Clements Reviewed-by: Cherry Mui --- src/runtime/cpuflags.go | 1 + src/runtime/mkpreempt.go | 74 ++++++++++++++++--- src/runtime/preempt_amd64.go | 40 ++++++----- src/runtime/preempt_amd64.s | 166 +++++++++++++++++++++++++++++++++++-------- 4 files changed, 227 insertions(+), 54 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go index bd1cb328d3..6452364b68 100644 --- a/src/runtime/cpuflags.go +++ b/src/runtime/cpuflags.go @@ -13,6 +13,7 @@ import ( const ( offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX) offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + offsetX86HasAVX512 = unsafe.Offsetof(cpu.X86.HasAVX512) // F+CD+BW+DQ+VL offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS) offsetX86HasRDTSCP = unsafe.Offsetof(cpu.X86.HasRDTSCP) diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index e3dd5046f3..29e8288129 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -285,7 +285,7 @@ func gen386(g *gen) { func genAMD64(g *gen) { const xReg = "AX" // *xRegState - p := g.p + p, label := g.p, g.label // Assign stack offsets. var l = layout{sp: "SP"} @@ -297,15 +297,33 @@ func genAMD64(g *gen) { l.add("MOVQ", reg, 8) } } - lXRegs := layout{sp: xReg} // Non-GP registers - for _, reg := range regNamesAMD64 { - if strings.HasPrefix(reg, "X") { - lXRegs.add("MOVUPS", reg, 16) + // Create layouts for X, Y, and Z registers. + const ( + numXRegs = 16 + numZRegs = 16 // TODO: If we start using upper registers, change to 32 + numKRegs = 8 + ) + lZRegs := layout{sp: xReg} // Non-GP registers + lXRegs, lYRegs := lZRegs, lZRegs + for i := range numZRegs { + lZRegs.add("VMOVDQU64", fmt.Sprintf("Z%d", i), 512/8) + if i < numXRegs { + // Use SSE-only instructions for X registers. + lXRegs.add("MOVUPS", fmt.Sprintf("X%d", i), 128/8) + lYRegs.add("VMOVDQU", fmt.Sprintf("Y%d", i), 256/8) } } - writeXRegs(g.goarch, &lXRegs) - - // TODO: MXCSR register? + for i := range numKRegs { + lZRegs.add("KMOVQ", fmt.Sprintf("K%d", i), 8) + } + // The Z layout is the most general, so we line up the others with that one. + // We don't have to do this, but it results in a nice Go type. 
If we split + // this into multiple types, we probably should stop doing this. + for i := range lXRegs.regs { + lXRegs.regs[i].pos = lZRegs.regs[i].pos + lYRegs.regs[i].pos = lZRegs.regs[i].pos + } + writeXRegs(g.goarch, &lZRegs) p("PUSHQ BP") p("MOVQ SP, BP") @@ -333,16 +351,56 @@ func genAMD64(g *gen) { p("MOVQ g_m(R14), %s", xReg) p("MOVQ m_p(%s), %s", xReg, xReg) p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg) + + // Which registers do we need to save? + p("#ifdef GOEXPERIMENT_simd") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1") + p("JE saveAVX512") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1") + p("JE saveAVX2") + p("#endif") + + // No features. Assume only SSE. + label("saveSSE:") lXRegs.save(g) + p("JMP preempt") + label("saveAVX2:") + lYRegs.save(g) + p("JMP preempt") + + label("saveAVX512:") + lZRegs.save(g) + p("JMP preempt") + + label("preempt:") p("CALL ·asyncPreempt2(SB)") p("// Restore non-GPs from *p.xRegs.cache") p("MOVQ g_m(R14), %s", xReg) p("MOVQ m_p(%s), %s", xReg, xReg) p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg) + + p("#ifdef GOEXPERIMENT_simd") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1") + p("JE restoreAVX512") + p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1") + p("JE restoreAVX2") + p("#endif") + + label("restoreSSE:") lXRegs.restore(g) + p("JMP restoreGPs") + + label("restoreAVX2:") + lYRegs.restore(g) + p("JMP restoreGPs") + + label("restoreAVX512:") + lZRegs.restore(g) + p("JMP restoreGPs") + label("restoreGPs:") p("// Restore GPs") l.restore(g) p("ADJSP $%d", -l.stack) diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go index 904defac33..44838a1df2 100644 --- a/src/runtime/preempt_amd64.go +++ b/src/runtime/preempt_amd64.go @@ -3,20 +3,28 @@ package runtime type xRegState struct { - X0 [16]byte - X1 [16]byte - X2 [16]byte - X3 [16]byte - X4 [16]byte - X5 [16]byte - X6 [16]byte - X7 [16]byte - X8 [16]byte - X9 [16]byte - X10 [16]byte - X11 [16]byte - X12 [16]byte - X13 [16]byte - X14 [16]byte - X15 [16]byte + Z0 [64]byte + Z1 [64]byte + Z2 [64]byte + Z3 [64]byte + Z4 [64]byte + Z5 [64]byte + Z6 [64]byte + Z7 [64]byte + Z8 [64]byte + Z9 [64]byte + Z10 [64]byte + Z11 [64]byte + Z12 [64]byte + Z13 [64]byte + Z14 [64]byte + Z15 [64]byte + K0 uint64 + K1 uint64 + K2 uint64 + K3 uint64 + K4 uint64 + K5 uint64 + K6 uint64 + K7 uint64 } diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index 0a33ce7f3e..c35de7f3b7 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -36,43 +36,149 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ g_m(R14), AX MOVQ m_p(AX), AX LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX + #ifdef GOEXPERIMENT_simd + CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1 + JE saveAVX512 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JE saveAVX2 + #endif +saveSSE: MOVUPS X0, 0(AX) - MOVUPS X1, 16(AX) - MOVUPS X2, 32(AX) - MOVUPS X3, 48(AX) - MOVUPS X4, 64(AX) - MOVUPS X5, 80(AX) - MOVUPS X6, 96(AX) - MOVUPS X7, 112(AX) - MOVUPS X8, 128(AX) - MOVUPS X9, 144(AX) - MOVUPS X10, 160(AX) - MOVUPS X11, 176(AX) - MOVUPS X12, 192(AX) - MOVUPS X13, 208(AX) - MOVUPS X14, 224(AX) - MOVUPS X15, 240(AX) + MOVUPS X1, 64(AX) + MOVUPS X2, 128(AX) + MOVUPS X3, 192(AX) + MOVUPS X4, 256(AX) + MOVUPS X5, 320(AX) + MOVUPS X6, 384(AX) + MOVUPS X7, 448(AX) + MOVUPS X8, 512(AX) + MOVUPS X9, 576(AX) + MOVUPS X10, 640(AX) + MOVUPS X11, 704(AX) + MOVUPS X12, 768(AX) + MOVUPS X13, 832(AX) + MOVUPS X14, 896(AX) + MOVUPS X15, 960(AX) + 
JMP preempt +saveAVX2: + VMOVDQU Y0, 0(AX) + VMOVDQU Y1, 64(AX) + VMOVDQU Y2, 128(AX) + VMOVDQU Y3, 192(AX) + VMOVDQU Y4, 256(AX) + VMOVDQU Y5, 320(AX) + VMOVDQU Y6, 384(AX) + VMOVDQU Y7, 448(AX) + VMOVDQU Y8, 512(AX) + VMOVDQU Y9, 576(AX) + VMOVDQU Y10, 640(AX) + VMOVDQU Y11, 704(AX) + VMOVDQU Y12, 768(AX) + VMOVDQU Y13, 832(AX) + VMOVDQU Y14, 896(AX) + VMOVDQU Y15, 960(AX) + JMP preempt +saveAVX512: + VMOVDQU64 Z0, 0(AX) + VMOVDQU64 Z1, 64(AX) + VMOVDQU64 Z2, 128(AX) + VMOVDQU64 Z3, 192(AX) + VMOVDQU64 Z4, 256(AX) + VMOVDQU64 Z5, 320(AX) + VMOVDQU64 Z6, 384(AX) + VMOVDQU64 Z7, 448(AX) + VMOVDQU64 Z8, 512(AX) + VMOVDQU64 Z9, 576(AX) + VMOVDQU64 Z10, 640(AX) + VMOVDQU64 Z11, 704(AX) + VMOVDQU64 Z12, 768(AX) + VMOVDQU64 Z13, 832(AX) + VMOVDQU64 Z14, 896(AX) + VMOVDQU64 Z15, 960(AX) + KMOVQ K0, 1024(AX) + KMOVQ K1, 1032(AX) + KMOVQ K2, 1040(AX) + KMOVQ K3, 1048(AX) + KMOVQ K4, 1056(AX) + KMOVQ K5, 1064(AX) + KMOVQ K6, 1072(AX) + KMOVQ K7, 1080(AX) + JMP preempt +preempt: CALL ·asyncPreempt2(SB) // Restore non-GPs from *p.xRegs.cache MOVQ g_m(R14), AX MOVQ m_p(AX), AX MOVQ (p_xRegs+xRegPerP_cache)(AX), AX - MOVUPS 240(AX), X15 - MOVUPS 224(AX), X14 - MOVUPS 208(AX), X13 - MOVUPS 192(AX), X12 - MOVUPS 176(AX), X11 - MOVUPS 160(AX), X10 - MOVUPS 144(AX), X9 - MOVUPS 128(AX), X8 - MOVUPS 112(AX), X7 - MOVUPS 96(AX), X6 - MOVUPS 80(AX), X5 - MOVUPS 64(AX), X4 - MOVUPS 48(AX), X3 - MOVUPS 32(AX), X2 - MOVUPS 16(AX), X1 + #ifdef GOEXPERIMENT_simd + CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1 + JE restoreAVX512 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JE restoreAVX2 + #endif +restoreSSE: + MOVUPS 960(AX), X15 + MOVUPS 896(AX), X14 + MOVUPS 832(AX), X13 + MOVUPS 768(AX), X12 + MOVUPS 704(AX), X11 + MOVUPS 640(AX), X10 + MOVUPS 576(AX), X9 + MOVUPS 512(AX), X8 + MOVUPS 448(AX), X7 + MOVUPS 384(AX), X6 + MOVUPS 320(AX), X5 + MOVUPS 256(AX), X4 + MOVUPS 192(AX), X3 + MOVUPS 128(AX), X2 + MOVUPS 64(AX), X1 MOVUPS 0(AX), X0 + JMP restoreGPs +restoreAVX2: + VMOVDQU 960(AX), Y15 + VMOVDQU 896(AX), Y14 + VMOVDQU 832(AX), Y13 + VMOVDQU 768(AX), Y12 + VMOVDQU 704(AX), Y11 + VMOVDQU 640(AX), Y10 + VMOVDQU 576(AX), Y9 + VMOVDQU 512(AX), Y8 + VMOVDQU 448(AX), Y7 + VMOVDQU 384(AX), Y6 + VMOVDQU 320(AX), Y5 + VMOVDQU 256(AX), Y4 + VMOVDQU 192(AX), Y3 + VMOVDQU 128(AX), Y2 + VMOVDQU 64(AX), Y1 + VMOVDQU 0(AX), Y0 + JMP restoreGPs +restoreAVX512: + KMOVQ 1080(AX), K7 + KMOVQ 1072(AX), K6 + KMOVQ 1064(AX), K5 + KMOVQ 1056(AX), K4 + KMOVQ 1048(AX), K3 + KMOVQ 1040(AX), K2 + KMOVQ 1032(AX), K1 + KMOVQ 1024(AX), K0 + VMOVDQU64 960(AX), Z15 + VMOVDQU64 896(AX), Z14 + VMOVDQU64 832(AX), Z13 + VMOVDQU64 768(AX), Z12 + VMOVDQU64 704(AX), Z11 + VMOVDQU64 640(AX), Z10 + VMOVDQU64 576(AX), Z9 + VMOVDQU64 512(AX), Z8 + VMOVDQU64 448(AX), Z7 + VMOVDQU64 384(AX), Z6 + VMOVDQU64 320(AX), Z5 + VMOVDQU64 256(AX), Z4 + VMOVDQU64 192(AX), Z3 + VMOVDQU64 128(AX), Z2 + VMOVDQU64 64(AX), Z1 + VMOVDQU64 0(AX), Z0 + JMP restoreGPs +restoreGPs: // Restore GPs MOVQ 104(SP), R15 MOVQ 96(SP), R14 -- cgit v1.3 From 0710cce6eb0d75db1fc6c45807773f40edb14d73 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Mon, 30 Jun 2025 16:42:19 -0400 Subject: [dev.simd] runtime: remove write barrier in xRegRestore Currently, there's a write barrier in xRegRestore when it assigns pp.xRegs.cache = gp.xRegs.state. This is bad because that gets called on the asyncPreempt return path, where we have really limited stack space, and we don't currently account for this write barrier. 
We can't simply mark xRegState as sys.NotInHeap because it's also embedded in runtime.p as register scratch space, and runtime.p is heap allocated. Hence, to fix this, we rename xRegState to just "xRegs" and introduce a wrapper "xRegState" type that embeds xRegs and is itself marked sys.NotInHeap. Then, anywhere we need a manually-managed pointer to register state, we use the new type. To ensure this doesn't happen again in the future, we also mark asyncPreempt2 as go:nowritebarrierrec. Change-Id: I5ff4841e55ff20047ff7d253ab659ab77aeb3391 Reviewed-on: https://go-review.googlesource.com/c/go/+/684836 Auto-Submit: Austin Clements Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI --- src/runtime/mkpreempt.go | 2 +- src/runtime/preempt.go | 9 +++++++++ src/runtime/preempt_amd64.go | 2 +- src/runtime/preempt_xreg.go | 16 +++++++++++++--- 4 files changed, 24 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 29e8288129..2bd2ef07fa 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -160,7 +160,7 @@ func writeXRegs(arch string, l *layout) { fmt.Fprintf(g.w, ` package runtime -type xRegState struct { +type xRegs struct { `) pos := 0 for _, reg := range l.regs { diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index d053747d3a..22727df74e 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -302,7 +302,16 @@ func canPreemptM(mp *m) bool { // asyncPreempt is implemented in assembly. func asyncPreempt() +// asyncPreempt2 is the Go continuation of asyncPreempt. +// +// It must be deeply nosplit because there's untyped data on the stack from +// asyncPreempt. +// +// It must not have any write barriers because we need to limit the amount of +// stack it uses. +// //go:nosplit +//go:nowritebarrierrec func asyncPreempt2() { // We can't grow the stack with untyped data from asyncPreempt, so switch to // the system stack right away. diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go index 44838a1df2..88c0ddd34a 100644 --- a/src/runtime/preempt_amd64.go +++ b/src/runtime/preempt_amd64.go @@ -2,7 +2,7 @@ package runtime -type xRegState struct { +type xRegs struct { Z0 [64]byte Z1 [64]byte Z2 [64]byte diff --git a/src/runtime/preempt_xreg.go b/src/runtime/preempt_xreg.go index f0a47c15d9..9e05455ddb 100644 --- a/src/runtime/preempt_xreg.go +++ b/src/runtime/preempt_xreg.go @@ -19,7 +19,17 @@ package runtime -import "unsafe" +import ( + "internal/runtime/sys" + "unsafe" +) + +// xRegState is long-lived extended register state. It is allocated off-heap and +// manually managed. +type xRegState struct { + _ sys.NotInHeap // Allocated from xRegAlloc + regs xRegs +} // xRegPerG stores extended register state while a goroutine is asynchronously // preempted. This is nil otherwise, so we can reuse a (likely small) pool of @@ -31,7 +41,7 @@ type xRegPerG struct { type xRegPerP struct { // scratch temporary per-P space where [asyncPreempt] saves the register // state before entering Go. It's quickly copied to per-G state. - scratch xRegState + scratch xRegs // cache is a 1-element allocation cache of extended register state used by // asynchronous preemption. On entry to preemption, this is used as a simple @@ -84,7 +94,7 @@ func xRegSave(gp *g) { // If we ever need to save less state (e.g., avoid saving vector registers // that aren't in use), we could have multiple allocation pools for // different size states and copy only the registers we need. 
- *dest = pp.xRegs.scratch + dest.regs = pp.xRegs.scratch // Save on the G. gp.xRegs.state = dest -- cgit v1.3 From 574854fd863377a9467625c45ec842fd7d5fc341 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Tue, 8 Jul 2025 19:24:30 +0000 Subject: [dev.simd] runtime: save Z16-Z31 registers in async preempt The register allocation will use the upper register soon, this CL is to enable that. Change-Id: I4d7285e08b95f4e6ebee72594dfbe8d1199f09ed Reviewed-on: https://go-review.googlesource.com/c/go/+/686498 TryBot-Bypass: David Chase Reviewed-by: Cherry Mui Commit-Queue: David Chase --- src/runtime/mkpreempt.go | 2 +- src/runtime/preempt_amd64.go | 16 +++++++++++ src/runtime/preempt_amd64.s | 64 +++++++++++++++++++++++++++++++++----------- 3 files changed, 65 insertions(+), 17 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 2bd2ef07fa..7786f342b5 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -300,7 +300,7 @@ func genAMD64(g *gen) { // Create layouts for X, Y, and Z registers. const ( numXRegs = 16 - numZRegs = 16 // TODO: If we start using upper registers, change to 32 + numZRegs = 32 numKRegs = 8 ) lZRegs := layout{sp: xReg} // Non-GP registers diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go index 88c0ddd34a..78dec40e1f 100644 --- a/src/runtime/preempt_amd64.go +++ b/src/runtime/preempt_amd64.go @@ -19,6 +19,22 @@ type xRegs struct { Z13 [64]byte Z14 [64]byte Z15 [64]byte + Z16 [64]byte + Z17 [64]byte + Z18 [64]byte + Z19 [64]byte + Z20 [64]byte + Z21 [64]byte + Z22 [64]byte + Z23 [64]byte + Z24 [64]byte + Z25 [64]byte + Z26 [64]byte + Z27 [64]byte + Z28 [64]byte + Z29 [64]byte + Z30 [64]byte + Z31 [64]byte K0 uint64 K1 uint64 K2 uint64 diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index c35de7f3b7..a5b949a242 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -95,14 +95,30 @@ saveAVX512: VMOVDQU64 Z13, 832(AX) VMOVDQU64 Z14, 896(AX) VMOVDQU64 Z15, 960(AX) - KMOVQ K0, 1024(AX) - KMOVQ K1, 1032(AX) - KMOVQ K2, 1040(AX) - KMOVQ K3, 1048(AX) - KMOVQ K4, 1056(AX) - KMOVQ K5, 1064(AX) - KMOVQ K6, 1072(AX) - KMOVQ K7, 1080(AX) + VMOVDQU64 Z16, 1024(AX) + VMOVDQU64 Z17, 1088(AX) + VMOVDQU64 Z18, 1152(AX) + VMOVDQU64 Z19, 1216(AX) + VMOVDQU64 Z20, 1280(AX) + VMOVDQU64 Z21, 1344(AX) + VMOVDQU64 Z22, 1408(AX) + VMOVDQU64 Z23, 1472(AX) + VMOVDQU64 Z24, 1536(AX) + VMOVDQU64 Z25, 1600(AX) + VMOVDQU64 Z26, 1664(AX) + VMOVDQU64 Z27, 1728(AX) + VMOVDQU64 Z28, 1792(AX) + VMOVDQU64 Z29, 1856(AX) + VMOVDQU64 Z30, 1920(AX) + VMOVDQU64 Z31, 1984(AX) + KMOVQ K0, 2048(AX) + KMOVQ K1, 2056(AX) + KMOVQ K2, 2064(AX) + KMOVQ K3, 2072(AX) + KMOVQ K4, 2080(AX) + KMOVQ K5, 2088(AX) + KMOVQ K6, 2096(AX) + KMOVQ K7, 2104(AX) JMP preempt preempt: CALL ·asyncPreempt2(SB) @@ -153,14 +169,30 @@ restoreAVX2: VMOVDQU 0(AX), Y0 JMP restoreGPs restoreAVX512: - KMOVQ 1080(AX), K7 - KMOVQ 1072(AX), K6 - KMOVQ 1064(AX), K5 - KMOVQ 1056(AX), K4 - KMOVQ 1048(AX), K3 - KMOVQ 1040(AX), K2 - KMOVQ 1032(AX), K1 - KMOVQ 1024(AX), K0 + KMOVQ 2104(AX), K7 + KMOVQ 2096(AX), K6 + KMOVQ 2088(AX), K5 + KMOVQ 2080(AX), K4 + KMOVQ 2072(AX), K3 + KMOVQ 2064(AX), K2 + KMOVQ 2056(AX), K1 + KMOVQ 2048(AX), K0 + VMOVDQU64 1984(AX), Z31 + VMOVDQU64 1920(AX), Z30 + VMOVDQU64 1856(AX), Z29 + VMOVDQU64 1792(AX), Z28 + VMOVDQU64 1728(AX), Z27 + VMOVDQU64 1664(AX), Z26 + VMOVDQU64 1600(AX), Z25 + VMOVDQU64 1536(AX), Z24 + VMOVDQU64 1472(AX), Z23 + VMOVDQU64 1408(AX), Z22 + VMOVDQU64 1344(AX), Z21 + 
VMOVDQU64 1280(AX), Z20 + VMOVDQU64 1216(AX), Z19 + VMOVDQU64 1152(AX), Z18 + VMOVDQU64 1088(AX), Z17 + VMOVDQU64 1024(AX), Z16 VMOVDQU64 960(AX), Z15 VMOVDQU64 896(AX), Z14 VMOVDQU64 832(AX), Z13 -- cgit v1.3 From 4c311aa38f6e354ec4d9f5882a16c36a2e4b0f36 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Thu, 21 Aug 2025 14:37:18 -0400 Subject: [dev.simd] cmd/compile: ensure the whole X15 register is zeroed On AMD64, we reserve the X15 register as the zero register. Currently we use an SSE instruction to zero it, and we only use it in SSE contexts. When the machine supports AVX, the high bits of the register is not necessarily zeroed. Now that the compiler generates AVX code for SIMD, it would be great to have a zero register in the AVX context. This CL zeroes the whole X15 register if AVX is supported. Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7 Reviewed-on: https://go-review.googlesource.com/c/go/+/698237 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/ssa.go | 33 ++++++++++++++++++++-- src/cmd/compile/internal/ir/symtab.go | 1 + src/cmd/compile/internal/ssagen/ssa.go | 4 +-- .../compile/internal/typecheck/_builtin/runtime.go | 3 +- src/cmd/compile/internal/typecheck/builtin.go | 3 +- src/runtime/asm_amd64.s | 6 ++++ src/runtime/cpuflags.go | 3 +- src/runtime/proc.go | 3 +- src/runtime/race_amd64.s | 3 ++ src/runtime/sys_darwin_amd64.s | 3 ++ src/runtime/sys_dragonfly_amd64.s | 3 ++ src/runtime/sys_freebsd_amd64.s | 6 ++++ src/runtime/sys_linux_amd64.s | 6 ++++ src/runtime/sys_netbsd_amd64.s | 3 ++ src/runtime/sys_openbsd_amd64.s | 3 ++ src/runtime/sys_windows_amd64.s | 3 ++ 16 files changed, 78 insertions(+), 8 deletions(-) (limited to 'src/runtime') diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 3ae3c61764..f511e75e97 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -18,6 +18,7 @@ import ( "cmd/internal/obj" "cmd/internal/obj/x86" "internal/abi" + "internal/buildcfg" ) // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags. @@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail: if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal { // zeroing X15 when entering ABIInternal from ABI0 - opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + zeroX15(s) // set G register from TLS getgFromTLS(s, x86.REG_R14) } @@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { s.Call(v) if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 { // zeroing X15 when entering ABIInternal from ABI0 - opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + zeroX15(s) // set G register from TLS getgFromTLS(s, x86.REG_R14) } @@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { } } +// zeroX15 zeroes the X15 register. +func zeroX15(s *ssagen.State) { + vxorps := func(s *ssagen.State) { + p := s.Prog(x86.AVXORPS) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_X15 + p.AddRestSourceReg(x86.REG_X15) + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_X15 + } + if buildcfg.GOAMD64 >= 3 { + vxorps(s) + return + } + // AVX may not be available, check before zeroing the high bits. 
+ p := s.Prog(x86.ACMPB) + p.From.Type = obj.TYPE_MEM + p.From.Name = obj.NAME_EXTERN + p.From.Sym = ir.Syms.X86HasAVX + p.To.Type = obj.TYPE_CONST + p.To.Offset = 1 + jmp := s.Prog(x86.AJNE) + jmp.To.Type = obj.TYPE_BRANCH + vxorps(s) + sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + jmp.To.SetTarget(sse) +} + // Example instruction: VRSQRTPS X1, X1 func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog { p := s.Prog(v.Op.Asm()) diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index ee0f52fbf3..2222a5444a 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -68,6 +68,7 @@ type symsStruct struct { Loong64HasLAM_BH *obj.LSym Loong64HasLSX *obj.LSym RISCV64HasZbb *obj.LSym + X86HasAVX *obj.LSym X86HasFMA *obj.LSym X86HasPOPCNT *obj.LSym X86HasSSE41 *obj.LSym diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index abb6370a15..57129817f6 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -150,9 +150,10 @@ func InitConfig() { ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert") ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero") ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove") + ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool + ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool - ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool @@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool { } var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym - diff --git a/src/cmd/compile/internal/typecheck/_builtin/runtime.go b/src/cmd/compile/internal/typecheck/_builtin/runtime.go index 296bfdc281..1e4d0b7db6 100644 --- a/src/cmd/compile/internal/typecheck/_builtin/runtime.go +++ b/src/cmd/compile/internal/typecheck/_builtin/runtime.go @@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint) func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32 // architecture variants +var x86HasAVX bool +var x86HasFMA bool var x86HasPOPCNT bool var x86HasSSE41 bool -var x86HasFMA bool var armHasVFPv4 bool var arm64HasATOMICS bool var loong64HasLAMCAS bool diff --git a/src/cmd/compile/internal/typecheck/builtin.go b/src/cmd/compile/internal/typecheck/builtin.go index 535f0fb7e8..6b8c6d7bad 100644 --- a/src/cmd/compile/internal/typecheck/builtin.go +++ b/src/cmd/compile/internal/typecheck/builtin.go @@ -232,9 +232,10 @@ var runtimeDecls = [...]struct { {"libfuzzerHookStrCmp", funcTag, 155}, {"libfuzzerHookEqualFold", funcTag, 155}, {"addCovMeta", funcTag, 157}, + {"x86HasAVX", varTag, 6}, + {"x86HasFMA", varTag, 6}, {"x86HasPOPCNT", varTag, 6}, {"x86HasSSE41", varTag, 6}, - {"x86HasFMA", varTag, 6}, {"armHasVFPv4", varTag, 6}, {"arm64HasATOMICS", varTag, 6}, {"loong64HasLAMCAS", varTag, 6}, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index cf1d49a4ad..f8ebd030b6 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1015,6 +1015,9 @@ needm: // there's no need to handle that. 
Clear R14 so that there's // a bad value in there, in case needm tries to use it. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 MOVQ $runtime·needAndBindM(SB), AX CALL AX @@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0 get_tls(R14) MOVQ g(R14), R14 XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 JMP ·sigpanic(SB) // gcWriteBarrier informs the GC about heap pointer writes. diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go index 6452364b68..67ed081ef6 100644 --- a/src/runtime/cpuflags.go +++ b/src/runtime/cpuflags.go @@ -28,9 +28,10 @@ const ( var ( // Set in runtime.cpuinit. // TODO: deprecate these; use internal/cpu directly. + x86HasAVX bool + x86HasFMA bool x86HasPOPCNT bool x86HasSSE41 bool - x86HasFMA bool armHasVFPv4 bool diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 68647d771f..1d597d59c2 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -766,9 +766,10 @@ func cpuinit(env string) { // to guard execution of instructions that can not be assumed to be always supported. switch GOARCH { case "386", "amd64": + x86HasAVX = cpu.X86.HasAVX + x86HasFMA = cpu.X86.HasFMA x86HasPOPCNT = cpu.X86.HasPOPCNT x86HasSSE41 = cpu.X86.HasSSE41 - x86HasFMA = cpu.X86.HasFMA case "arm": armHasVFPv4 = cpu.ARM.HasVFPv4 diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index e19118bd54..23f2e59e3d 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -456,6 +456,9 @@ call: // Back to Go world, set special registers. // The g register (R14) is preserved in C. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 RET // C->Go callback thunk that allows to call runtime·racesymbolize from C code. diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index cc4e52d305..0091546f20 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s index a223c2cf76..84bf326aad 100644 --- a/src/runtime/sys_dragonfly_amd64.s +++ b/src/runtime/sys_dragonfly_amd64.s @@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s index 977ea093d2..a1fa3a6fa2 100644 --- a/src/runtime/sys_freebsd_amd64.s +++ b/src/runtime/sys_freebsd_amd64.s @@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. 
NOP SP // disable vet stack checking diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 941f70b0e8..02505c2fb0 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s index 2f1ddcdc89..edc7f3d6ee 100644 --- a/src/runtime/sys_netbsd_amd64.s +++ b/src/runtime/sys_netbsd_amd64.s @@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s index ff0bc2416a..734dfe6478 100644 --- a/src/runtime/sys_openbsd_amd64.s +++ b/src/runtime/sys_openbsd_amd64.s @@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index e438599910..b0b4d3cce6 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0 // R14 is cleared in case there's a non-zero value in there // if called from a non-go thread. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 get_tls(AX) -- cgit v1.3 From 91253515831d1d51f9a998a743309c94e1fc4e1e Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Fri, 29 Aug 2025 20:33:19 -0400 Subject: [dev.simd] internal/cpu: report AVX1 and 2 as supported on macOS 15 Rosetta 2 Apparently, on macOS 15 or newer, Rosetta 2 supports AVX1 and 2. However, neither CPUID nor the Apple-recommended sysctl says it has AVX. If AVX is used without checking the CPU feature, it may run fine without SIGILL, but the runtime doesn't know AVX is available therefore save and restore its states. This may lead to value corruption. Check if we are running under Rosetta 2 on macOS 15 or newer. If so, report AVX1 and 2 as supported. 
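For illustration only, the same two probes can be made from ordinary Go code on darwin/amd64; this is a hedged sketch of the detection described above (the sysctl names and the macOS 15 == Darwin kernel 24 mapping are taken from this change), not the runtime's implementation:

    package main

    import (
        "fmt"
        "strconv"
        "strings"
        "syscall"
    )

    func main() {
        // sysctl.proc_translated is 1 when this process runs under Rosetta 2.
        translated, err := syscall.SysctlUint32("sysctl.proc_translated")
        rosetta := err == nil && translated == 1

        // kern.osrelease is the Darwin kernel version, e.g. "24.x.y" on macOS 15.
        release, _ := syscall.Sysctl("kern.osrelease")
        major, _ := strconv.Atoi(strings.SplitN(release, ".", 2)[0])

        fmt.Printf("rosetta=%v darwin=%d assumeAVX=%v\n", rosetta, major, rosetta && major >= 24)
    }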
Change-Id: Ib981379405b1ae28faa378f051096827d760a4cc Reviewed-on: https://go-review.googlesource.com/c/go/+/700055 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao --- src/internal/cpu/cpu_arm64_darwin.go | 23 -------- src/internal/cpu/cpu_darwin.go | 72 +++++++++++++++++++++++++ src/internal/cpu/cpu_x86.go | 5 ++ src/internal/cpu/cpu_x86_darwin.go | 23 ++++++++ src/internal/cpu/cpu_x86_other.go | 9 ++++ src/runtime/cpuflags_amd64_test.go | 19 +++++++ src/runtime/export_test.go | 2 + src/runtime/os_darwin.go | 15 +++++- src/runtime/testdata/testprog/cpuflags_amd64.go | 18 +++++++ src/runtime/testdata/testprog/cpuflags_amd64.s | 9 ++++ 10 files changed, 170 insertions(+), 25 deletions(-) create mode 100644 src/internal/cpu/cpu_darwin.go create mode 100644 src/internal/cpu/cpu_x86_darwin.go create mode 100644 src/internal/cpu/cpu_x86_other.go create mode 100644 src/runtime/cpuflags_amd64_test.go create mode 100644 src/runtime/testdata/testprog/cpuflags_amd64.go create mode 100644 src/runtime/testdata/testprog/cpuflags_amd64.s (limited to 'src/runtime') diff --git a/src/internal/cpu/cpu_arm64_darwin.go b/src/internal/cpu/cpu_arm64_darwin.go index 28b47d60e8..bd89cd4e80 100644 --- a/src/internal/cpu/cpu_arm64_darwin.go +++ b/src/internal/cpu/cpu_arm64_darwin.go @@ -6,8 +6,6 @@ package cpu -import _ "unsafe" // for linkname - func osInit() { // macOS 12 moved these to the hw.optional.arm tree, but as of Go 1.24 we // still support macOS 11. See [Determine Encryption Capabilities]. @@ -29,24 +27,3 @@ func osInit() { ARM64.HasSHA1 = true ARM64.HasSHA2 = true } - -//go:noescape -func getsysctlbyname(name []byte) (int32, int32) - -// sysctlEnabled should be an internal detail, -// but widely used packages access it using linkname. -// Notable members of the hall of shame include: -// - github.com/bytedance/gopkg -// - github.com/songzhibin97/gkit -// -// Do not remove or change the type signature. -// See go.dev/issue/67401. -// -//go:linkname sysctlEnabled -func sysctlEnabled(name []byte) bool { - ret, value := getsysctlbyname(name) - if ret < 0 { - return false - } - return value > 0 -} diff --git a/src/internal/cpu/cpu_darwin.go b/src/internal/cpu/cpu_darwin.go new file mode 100644 index 0000000000..2d4ac54fc2 --- /dev/null +++ b/src/internal/cpu/cpu_darwin.go @@ -0,0 +1,72 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build darwin && !ios + +package cpu + +import _ "unsafe" // for linkname + +// Pushed from runtime. +// +//go:noescape +func sysctlbynameInt32(name []byte) (int32, int32) + +// Pushed from runtime. +// +//go:noescape +func sysctlbynameBytes(name, out []byte) int32 + +// sysctlEnabled should be an internal detail, +// but widely used packages access it using linkname. +// Notable members of the hall of shame include: +// - github.com/bytedance/gopkg +// - github.com/songzhibin97/gkit +// +// Do not remove or change the type signature. +// See go.dev/issue/67401. +// +//go:linkname sysctlEnabled +func sysctlEnabled(name []byte) bool { + ret, value := sysctlbynameInt32(name) + if ret < 0 { + return false + } + return value > 0 +} + +// darwinKernelVersionCheck reports if Darwin kernel version is at +// least major.minor.patch. +// +// Code borrowed from x/sys/cpu. 
+func darwinKernelVersionCheck(major, minor, patch int) bool { + var release [256]byte + ret := sysctlbynameBytes([]byte("kern.osrelease\x00"), release[:]) + if ret < 0 { + return false + } + + var mmp [3]int + c := 0 +Loop: + for _, b := range release[:] { + switch { + case b >= '0' && b <= '9': + mmp[c] = 10*mmp[c] + int(b-'0') + case b == '.': + c++ + if c > 2 { + return false + } + case b == 0: + break Loop + default: + return false + } + } + if c != 2 { + return false + } + return mmp[0] > major || mmp[0] == major && (mmp[1] > minor || mmp[1] == minor && mmp[2] >= patch) +} diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index f07fc82df1..ef1874ad68 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go @@ -114,6 +114,7 @@ func doinit() { maxID, _, _, _ := cpuid(0, 0) if maxID < 1 { + osInit() return } @@ -158,6 +159,7 @@ func doinit() { X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX if maxID < 7 { + osInit() return } @@ -194,6 +196,7 @@ func doinit() { maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0) if maxExtendedInformation < 0x80000001 { + osInit() return } @@ -217,6 +220,8 @@ func doinit() { X86.HasAVXVNNI = isSet(4, eax71) } } + + osInit() } func isSet(hwc uint32, value uint32) bool { diff --git a/src/internal/cpu/cpu_x86_darwin.go b/src/internal/cpu/cpu_x86_darwin.go new file mode 100644 index 0000000000..12380a7802 --- /dev/null +++ b/src/internal/cpu/cpu_x86_darwin.go @@ -0,0 +1,23 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (386 || amd64) && darwin && !ios + +package cpu + +func osInit() { + if isRosetta() && darwinKernelVersionCheck(24, 0, 0) { + // Apparently, on macOS 15 (Darwin kernel version 24) or newer, + // Rosetta 2 supports AVX1 and 2. However, neither CPUID nor + // sysctl says it has AVX. Detect this situation here and report + // AVX1 and 2 as supported. + // TODO: check if any other feature is actually supported. + X86.HasAVX = true + X86.HasAVX2 = true + } +} + +func isRosetta() bool { + return sysctlEnabled([]byte("sysctl.proc_translated\x00")) +} diff --git a/src/internal/cpu/cpu_x86_other.go b/src/internal/cpu/cpu_x86_other.go new file mode 100644 index 0000000000..824131226c --- /dev/null +++ b/src/internal/cpu/cpu_x86_other.go @@ -0,0 +1,9 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (386 || amd64) && (!darwin || ios) + +package cpu + +func osInit() {} diff --git a/src/runtime/cpuflags_amd64_test.go b/src/runtime/cpuflags_amd64_test.go new file mode 100644 index 0000000000..f238e7fdf2 --- /dev/null +++ b/src/runtime/cpuflags_amd64_test.go @@ -0,0 +1,19 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "runtime" + "testing" +) + +func TestHasAVX(t *testing.T) { + t.Parallel() + output := runTestProg(t, "testprog", "CheckAVX") + ok := output == "OK\n" + if *runtime.X86HasAVX != ok { + t.Fatalf("x86HasAVX: %v, CheckAVX got:\n%s", *runtime.X86HasAVX, output) + } +} diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 1f55717f0a..fc77b535da 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -1940,3 +1940,5 @@ func (t *TraceStackTable) Reset() { func TraceStack(gp *G, tab *TraceStackTable) { traceStack(0, gp, (*traceStackTable)(tab)) } + +var X86HasAVX = &x86HasAVX diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index 0c7144e9d0..ab8aa8037b 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -157,11 +157,22 @@ func sysctlbynameInt32(name []byte) (int32, int32) { return ret, out } -//go:linkname internal_cpu_getsysctlbyname internal/cpu.getsysctlbyname -func internal_cpu_getsysctlbyname(name []byte) (int32, int32) { +func sysctlbynameBytes(name, out []byte) int32 { + nout := uintptr(len(out)) + ret := sysctlbyname(&name[0], &out[0], &nout, nil, 0) + return ret +} + +//go:linkname internal_cpu_sysctlbynameInt32 internal/cpu.sysctlbynameInt32 +func internal_cpu_sysctlbynameInt32(name []byte) (int32, int32) { return sysctlbynameInt32(name) } +//go:linkname internal_cpu_sysctlbynameBytes internal/cpu.sysctlbynameBytes +func internal_cpu_sysctlbynameBytes(name, out []byte) int32 { + return sysctlbynameBytes(name, out) +} + const ( _CTL_HW = 6 _HW_NCPU = 3 diff --git a/src/runtime/testdata/testprog/cpuflags_amd64.go b/src/runtime/testdata/testprog/cpuflags_amd64.go new file mode 100644 index 0000000000..d53eacbe99 --- /dev/null +++ b/src/runtime/testdata/testprog/cpuflags_amd64.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "fmt" + +func init() { + register("CheckAVX", CheckAVX) +} + +func CheckAVX() { + checkAVX() + fmt.Println("OK") +} + +func checkAVX() diff --git a/src/runtime/testdata/testprog/cpuflags_amd64.s b/src/runtime/testdata/testprog/cpuflags_amd64.s new file mode 100644 index 0000000000..1610c5729a --- /dev/null +++ b/src/runtime/testdata/testprog/cpuflags_amd64.s @@ -0,0 +1,9 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT ·checkAVX(SB), NOSPLIT|NOFRAME, $0-0 + VXORPS X1, X2, X3 + RET -- cgit v1.3 From 2b50ffe172ee638a88e2750481eaeeac7d3bedfa Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Mon, 22 Sep 2025 10:57:29 -0400 Subject: [dev.simd] cmd/compile: remove stores to unread parameters Currently, we remove stores to local variables that are not read. We don't do that for arguments. But arguments and locals are essentially the same. Arguments are passed by value, and are not expected to be read in the caller's frame. So we can remove the writes to them as well. One exception is the cgo_unsafe_arg directive, which makes all the arguments effectively address-taken. cgo_unsafe_arg implies ABI0, so we just skip ABI0 functions' arguments. 
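As a hedged illustration (hypothetical functions, not part of this change): a store to a
by-value parameter that is never read again and never address-taken can now be removed,
while an address-taken parameter keeps its stores:

    var sink *int

    // The write to a is dead: a is not read afterwards and its address never escapes.
    func dead(a [2]int) {
        a[1] = 123
    }

    // The write must stay: a's address is taken and escapes, so the store is observable.
    func live(a [2]int) {
        a[1] = 123
        sink = &a[1]
    }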
Change-Id: I8999fc50da6a87f22c1ec23e9a0c15483b6f7df8 Reviewed-on: https://go-review.googlesource.com/c/go/+/705815 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase Reviewed-by: Junyang Shao --- src/cmd/compile/internal/ssa/deadstore.go | 22 ++++++++++++++++++---- src/runtime/testdata/testprog/badtraceback.go | 2 ++ test/codegen/stack.go | 6 ++++++ 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'src/runtime') diff --git a/src/cmd/compile/internal/ssa/deadstore.go b/src/cmd/compile/internal/ssa/deadstore.go index 9e67e83399..d0adff788c 100644 --- a/src/cmd/compile/internal/ssa/deadstore.go +++ b/src/cmd/compile/internal/ssa/deadstore.go @@ -7,6 +7,7 @@ package ssa import ( "cmd/compile/internal/ir" "cmd/compile/internal/types" + "cmd/internal/obj" ) // dse does dead-store elimination on the Function. @@ -213,7 +214,7 @@ func elimDeadAutosGeneric(f *Func) { case OpAddr, OpLocalAddr: // Propagate the address if it points to an auto. n, ok := v.Aux.(*ir.Name) - if !ok || n.Class != ir.PAUTO { + if !ok || (n.Class != ir.PAUTO && !isABIInternalParam(f, n)) { return } if addr[v] == nil { @@ -224,7 +225,7 @@ func elimDeadAutosGeneric(f *Func) { case OpVarDef: // v should be eliminated if we eliminate the auto. n, ok := v.Aux.(*ir.Name) - if !ok || n.Class != ir.PAUTO { + if !ok || (n.Class != ir.PAUTO && !isABIInternalParam(f, n)) { return } if elim[v] == nil { @@ -240,7 +241,7 @@ func elimDeadAutosGeneric(f *Func) { // may not be used by the inline code, but will be used by // panic processing). n, ok := v.Aux.(*ir.Name) - if !ok || n.Class != ir.PAUTO { + if !ok || (n.Class != ir.PAUTO && !isABIInternalParam(f, n)) { return } if !used.Has(n) { @@ -373,7 +374,7 @@ func elimUnreadAutos(f *Func) { if !ok { continue } - if n.Class != ir.PAUTO { + if n.Class != ir.PAUTO && !isABIInternalParam(f, n) { continue } @@ -413,3 +414,16 @@ func elimUnreadAutos(f *Func) { store.Op = OpCopy } } + +// isABIInternalParam returns whether n is a parameter of an ABIInternal +// function. For dead store elimination, we can treat parameters the same +// way as autos. Storing to a parameter can be removed if it is not read +// or address-taken. +// +// We check ABI here because for a cgo_unsafe_arg function (which is ABI0), +// all the args are effectively address-taken, but not necessarily have +// an Addr or LocalAddr op. We could probably just check for cgo_unsafe_arg, +// but ABIInternal is mostly what matters. +func isABIInternalParam(f *Func, n *ir.Name) bool { + return n.Class == ir.PPARAM && f.ABISelf.Which() == obj.ABIInternal +} diff --git a/src/runtime/testdata/testprog/badtraceback.go b/src/runtime/testdata/testprog/badtraceback.go index 09aa2b877e..455118a543 100644 --- a/src/runtime/testdata/testprog/badtraceback.go +++ b/src/runtime/testdata/testprog/badtraceback.go @@ -44,6 +44,8 @@ func badLR2(arg int) { lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff)) *lrPtr = 0xbad + runtime.KeepAlive(lrPtr) // prevent dead store elimination + // Print a backtrace. This should include diagnostics for the // bad return PC and a hex dump. panic("backtrace") diff --git a/test/codegen/stack.go b/test/codegen/stack.go index 4e45d68f38..59284ae888 100644 --- a/test/codegen/stack.go +++ b/test/codegen/stack.go @@ -168,3 +168,9 @@ func getp1() *[4]int { func getp2() *[4]int { return nil } + +// Store to an argument without read can be removed. 
+func storeArg(a [2]int) { + // amd64:-`MOVQ\t\$123,.*\.a\+\d+\(SP\)` + a[1] = 123 +} -- cgit v1.3 From 25c36b95d1523f22d4c46ec237acc03e00540e0a Mon Sep 17 00:00:00 2001 From: David Chase Date: Fri, 19 Sep 2025 13:07:59 -0400 Subject: [dev.simd] simd, cmd/compile: add 128 bit select-from-pair Using this name until a better one appears: x.Select128FromPair(3, 2, y) Includes test for constant and variable case. Checks for unexpected immediates (using the zeroing flag, which is not supported for this intrinsic) and panics. Change-Id: I9249475d6572968c127b4ee9e00328d717c07578 Reviewed-on: https://go-review.googlesource.com/c/go/+/705496 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 2 + src/cmd/compile/internal/ir/symtab.go | 1 + src/cmd/compile/internal/ssa/_gen/simdAMD64.rules | 6 ++ src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go | 2 + .../compile/internal/ssa/_gen/simdgenericOps.go | 6 ++ src/cmd/compile/internal/ssa/opGen.go | 74 ++++++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 18 ++++++ src/cmd/compile/internal/ssagen/intrinsics.go | 26 +++++++- src/cmd/compile/internal/ssagen/simdintrinsics.go | 6 ++ src/cmd/compile/internal/ssagen/ssa.go | 1 + src/runtime/panic.go | 7 ++ src/simd/_gen/simdgen/gen_simdIntrinsics.go | 2 + src/simd/_gen/simdgen/gen_simdTypes.go | 9 +++ src/simd/_gen/simdgen/ops/Moves/categories.yaml | 8 ++- src/simd/_gen/simdgen/ops/Moves/go.yaml | 72 ++++++++++++++++++++- src/simd/_gen/unify/domain.go | 4 +- src/simd/internal/simd_test/simd_test.go | 74 ++++++++++++++++++++++ src/simd/ops_amd64.go | 56 ++++++++++++++++ 18 files changed, 369 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index d69740cd96..a4d2452435 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1053,6 +1053,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VGF2P8AFFINEINVQB128, ssa.OpAMD64VGF2P8AFFINEINVQB256, ssa.OpAMD64VGF2P8AFFINEINVQB512, + ssa.OpAMD64VPERM2F128256, + ssa.OpAMD64VPERM2I128256, ssa.OpAMD64VINSERTF128256, ssa.OpAMD64VINSERTF64X4512, ssa.OpAMD64VINSERTI128256, diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index 2222a5444a..0cfa2a2262 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -45,6 +45,7 @@ type symsStruct struct { PanicdottypeI *obj.LSym Panicnildottype *obj.LSym Panicoverflow *obj.LSym + PanicSimdImm *obj.LSym Racefuncenter *obj.LSym Racefuncexit *obj.LSym Raceread *obj.LSym diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 9db223c04f..1eab8b5e6d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -938,6 +938,12 @@ (ScaleFloat64x2 ...) => (VSCALEFPD128 ...) (ScaleFloat64x4 ...) => (VSCALEFPD256 ...) (ScaleFloat64x8 ...) => (VSCALEFPD512 ...) +(Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...) +(Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...) +(Select128FromPairInt32x8 ...) => (VPERM2I128256 ...) +(Select128FromPairInt64x4 ...) => (VPERM2I128256 ...) +(Select128FromPairUint32x8 ...) => (VPERM2I128256 ...) +(Select128FromPairUint64x4 ...) => (VPERM2I128256 ...) (SetElemFloat32x4 ...) => (VPINSRD128 ...) (SetElemFloat64x2 ...) => (VPINSRQ128 ...) (SetElemInt8x16 ...) 
=> (VPINSRB128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index ba91fb3fc9..5e1da3249f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -1212,6 +1212,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPERM2F128256", argLength: 2, reg: v21, asm: "VPERM2F128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPERM2I128256", argLength: 2, reg: v21, asm: "VPERM2I128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRQ128", argLength: 2, reg: vgpv, asm: "VPINSRQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 81a1dff137..aa088dbf0b 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1199,6 +1199,12 @@ func simdGenericOps() []opData { {name: "RoundToEvenScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RoundToEvenScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RoundToEvenScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint64x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemFloat64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 792a1ca08f..105d1a803c 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2444,6 +2444,8 @@ const ( OpAMD64VPRORQMasked128 OpAMD64VPRORQMasked256 OpAMD64VPRORQMasked512 + OpAMD64VPERM2F128256 + OpAMD64VPERM2I128256 OpAMD64VPINSRD128 OpAMD64VPINSRQ128 OpAMD64VPINSRB128 @@ -6594,6 +6596,12 @@ const ( OpRoundToEvenScaledResidueFloat64x2 OpRoundToEvenScaledResidueFloat64x4 OpRoundToEvenScaledResidueFloat64x8 + OpSelect128FromPairFloat32x8 + OpSelect128FromPairFloat64x4 + OpSelect128FromPairInt32x8 + OpSelect128FromPairInt64x4 + OpSelect128FromPairUint32x8 + 
OpSelect128FromPairUint64x4 OpSetElemFloat32x4 OpSetElemFloat64x2 OpSetElemInt8x16 @@ -37656,6 +37664,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPERM2F128256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPERM2F128, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPERM2I128256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPERM2I128, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPINSRD128", auxType: auxUInt8, @@ -82360,6 +82398,42 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "Select128FromPairFloat32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairFloat64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairInt32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairInt64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairUint32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairUint64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, { name: "SetElemFloat32x4", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index ca9f9ae17b..bc611fc44c 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -4991,6 +4991,24 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSelect0(v) case OpSelect1: return rewriteValueAMD64_OpSelect1(v) + case OpSelect128FromPairFloat32x8: + v.Op = OpAMD64VPERM2F128256 + return true + case OpSelect128FromPairFloat64x4: + v.Op = OpAMD64VPERM2F128256 + return true + case OpSelect128FromPairInt32x8: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairInt64x4: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairUint32x8: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairUint64x4: + v.Op = OpAMD64VPERM2I128256 + return true case OpSelectN: return rewriteValueAMD64_OpSelectN(v) case OpSetElemFloat32x4: diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 985d899a71..4c5cd9ef2c 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1842,7 +1842,9 @@ func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp fu for i, t := range targets { s.startBlock(t) genOp(s, i) - t.AddEdgeTo(bEnd) + if t.Kind != ssa.BlockExit { + t.AddEdgeTo(bEnd) + } s.endBlock() } @@ -1899,6 +1901,28 @@ func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.Ca } } +// Two immediates instead of just 1. Offset is ignored, so it is a _ parameter instead. 
+func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 { + i1, i2 := args[1].AuxInt, args[2].AuxInt + return s.newValue2I(op, t, i1+i2<<4, args[0], args[3]) + } + four := s.constInt64(types.Types[types.TUINT8], 4) + shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four) + combined := s.newValue2(ssa.OpAdd8, types.Types[types.TUINT8], args[1], shifted) + return immJumpTable(s, combined, n, func(sNew *state, idx int) { + // Encode as int8 due to requirement of AuxInt, check its comment for details. + // TODO for "zeroing" values, panic instead. + if idx & ^(3+3<<4) == 0 { + s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx)), args[0], args[3]) + } else { + sNew.rtcall(ir.Syms.PanicSimdImm, false, nil) + } + }) + } +} + func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if args[2].Op == ssa.OpConst8 { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 41858a7745..a62b3882c3 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -950,6 +950,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Float64x2.SetElem", opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64) diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 57129817f6..37aad360f2 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -141,6 +141,7 @@ func InitConfig() { ir.Syms.Panicnildottype = typecheck.LookupRuntimeFunc("panicnildottype") ir.Syms.Panicoverflow = typecheck.LookupRuntimeFunc("panicoverflow") ir.Syms.Panicshift = typecheck.LookupRuntimeFunc("panicshift") + ir.Syms.PanicSimdImm = typecheck.LookupRuntimeFunc("panicSimdImm") ir.Syms.Racefuncenter = 
typecheck.LookupRuntimeFunc("racefuncenter") ir.Syms.Racefuncexit = typecheck.LookupRuntimeFunc("racefuncexit") ir.Syms.Raceread = typecheck.LookupRuntimeFunc("raceread") diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 8c91c9435a..d7bce70fe5 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -341,6 +341,13 @@ func panicmemAddr(addr uintptr) { panic(errorAddressString{msg: "invalid memory address or nil pointer dereference", addr: addr}) } +var simdImmError = error(errorString("out-of-range immediate for simd intrinsic")) + +func panicSimdImm() { + panicCheck2("simd immediate error") + panic(simdImmError) +} + // Create a new deferred function fn, which has no arguments and results. // The compiler turns a defer statement into a call to this. func deferproc(fn func()) { diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go index 353bc46b31..4b27f7ce5f 100644 --- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go +++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go @@ -56,6 +56,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . {{end}} {{define "op2Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) {{end}} +{{define "op2Imm8_II"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) +{{end}} {{define "op3Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) {{end}} {{define "op3Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index 0d5d08b7ed..8944c35cad 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -354,6 +354,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y" func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8) {{.GoType}} {{end}} +{{define "op2Imm8_II"}} +{{if .Documentation}}{{.Documentation}} +//{{end}} +// {{.ImmName}} result in better performance when they are constants, non-constant values will be translated into a jump table. +// {{.ImmName}} should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}} +func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}} +{{end}} {{define "op3Imm8"}} {{if .Documentation}}{{.Documentation}} diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index e9a7fef202..0c733e12ee 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -174,4 +174,10 @@ // then 1, selecting element 1 from x's upper 128 bits (9), then 1, // selecting element 1 from y's upper 128 bits (11). // This differs from the same method applied to a 32x8 vector, where - // the 8-bit constant performs the same selection on both subvectors. \ No newline at end of file + // the 8-bit constant performs the same selection on both subvectors. 
+ +- go: Select128FromPair + commutative: false + documentation: !string |- + // NAME selects the low and high 128-bit halves from the 128-bit halves + // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 46599b7bd7..495b9ed6fa 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -721,7 +721,6 @@ out: - *v - - go: concatSelectedConstantGrouped asm: VSHUFPD in: @@ -771,3 +770,74 @@ inVariant: [] out: - *v + +- go: Select128FromPair + asm: VPERM2F128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: float + bits: 256 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2F128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: float + bits: 256 + OverwriteElementBits: 32 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 64 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 32 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v diff --git a/src/simd/_gen/unify/domain.go b/src/simd/_gen/unify/domain.go index 1e0f2be63d..8eb5deab2b 100644 --- a/src/simd/_gen/unify/domain.go +++ b/src/simd/_gen/unify/domain.go @@ -106,8 +106,8 @@ func (b *DefBuilder) Add(name string, v *Value) { if b.fields == nil { b.fields = make(map[string]*Value) } - if _, ok := b.fields[name]; ok { - panic(fmt.Sprintf("duplicate field %q", name)) + if old, ok := b.fields[name]; ok { + panic(fmt.Sprintf("duplicate field %q, added value is %v, old value is %v", name, v, old)) } b.fields[name] = v } diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 6deadde45e..e38f7eea01 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -815,3 +815,77 @@ func TestSelectFromPairConstGroupedUint32x16(t *testing.T) { foo(lhhl, 0, 4, 5, 1) foo(hllh, 4, 0, 1, 5) } + +func TestSelect128FromPair(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := x.Select128FromPair(0, 0, y) + ab := x.Select128FromPair(0, 1, y) + bc := x.Select128FromPair(1, 2, y) + cd := x.Select128FromPair(2, 3, y) + da := x.Select128FromPair(3, 0, y) + dc := x.Select128FromPair(3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 2) +} + +func TestSelect128FromPairError(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + defer func() { + if r := recover(); r != nil { + t.Logf("Saw expected panic %v", r) + } + }() + _ = x.Select128FromPair(0, 4, y) + + t.Errorf("Should have panicked") +} + +//go:noinline +func select128FromPair(x simd.Uint64x4, lo, hi uint8, y simd.Uint64x4) simd.Uint64x4 { + return x.Select128FromPair(lo, hi, y) +} + 
+func TestSelect128FromPairVar(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := select128FromPair(x, 0, 0, y) + ab := select128FromPair(x, 0, 1, y) + bc := select128FromPair(x, 1, 2, y) + cd := select128FromPair(x, 2, 3, y) + da := select128FromPair(x, 3, 0, y) + dc := select128FromPair(x, 3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 2) + +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index a104601ed7..91e7d91842 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -5576,6 +5576,62 @@ func (x Float64x4) Scale(y Float64x4) Float64x4 // Asm: VSCALEFPD, CPU Feature: AVX512 func (x Float64x8) Scale(y Float64x8) Float64x8 +/* Select128FromPair */ + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2F128, CPU Feature: AVX +func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2F128, CPU Feature: AVX +func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. 
+// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4 + /* SetElem */ // SetElem sets a single constant-indexed element's value. -- cgit v1.3
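As a usage sketch of the API added in the last change above (assuming an AVX2-capable
amd64 machine and that the experimental simd package is enabled, e.g. via GOEXPERIMENT=simd):

    package main

    import (
        "fmt"
        "simd"
    )

    func main() {
        x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
        y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})

        // The 128-bit halves are numbered 0..3: x's low, x's high, y's low, y's high.
        // Pick x's high half for the result's low 128 bits and y's low half for its high 128 bits.
        z := x.Select128FromPair(1, 2, y)

        out := make([]uint64, 4)
        z.StoreSlice(out)
        fmt.Println(out) // [2 3 4 5]
    }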