From d1e7f49e3d1eb039d9d2aed3ba923459bd42aa7c Mon Sep 17 00:00:00 2001
From: jjpinto <jorgpinto@gmail.com>
Date: Tue, 6 Jan 2026 22:47:07 +0000
Subject: internal/trace: fix recorder.Write return value for header-only
 buffers

Fixes #77083

Change-Id: I9189d1e3a6efea8478224164e820f50c818abcd5
GitHub-Last-Rev: bb24cbda95f0b5b10aeae9a5ee8cbe215ba6d4eb
GitHub-Pull-Request: golang/go#77092
Reviewed-on: https://go-review.googlesource.com/c/go/+/734300
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Commit-Queue: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---
 src/runtime/trace/recorder.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/trace/recorder.go b/src/runtime/trace/recorder.go
index 4f2d3aa92a..a18d764141 100644
--- a/src/runtime/trace/recorder.go
+++ b/src/runtime/trace/recorder.go
@@ -39,7 +39,7 @@ func (w *recorder) Write(b []byte) (n int, err error) {
 		w.headerReceived = true
 	}
 	if len(b) == n {
-		return 0, nil
+		return n, nil
 	}
 	ba, nb, err := readBatch(b[n:]) // Every write from the runtime is guaranteed to be a complete batch.
 	if err != nil {
-- 
cgit v1.3


From 874d8b98eba8129559b97d2fdfa02ddeb88b95f9 Mon Sep 17 00:00:00 2001
From: Michael Matloob <matloob@golang.org>
Date: Tue, 6 Jan 2026 17:18:48 -0500
Subject: cmd/go/internal/work: decrement concurrentProcesses when action
 finishes

This fixes a bug where we only incremented concurrentProcesses but never
decremented it, causing us to run out of tokens and give all compiles
-c=1 after a point.

Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64_c2s16-perf_vs_parent,gotip-linux-amd64_c3h88-perf_vs_parent,gotip-linux-arm64_c4ah72-perf_vs_parent,gotip-linux-arm64_c4as16-perf_vs_parent
Change-Id: I41f4c1edb77004cbc1772d6d672045946a6a6964
Reviewed-on: https://go-review.googlesource.com/c/go/+/734260
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Michael Matloob <matloob@google.com>
TryBot-Bypass: Michael Matloob <matloob@google.com>
---
 src/cmd/go/internal/work/exec.go | 5 +++++
 src/cmd/go/internal/work/gc.go   | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/src/cmd/go/internal/work/exec.go b/src/cmd/go/internal/work/exec.go
index 654e9e9374..f2d1b1040b 100644
--- a/src/cmd/go/internal/work/exec.go
+++ b/src/cmd/go/internal/work/exec.go
@@ -248,6 +248,11 @@ func (b *Builder) Do(ctx context.Context, root *Action) {
 
 	wg.Wait()
 
+	if tokens != totalTokens || concurrentProcesses != 0 {
+		base.Fatalf("internal error: tokens not restored at end of build: tokens: %d, totalTokens: %d, concurrentProcesses: %d",
+			tokens, totalTokens, concurrentProcesses)
+	}
+
 	// Write action graph again, this time with timing information.
 	writeActionGraph()
 }
diff --git a/src/cmd/go/internal/work/gc.go b/src/cmd/go/internal/work/gc.go
index 9a5e6c924c..fc74715f22 100644
--- a/src/cmd/go/internal/work/gc.go
+++ b/src/cmd/go/internal/work/gc.go
@@ -227,6 +227,7 @@ func compilerConcurrency() (int, func()) {
 	return c, func() {
 		tokensMu.Lock()
 		defer tokensMu.Unlock()
+		concurrentProcesses--
 		tokens += c
 	}
 }
@@ -235,6 +236,7 @@ var maxCompilerConcurrency = runtime.GOMAXPROCS(0) // max value we will use for
 
 var (
 	tokensMu            sync.Mutex
+	totalTokens         int // total number of tokens: this is used for checking that we get them all back in the end
 	tokens              int // number of available tokens
 	concurrentProcesses int // number of currently running compiles
 )
@@ -246,6 +248,7 @@ func initCompilerConcurrencyPool() {
 	// than what it was when we capped the concurrency to 4.
 	oldConcurrencyCap := min(4, maxCompilerConcurrency)
 	tokens = oldConcurrencyCap * cfg.BuildP
+	totalTokens = tokens
 }
 
 // trimpath returns the -trimpath argument to use
-- 
cgit v1.3


From 28147b528312055b535c6a69d0d4492bd502e1b0 Mon Sep 17 00:00:00 2001
From: Michael Matloob <matloob@golang.org>
Date: Mon, 5 Jan 2026 12:23:14 -0500
Subject: cmd/go: guarantee a minimum of min(4,GOMAXPROCS) to compile -c

To allow this, we also increase the size of the pool to allow the
minimum number for each action, with an extra 2*GOMAXPROCS number of
tokens to boost -c when there are fewer concurrently running actions.
That means the pool will now have the size 6*GOMAXPROCS instead of the
previous 4*GOMAXPROCS.

The goal is to maintain the boosting behavior added by the pool, while
guarding from starving compiles when there are too few tokens left, so
that the value of -c is always at least min(4,GOMAXPROCS), which is what
it was set to before Go 1.26.

Cq-Include-Trybots: luci.golang.try:gotip-linux-arm64_c4as16-perf_vs_parent,gotip-linux-arm64_c4ah72-perf_vs_parent,gotip-linux-amd64_c3h88-perf_vs_parent,gotip-linux-amd64_c2s16-perf_vs_parent
Change-Id: I113a38584514a6c025d3d1bc727ff8d86a6a6964
Reviewed-on: https://go-review.googlesource.com/c/go/+/734040
Commit-Queue: Michael Matloob <matloob@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Bypass: Michael Matloob <matloob@google.com>
Reviewed-by: Michael Matloob <matloob@google.com>
---
 src/cmd/go/internal/work/gc.go | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/cmd/go/internal/work/gc.go b/src/cmd/go/internal/work/gc.go
index fc74715f22..6300a9135b 100644
--- a/src/cmd/go/internal/work/gc.go
+++ b/src/cmd/go/internal/work/gc.go
@@ -217,11 +217,11 @@ func compilerConcurrency() (int, func()) {
 	concurrentProcesses++
 	// Set aside tokens so that we don't run out if we were running cfg.BuildP concurrent compiles.
 	// We'll set aside one token for each of the action goroutines that aren't currently running a compile.
-	setAside := cfg.BuildP - concurrentProcesses
+	setAside := (cfg.BuildP - concurrentProcesses) * minTokens
 	availableTokens := tokens - setAside
-	// Grab half the remaining tokens: but with a floor of at least 1 token, and
+	// Grab half the remaining tokens: but with a floor of at least minTokens tokens, and
 	// a ceiling of the max backend concurrency.
-	c := max(min(availableTokens/2, maxCompilerConcurrency), 1)
+	c := max(min(availableTokens/2, maxCompilerConcurrency), minTokens)
 	tokens -= c
 	// Successfully grabbed the tokens.
 	return c, func() {
@@ -239,15 +239,18 @@ var (
 	totalTokens         int // total number of tokens: this is used for checking that we get them all back in the end
 	tokens              int // number of available tokens
 	concurrentProcesses int // number of currently running compiles
+	minTokens           int // minimum number of tokens to give out
 )
 
 // initCompilerConcurrencyPool sets the number of tokens in the pool. It needs
 // to be run after init, so that it can use the value of cfg.BuildP.
 func initCompilerConcurrencyPool() {
-	// Size the pool so that the worst case total number of compiles is not more
-	// than what it was when we capped the concurrency to 4.
-	oldConcurrencyCap := min(4, maxCompilerConcurrency)
-	tokens = oldConcurrencyCap * cfg.BuildP
+	// Size the pool to allow 2*maxCompilerConcurrency extra tokens to
+	// be distributed amongst the compile actions in addition to the minimum
+	// of min(4,GOMAXPROCS) tokens for each of the potentially cfg.BuildP
+	// concurrently running compile actions.
+	minTokens = min(4, maxCompilerConcurrency)
+	tokens = 2*maxCompilerConcurrency + minTokens*cfg.BuildP
 	totalTokens = tokens
 }
 
-- 
cgit v1.3


From 5facb3b24b1c388176572eb95239f94d6ed4017d Mon Sep 17 00:00:00 2001
From: Mark Freeman <mark@golang.org>
Date: Wed, 7 Jan 2026 16:40:53 -0500
Subject: internal/types: add test for cycles in value context

Exposition is also added to outline a difference between syntax which
can / cannot produce values of incomplete types.

For us to enforce non-nilness of type RHS and remove the pending type
mechanism, I suspect we would need to add completeness guards to
the syntax which *can*.

Enforcing non-nilness of type RHS currently breaks the below test
cases, but I suspect that is simply an implementation artifact.
In other words, they just call Underlying at a bad time.

  - T0
  - T3
  - T6 / T7
  - T10
  - T12

If we also remove pendingType, all of these test cases break; again,
we would need guards in the appropriate syntax logic.

Change-Id: Ibe22042232e542de1d38b923dd1d5cc50dce08cb
Reviewed-on: https://go-review.googlesource.com/c/go/+/734600
TryBot-Bypass: Mark Freeman <markfreeman@google.com>
Reviewed-by: Robert Griesemer <gri@google.com>
Auto-Submit: Mark Freeman <markfreeman@google.com>
---
 src/internal/types/testdata/check/cycles6.go | 71 ++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 src/internal/types/testdata/check/cycles6.go

diff --git a/src/internal/types/testdata/check/cycles6.go b/src/internal/types/testdata/check/cycles6.go
new file mode 100644
index 0000000000..e5635ed456
--- /dev/null
+++ b/src/internal/types/testdata/check/cycles6.go
@@ -0,0 +1,71 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package p
+
+import "unsafe"
+
+// Below are the pieces of syntax corresponding to functions which can produce a
+// type T without first having a value of type T. Notice that each causes a
+// value of type T to be passed to unsafe.Sizeof while T is incomplete.
+
+// literal on type
+type T0 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T0{})]int
+// literal on value                                                             (not applicable)
+// literal on pointer                                                           (not applicable)
+
+// call on type
+type T1 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T1(42))]int
+// call on value
+func f2() T2
+type T2 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(f2())]int
+// call on pointer                                                              (not applicable)
+
+// assert on type
+var i3 interface{}
+type T3 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(i3.(T3))]int
+// assert on value                                                              (not applicable)
+// assert on pointer                                                            (not applicable)
+
+// receive on type                                                              (not applicable)
+// receive on value
+func f4() <-chan T4
+type T4 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(<-f4())]int
+// receive on pointer                                                           (not applicable)
+
+// star on type                                                                 (not applicable)
+// star on value                                                                (not applicable)
+// star on pointer
+func f5() *T5
+type T5 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(*f5())]int
+
+// Below is additional syntax which interacts with incomplete types. Notice that
+// each of the below falls into 1 of 3 cases:
+//   1. It cannot produce a value of (incomplete) type T.
+//   2. It can, but only because it already has a value of type T.
+//   3. It can, but only because it performs an implicit dereference.
+
+// select on type                                                               (case 1)
+// select on value                                                              (case 2)
+type T6 /* ERROR "invalid recursive type" */ struct {
+	f T7
+}
+type T7 [unsafe.Sizeof(T6{}.f)]int
+// select on pointer                                                            (case 3)
+type T8 /* ERROR "invalid recursive type" */ struct {
+	f T9
+}
+type T9 [unsafe.Sizeof(new(T8).f)]int
+
+// slice on type                                                                (not applicable)
+// slice on value                                                               (case 2)
+type T10 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T10{}[:])]int
+// slice on pointer                                                             (case 3)
+type T11 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(new(T11)[:])]int
+
+// index on type                                                                (case 1)
+// index on value                                                               (case 2)
+type T12 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(T12{}[42])]int
+// index on pointer                                                             (case 3)
+type T13 /* ERROR "invalid recursive type" */ [unsafe.Sizeof(new(T13)[42])]int
-- 
cgit v1.3


From 8ac4477d83672af8c3d39399685731ee6b81ce2f Mon Sep 17 00:00:00 2001
From: Cherry Mui <cherryyz@google.com>
Date: Thu, 8 Jan 2026 11:57:28 -0500
Subject: simd/archsimd: rename Broadcast methods

Currently the Broadcast128/256/512 methods broadcast the lowest
element of the input vector to a vector of the corresponding width.
There are also variations of broadcast operations that broadcast
the whole (128- or 256-bit) vector to a larger vector, which we
don't yet support. Our current naming is unclear which version it
is, though. Rename the current ones to Broadcast1ToN, to be clear
that they broadcast one element. The vector version probably will
be named BroadcastAllToN (not included in this CL).

Change-Id: I47a21e367f948ec0b578d63706a40d20f5a9f46d
Reviewed-on: https://go-review.googlesource.com/c/go/+/734840
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
---
 src/cmd/compile/internal/amd64/simdssa.go          |  88 ++++----
 src/cmd/compile/internal/ssa/_gen/simdAMD64.rules  |  82 +++----
 .../compile/internal/ssa/_gen/simdgenericOps.go    |  60 +++---
 src/cmd/compile/internal/ssa/opGen.go              | 120 +++++------
 src/cmd/compile/internal/ssa/rewriteAMD64.go       | 112 +++++-----
 src/cmd/compile/internal/ssagen/simdintrinsics.go  |  60 +++---
 .../_gen/simdgen/ops/Moves/categories.yaml         |  33 ++-
 src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml   |  79 ++++---
 src/simd/archsimd/_gen/tmplgen/main.go             |   2 +-
 src/simd/archsimd/ops_amd64.go                     | 240 +++++++++++----------
 src/simd/archsimd/other_gen_amd64.go               |  60 +++---
 11 files changed, 490 insertions(+), 446 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index c4d0fd69c6..a028cbe86d 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -25,23 +25,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPABSQ128,
 		ssa.OpAMD64VPABSQ256,
 		ssa.OpAMD64VPABSQ512,
-		ssa.OpAMD64VBROADCASTSS128,
 		ssa.OpAMD64VPBROADCASTQ128,
-		ssa.OpAMD64VPBROADCASTB128,
-		ssa.OpAMD64VPBROADCASTW128,
+		ssa.OpAMD64VBROADCASTSS128,
+		ssa.OpAMD64VBROADCASTSD256,
 		ssa.OpAMD64VPBROADCASTD128,
+		ssa.OpAMD64VPBROADCASTQ256,
 		ssa.OpAMD64VBROADCASTSS256,
-		ssa.OpAMD64VBROADCASTSD256,
-		ssa.OpAMD64VPBROADCASTB256,
-		ssa.OpAMD64VPBROADCASTW256,
+		ssa.OpAMD64VBROADCASTSD512,
+		ssa.OpAMD64VPBROADCASTW128,
 		ssa.OpAMD64VPBROADCASTD256,
-		ssa.OpAMD64VPBROADCASTQ256,
+		ssa.OpAMD64VPBROADCASTQ512,
 		ssa.OpAMD64VBROADCASTSS512,
-		ssa.OpAMD64VBROADCASTSD512,
-		ssa.OpAMD64VPBROADCASTB512,
-		ssa.OpAMD64VPBROADCASTW512,
+		ssa.OpAMD64VPBROADCASTB128,
+		ssa.OpAMD64VPBROADCASTW256,
 		ssa.OpAMD64VPBROADCASTD512,
-		ssa.OpAMD64VPBROADCASTQ512,
+		ssa.OpAMD64VPBROADCASTB256,
+		ssa.OpAMD64VPBROADCASTW512,
+		ssa.OpAMD64VPBROADCASTB512,
 		ssa.OpAMD64VCVTPD2PSX128,
 		ssa.OpAMD64VCVTPD2PSY128,
 		ssa.OpAMD64VCVTPD2PS256,
@@ -832,23 +832,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPABSQMasked128,
 		ssa.OpAMD64VPABSQMasked256,
 		ssa.OpAMD64VPABSQMasked512,
-		ssa.OpAMD64VBROADCASTSSMasked128,
 		ssa.OpAMD64VPBROADCASTQMasked128,
-		ssa.OpAMD64VPBROADCASTBMasked128,
-		ssa.OpAMD64VPBROADCASTWMasked128,
+		ssa.OpAMD64VBROADCASTSSMasked128,
+		ssa.OpAMD64VBROADCASTSDMasked256,
 		ssa.OpAMD64VPBROADCASTDMasked128,
+		ssa.OpAMD64VPBROADCASTQMasked256,
 		ssa.OpAMD64VBROADCASTSSMasked256,
-		ssa.OpAMD64VBROADCASTSDMasked256,
-		ssa.OpAMD64VPBROADCASTBMasked256,
-		ssa.OpAMD64VPBROADCASTWMasked256,
+		ssa.OpAMD64VBROADCASTSDMasked512,
+		ssa.OpAMD64VPBROADCASTWMasked128,
 		ssa.OpAMD64VPBROADCASTDMasked256,
-		ssa.OpAMD64VPBROADCASTQMasked256,
+		ssa.OpAMD64VPBROADCASTQMasked512,
 		ssa.OpAMD64VBROADCASTSSMasked512,
-		ssa.OpAMD64VBROADCASTSDMasked512,
-		ssa.OpAMD64VPBROADCASTBMasked512,
-		ssa.OpAMD64VPBROADCASTWMasked512,
+		ssa.OpAMD64VPBROADCASTBMasked128,
+		ssa.OpAMD64VPBROADCASTWMasked256,
 		ssa.OpAMD64VPBROADCASTDMasked512,
-		ssa.OpAMD64VPBROADCASTQMasked512,
+		ssa.OpAMD64VPBROADCASTBMasked256,
+		ssa.OpAMD64VPBROADCASTWMasked512,
+		ssa.OpAMD64VPBROADCASTBMasked512,
 		ssa.OpAMD64VCOMPRESSPSMasked128,
 		ssa.OpAMD64VCOMPRESSPSMasked256,
 		ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -2460,23 +2460,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPABSQMasked128Merging,
 		ssa.OpAMD64VPABSQMasked256Merging,
 		ssa.OpAMD64VPABSQMasked512Merging,
-		ssa.OpAMD64VBROADCASTSSMasked128Merging,
 		ssa.OpAMD64VPBROADCASTQMasked128Merging,
-		ssa.OpAMD64VPBROADCASTBMasked128Merging,
-		ssa.OpAMD64VPBROADCASTWMasked128Merging,
+		ssa.OpAMD64VBROADCASTSSMasked128Merging,
+		ssa.OpAMD64VBROADCASTSDMasked256Merging,
 		ssa.OpAMD64VPBROADCASTDMasked128Merging,
+		ssa.OpAMD64VPBROADCASTQMasked256Merging,
 		ssa.OpAMD64VBROADCASTSSMasked256Merging,
-		ssa.OpAMD64VBROADCASTSDMasked256Merging,
-		ssa.OpAMD64VPBROADCASTBMasked256Merging,
-		ssa.OpAMD64VPBROADCASTWMasked256Merging,
+		ssa.OpAMD64VBROADCASTSDMasked512Merging,
+		ssa.OpAMD64VPBROADCASTWMasked128Merging,
 		ssa.OpAMD64VPBROADCASTDMasked256Merging,
-		ssa.OpAMD64VPBROADCASTQMasked256Merging,
+		ssa.OpAMD64VPBROADCASTQMasked512Merging,
 		ssa.OpAMD64VBROADCASTSSMasked512Merging,
-		ssa.OpAMD64VBROADCASTSDMasked512Merging,
-		ssa.OpAMD64VPBROADCASTBMasked512Merging,
-		ssa.OpAMD64VPBROADCASTWMasked512Merging,
+		ssa.OpAMD64VPBROADCASTBMasked128Merging,
+		ssa.OpAMD64VPBROADCASTWMasked256Merging,
 		ssa.OpAMD64VPBROADCASTDMasked512Merging,
-		ssa.OpAMD64VPBROADCASTQMasked512Merging,
+		ssa.OpAMD64VPBROADCASTBMasked256Merging,
+		ssa.OpAMD64VPBROADCASTWMasked512Merging,
+		ssa.OpAMD64VPBROADCASTBMasked512Merging,
 		ssa.OpAMD64VRNDSCALEPSMasked128Merging,
 		ssa.OpAMD64VRNDSCALEPSMasked256Merging,
 		ssa.OpAMD64VRNDSCALEPSMasked512Merging,
@@ -2817,23 +2817,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPAVGWMasked128,
 		ssa.OpAMD64VPAVGWMasked256,
 		ssa.OpAMD64VPAVGWMasked512,
-		ssa.OpAMD64VBROADCASTSSMasked128,
 		ssa.OpAMD64VPBROADCASTQMasked128,
-		ssa.OpAMD64VPBROADCASTBMasked128,
-		ssa.OpAMD64VPBROADCASTWMasked128,
+		ssa.OpAMD64VBROADCASTSSMasked128,
+		ssa.OpAMD64VBROADCASTSDMasked256,
 		ssa.OpAMD64VPBROADCASTDMasked128,
+		ssa.OpAMD64VPBROADCASTQMasked256,
 		ssa.OpAMD64VBROADCASTSSMasked256,
-		ssa.OpAMD64VBROADCASTSDMasked256,
-		ssa.OpAMD64VPBROADCASTBMasked256,
-		ssa.OpAMD64VPBROADCASTWMasked256,
+		ssa.OpAMD64VBROADCASTSDMasked512,
+		ssa.OpAMD64VPBROADCASTWMasked128,
 		ssa.OpAMD64VPBROADCASTDMasked256,
-		ssa.OpAMD64VPBROADCASTQMasked256,
+		ssa.OpAMD64VPBROADCASTQMasked512,
 		ssa.OpAMD64VBROADCASTSSMasked512,
-		ssa.OpAMD64VBROADCASTSDMasked512,
-		ssa.OpAMD64VPBROADCASTBMasked512,
-		ssa.OpAMD64VPBROADCASTWMasked512,
+		ssa.OpAMD64VPBROADCASTBMasked128,
+		ssa.OpAMD64VPBROADCASTWMasked256,
 		ssa.OpAMD64VPBROADCASTDMasked512,
-		ssa.OpAMD64VPBROADCASTQMasked512,
+		ssa.OpAMD64VPBROADCASTBMasked256,
+		ssa.OpAMD64VPBROADCASTWMasked512,
+		ssa.OpAMD64VPBROADCASTBMasked512,
 		ssa.OpAMD64VRNDSCALEPSMasked128,
 		ssa.OpAMD64VRNDSCALEPSMasked128load,
 		ssa.OpAMD64VRNDSCALEPSMasked256,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 5c83f39a1f..799461610d 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -140,36 +140,36 @@
 (AverageUint16x8 ...) => (VPAVGW128 ...)
 (AverageUint16x16 ...) => (VPAVGW256 ...)
 (AverageUint16x32 ...) => (VPAVGW512 ...)
-(Broadcast128Float32x4 ...) => (VBROADCASTSS128 ...)
-(Broadcast128Float64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast128Int8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast128Int16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast128Int32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast128Int64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast128Uint8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast128Uint16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast128Uint32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast128Uint64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast256Float32x4 ...) => (VBROADCASTSS256 ...)
-(Broadcast256Float64x2 ...) => (VBROADCASTSD256 ...)
-(Broadcast256Int8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast256Int16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast256Int32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast256Int64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast256Uint8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast256Uint16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast256Uint32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast256Uint64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast512Float32x4 ...) => (VBROADCASTSS512 ...)
-(Broadcast512Float64x2 ...) => (VBROADCASTSD512 ...)
-(Broadcast512Int8x16 ...) => (VPBROADCASTB512 ...)
-(Broadcast512Int16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast512Int32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast512Int64x2 ...) => (VPBROADCASTQ512 ...)
-(Broadcast512Uint8x16 ...) => (VPBROADCASTB512 ...)
-(Broadcast512Uint16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast512Uint32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast512Uint64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...)
+(Broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...)
+(Broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...)
+(Broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...)
+(Broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...)
+(Broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...)
+(Broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...)
+(Broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...)
+(Broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...)
+(Broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...)
+(Broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...)
+(Broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...)
+(Broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...)
+(Broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...)
+(Broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...)
+(Broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...)
+(Broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...)
+(Broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...)
+(Broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...)
+(Broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...)
+(Broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...)
+(Broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...)
+(Broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...)
+(Broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...)
+(Broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...)
 (CeilFloat32x4 x) => (VROUNDPS128 [2] x)
 (CeilFloat32x8 x) => (VROUNDPS256 [2] x)
 (CeilFloat64x2 x) => (VROUNDPD128 [2] x)
@@ -1424,23 +1424,23 @@
 (VMOVDQU16Masked128 (VPAVGW128 x y) mask) => (VPAVGWMasked128 x y mask)
 (VMOVDQU16Masked256 (VPAVGW256 x y) mask) => (VPAVGWMasked256 x y mask)
 (VMOVDQU16Masked512 (VPAVGW512 x y) mask) => (VPAVGWMasked512 x y mask)
-(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask)
 (VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask) => (VPBROADCASTQMasked128 x mask)
-(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask)
-(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask)
+(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask)
+(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask)
 (VMOVDQU32Masked128 (VPBROADCASTD128 x) mask) => (VPBROADCASTDMasked128 x mask)
+(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask)
 (VMOVDQU32Masked256 (VBROADCASTSS256 x) mask) => (VBROADCASTSSMasked256 x mask)
-(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask)
-(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask)
-(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask)
+(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask)
+(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask)
 (VMOVDQU32Masked256 (VPBROADCASTD256 x) mask) => (VPBROADCASTDMasked256 x mask)
-(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask)
+(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask)
 (VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) => (VBROADCASTSSMasked512 x mask)
-(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask)
-(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask)
-(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask)
+(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask)
+(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask)
 (VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) => (VPBROADCASTDMasked512 x mask)
-(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask)
+(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask)
+(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask)
+(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask)
 (VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask) => (VRNDSCALEPSMasked128 [a] x mask)
 (VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask) => (VRNDSCALEPSMasked256 [a] x mask)
 (VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) => (VRNDSCALEPSMasked512 [a] x mask)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 889ab0d84f..ff863a389f 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -143,36 +143,36 @@ func simdGenericOps() []opData {
 		{name: "AverageUint16x8", argLength: 2, commutative: true},
 		{name: "AverageUint16x16", argLength: 2, commutative: true},
 		{name: "AverageUint16x32", argLength: 2, commutative: true},
-		{name: "Broadcast128Float32x4", argLength: 1, commutative: false},
-		{name: "Broadcast128Float64x2", argLength: 1, commutative: false},
-		{name: "Broadcast128Int8x16", argLength: 1, commutative: false},
-		{name: "Broadcast128Int16x8", argLength: 1, commutative: false},
-		{name: "Broadcast128Int32x4", argLength: 1, commutative: false},
-		{name: "Broadcast128Int64x2", argLength: 1, commutative: false},
-		{name: "Broadcast128Uint8x16", argLength: 1, commutative: false},
-		{name: "Broadcast128Uint16x8", argLength: 1, commutative: false},
-		{name: "Broadcast128Uint32x4", argLength: 1, commutative: false},
-		{name: "Broadcast128Uint64x2", argLength: 1, commutative: false},
-		{name: "Broadcast256Float32x4", argLength: 1, commutative: false},
-		{name: "Broadcast256Float64x2", argLength: 1, commutative: false},
-		{name: "Broadcast256Int8x16", argLength: 1, commutative: false},
-		{name: "Broadcast256Int16x8", argLength: 1, commutative: false},
-		{name: "Broadcast256Int32x4", argLength: 1, commutative: false},
-		{name: "Broadcast256Int64x2", argLength: 1, commutative: false},
-		{name: "Broadcast256Uint8x16", argLength: 1, commutative: false},
-		{name: "Broadcast256Uint16x8", argLength: 1, commutative: false},
-		{name: "Broadcast256Uint32x4", argLength: 1, commutative: false},
-		{name: "Broadcast256Uint64x2", argLength: 1, commutative: false},
-		{name: "Broadcast512Float32x4", argLength: 1, commutative: false},
-		{name: "Broadcast512Float64x2", argLength: 1, commutative: false},
-		{name: "Broadcast512Int8x16", argLength: 1, commutative: false},
-		{name: "Broadcast512Int16x8", argLength: 1, commutative: false},
-		{name: "Broadcast512Int32x4", argLength: 1, commutative: false},
-		{name: "Broadcast512Int64x2", argLength: 1, commutative: false},
-		{name: "Broadcast512Uint8x16", argLength: 1, commutative: false},
-		{name: "Broadcast512Uint16x8", argLength: 1, commutative: false},
-		{name: "Broadcast512Uint32x4", argLength: 1, commutative: false},
-		{name: "Broadcast512Uint64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To2Float64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To2Int64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To2Uint64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To4Float32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To4Float64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To4Int32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To4Int64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To4Uint32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To4Uint64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Float32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Float64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Int16x8", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Int32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Int64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Uint16x8", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Uint32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To8Uint64x2", argLength: 1, commutative: false},
+		{name: "Broadcast1To16Float32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To16Int8x16", argLength: 1, commutative: false},
+		{name: "Broadcast1To16Int16x8", argLength: 1, commutative: false},
+		{name: "Broadcast1To16Int32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To16Uint8x16", argLength: 1, commutative: false},
+		{name: "Broadcast1To16Uint16x8", argLength: 1, commutative: false},
+		{name: "Broadcast1To16Uint32x4", argLength: 1, commutative: false},
+		{name: "Broadcast1To32Int8x16", argLength: 1, commutative: false},
+		{name: "Broadcast1To32Int16x8", argLength: 1, commutative: false},
+		{name: "Broadcast1To32Uint8x16", argLength: 1, commutative: false},
+		{name: "Broadcast1To32Uint16x8", argLength: 1, commutative: false},
+		{name: "Broadcast1To64Int8x16", argLength: 1, commutative: false},
+		{name: "Broadcast1To64Uint8x16", argLength: 1, commutative: false},
 		{name: "CeilFloat32x4", argLength: 1, commutative: false},
 		{name: "CeilFloat32x8", argLength: 1, commutative: false},
 		{name: "CeilFloat64x2", argLength: 1, commutative: false},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 7b70dc2686..9e5fdb1fc1 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -6309,36 +6309,36 @@ const (
 	OpAverageUint16x8
 	OpAverageUint16x16
 	OpAverageUint16x32
-	OpBroadcast128Float32x4
-	OpBroadcast128Float64x2
-	OpBroadcast128Int8x16
-	OpBroadcast128Int16x8
-	OpBroadcast128Int32x4
-	OpBroadcast128Int64x2
-	OpBroadcast128Uint8x16
-	OpBroadcast128Uint16x8
-	OpBroadcast128Uint32x4
-	OpBroadcast128Uint64x2
-	OpBroadcast256Float32x4
-	OpBroadcast256Float64x2
-	OpBroadcast256Int8x16
-	OpBroadcast256Int16x8
-	OpBroadcast256Int32x4
-	OpBroadcast256Int64x2
-	OpBroadcast256Uint8x16
-	OpBroadcast256Uint16x8
-	OpBroadcast256Uint32x4
-	OpBroadcast256Uint64x2
-	OpBroadcast512Float32x4
-	OpBroadcast512Float64x2
-	OpBroadcast512Int8x16
-	OpBroadcast512Int16x8
-	OpBroadcast512Int32x4
-	OpBroadcast512Int64x2
-	OpBroadcast512Uint8x16
-	OpBroadcast512Uint16x8
-	OpBroadcast512Uint32x4
-	OpBroadcast512Uint64x2
+	OpBroadcast1To2Float64x2
+	OpBroadcast1To2Int64x2
+	OpBroadcast1To2Uint64x2
+	OpBroadcast1To4Float32x4
+	OpBroadcast1To4Float64x2
+	OpBroadcast1To4Int32x4
+	OpBroadcast1To4Int64x2
+	OpBroadcast1To4Uint32x4
+	OpBroadcast1To4Uint64x2
+	OpBroadcast1To8Float32x4
+	OpBroadcast1To8Float64x2
+	OpBroadcast1To8Int16x8
+	OpBroadcast1To8Int32x4
+	OpBroadcast1To8Int64x2
+	OpBroadcast1To8Uint16x8
+	OpBroadcast1To8Uint32x4
+	OpBroadcast1To8Uint64x2
+	OpBroadcast1To16Float32x4
+	OpBroadcast1To16Int8x16
+	OpBroadcast1To16Int16x8
+	OpBroadcast1To16Int32x4
+	OpBroadcast1To16Uint8x16
+	OpBroadcast1To16Uint16x8
+	OpBroadcast1To16Uint32x4
+	OpBroadcast1To32Int8x16
+	OpBroadcast1To32Int16x8
+	OpBroadcast1To32Uint8x16
+	OpBroadcast1To32Uint16x8
+	OpBroadcast1To64Int8x16
+	OpBroadcast1To64Uint8x16
 	OpCeilFloat32x4
 	OpCeilFloat32x8
 	OpCeilFloat64x2
@@ -89875,152 +89875,152 @@ var opcodeTable = [...]opInfo{
 		generic:     true,
 	},
 	{
-		name:    "Broadcast128Float32x4",
+		name:    "Broadcast1To2Float64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Float64x2",
+		name:    "Broadcast1To2Int64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Int8x16",
+		name:    "Broadcast1To2Uint64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Int16x8",
+		name:    "Broadcast1To4Float32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Int32x4",
+		name:    "Broadcast1To4Float64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Int64x2",
+		name:    "Broadcast1To4Int32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Uint8x16",
+		name:    "Broadcast1To4Int64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Uint16x8",
+		name:    "Broadcast1To4Uint32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Uint32x4",
+		name:    "Broadcast1To4Uint64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast128Uint64x2",
+		name:    "Broadcast1To8Float32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Float32x4",
+		name:    "Broadcast1To8Float64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Float64x2",
+		name:    "Broadcast1To8Int16x8",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Int8x16",
+		name:    "Broadcast1To8Int32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Int16x8",
+		name:    "Broadcast1To8Int64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Int32x4",
+		name:    "Broadcast1To8Uint16x8",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Int64x2",
+		name:    "Broadcast1To8Uint32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Uint8x16",
+		name:    "Broadcast1To8Uint64x2",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Uint16x8",
+		name:    "Broadcast1To16Float32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Uint32x4",
+		name:    "Broadcast1To16Int8x16",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast256Uint64x2",
+		name:    "Broadcast1To16Int16x8",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Float32x4",
+		name:    "Broadcast1To16Int32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Float64x2",
+		name:    "Broadcast1To16Uint8x16",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Int8x16",
+		name:    "Broadcast1To16Uint16x8",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Int16x8",
+		name:    "Broadcast1To16Uint32x4",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Int32x4",
+		name:    "Broadcast1To32Int8x16",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Int64x2",
+		name:    "Broadcast1To32Int16x8",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Uint8x16",
+		name:    "Broadcast1To32Uint8x16",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Uint16x8",
+		name:    "Broadcast1To32Uint16x8",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Uint32x4",
+		name:    "Broadcast1To64Int8x16",
 		argLen:  1,
 		generic: true,
 	},
 	{
-		name:    "Broadcast512Uint64x2",
+		name:    "Broadcast1To64Uint8x16",
 		argLen:  1,
 		generic: true,
 	},
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index e84bf19c83..fe0005bb05 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2479,96 +2479,96 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpBitLen64(v)
 	case OpBitLen8:
 		return rewriteValueAMD64_OpBitLen8(v)
-	case OpBroadcast128Float32x4:
-		v.Op = OpAMD64VBROADCASTSS128
+	case OpBroadcast1To16Float32x4:
+		v.Op = OpAMD64VBROADCASTSS512
 		return true
-	case OpBroadcast128Float64x2:
-		v.Op = OpAMD64VPBROADCASTQ128
+	case OpBroadcast1To16Int16x8:
+		v.Op = OpAMD64VPBROADCASTW256
 		return true
-	case OpBroadcast128Int16x8:
-		v.Op = OpAMD64VPBROADCASTW128
+	case OpBroadcast1To16Int32x4:
+		v.Op = OpAMD64VPBROADCASTD512
 		return true
-	case OpBroadcast128Int32x4:
-		v.Op = OpAMD64VPBROADCASTD128
+	case OpBroadcast1To16Int8x16:
+		v.Op = OpAMD64VPBROADCASTB128
 		return true
-	case OpBroadcast128Int64x2:
-		v.Op = OpAMD64VPBROADCASTQ128
+	case OpBroadcast1To16Uint16x8:
+		v.Op = OpAMD64VPBROADCASTW256
 		return true
-	case OpBroadcast128Int8x16:
+	case OpBroadcast1To16Uint32x4:
+		v.Op = OpAMD64VPBROADCASTD512
+		return true
+	case OpBroadcast1To16Uint8x16:
 		v.Op = OpAMD64VPBROADCASTB128
 		return true
-	case OpBroadcast128Uint16x8:
-		v.Op = OpAMD64VPBROADCASTW128
+	case OpBroadcast1To2Float64x2:
+		v.Op = OpAMD64VPBROADCASTQ128
 		return true
-	case OpBroadcast128Uint32x4:
-		v.Op = OpAMD64VPBROADCASTD128
+	case OpBroadcast1To2Int64x2:
+		v.Op = OpAMD64VPBROADCASTQ128
 		return true
-	case OpBroadcast128Uint64x2:
+	case OpBroadcast1To2Uint64x2:
 		v.Op = OpAMD64VPBROADCASTQ128
 		return true
-	case OpBroadcast128Uint8x16:
-		v.Op = OpAMD64VPBROADCASTB128
+	case OpBroadcast1To32Int16x8:
+		v.Op = OpAMD64VPBROADCASTW512
 		return true
-	case OpBroadcast256Float32x4:
-		v.Op = OpAMD64VBROADCASTSS256
+	case OpBroadcast1To32Int8x16:
+		v.Op = OpAMD64VPBROADCASTB256
 		return true
-	case OpBroadcast256Float64x2:
-		v.Op = OpAMD64VBROADCASTSD256
+	case OpBroadcast1To32Uint16x8:
+		v.Op = OpAMD64VPBROADCASTW512
 		return true
-	case OpBroadcast256Int16x8:
-		v.Op = OpAMD64VPBROADCASTW256
+	case OpBroadcast1To32Uint8x16:
+		v.Op = OpAMD64VPBROADCASTB256
 		return true
-	case OpBroadcast256Int32x4:
-		v.Op = OpAMD64VPBROADCASTD256
+	case OpBroadcast1To4Float32x4:
+		v.Op = OpAMD64VBROADCASTSS128
 		return true
-	case OpBroadcast256Int64x2:
-		v.Op = OpAMD64VPBROADCASTQ256
+	case OpBroadcast1To4Float64x2:
+		v.Op = OpAMD64VBROADCASTSD256
 		return true
-	case OpBroadcast256Int8x16:
-		v.Op = OpAMD64VPBROADCASTB256
+	case OpBroadcast1To4Int32x4:
+		v.Op = OpAMD64VPBROADCASTD128
 		return true
-	case OpBroadcast256Uint16x8:
-		v.Op = OpAMD64VPBROADCASTW256
+	case OpBroadcast1To4Int64x2:
+		v.Op = OpAMD64VPBROADCASTQ256
 		return true
-	case OpBroadcast256Uint32x4:
-		v.Op = OpAMD64VPBROADCASTD256
+	case OpBroadcast1To4Uint32x4:
+		v.Op = OpAMD64VPBROADCASTD128
 		return true
-	case OpBroadcast256Uint64x2:
+	case OpBroadcast1To4Uint64x2:
 		v.Op = OpAMD64VPBROADCASTQ256
 		return true
-	case OpBroadcast256Uint8x16:
-		v.Op = OpAMD64VPBROADCASTB256
+	case OpBroadcast1To64Int8x16:
+		v.Op = OpAMD64VPBROADCASTB512
 		return true
-	case OpBroadcast512Float32x4:
-		v.Op = OpAMD64VBROADCASTSS512
+	case OpBroadcast1To64Uint8x16:
+		v.Op = OpAMD64VPBROADCASTB512
+		return true
+	case OpBroadcast1To8Float32x4:
+		v.Op = OpAMD64VBROADCASTSS256
 		return true
-	case OpBroadcast512Float64x2:
+	case OpBroadcast1To8Float64x2:
 		v.Op = OpAMD64VBROADCASTSD512
 		return true
-	case OpBroadcast512Int16x8:
-		v.Op = OpAMD64VPBROADCASTW512
+	case OpBroadcast1To8Int16x8:
+		v.Op = OpAMD64VPBROADCASTW128
 		return true
-	case OpBroadcast512Int32x4:
-		v.Op = OpAMD64VPBROADCASTD512
+	case OpBroadcast1To8Int32x4:
+		v.Op = OpAMD64VPBROADCASTD256
 		return true
-	case OpBroadcast512Int64x2:
+	case OpBroadcast1To8Int64x2:
 		v.Op = OpAMD64VPBROADCASTQ512
 		return true
-	case OpBroadcast512Int8x16:
-		v.Op = OpAMD64VPBROADCASTB512
-		return true
-	case OpBroadcast512Uint16x8:
-		v.Op = OpAMD64VPBROADCASTW512
+	case OpBroadcast1To8Uint16x8:
+		v.Op = OpAMD64VPBROADCASTW128
 		return true
-	case OpBroadcast512Uint32x4:
-		v.Op = OpAMD64VPBROADCASTD512
+	case OpBroadcast1To8Uint32x4:
+		v.Op = OpAMD64VPBROADCASTD256
 		return true
-	case OpBroadcast512Uint64x2:
+	case OpBroadcast1To8Uint64x2:
 		v.Op = OpAMD64VPBROADCASTQ512
 		return true
-	case OpBroadcast512Uint8x16:
-		v.Op = OpAMD64VPBROADCASTB512
-		return true
 	case OpBswap16:
 		return rewriteValueAMD64_OpBswap16(v)
 	case OpBswap32:
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 4ad0c6032c..e50561845b 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -152,36 +152,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint16x8.Average", opLen2(ssa.OpAverageUint16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint16x16.Average", opLen2(ssa.OpAverageUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x32.Average", opLen2(ssa.OpAverageUint16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float32x4.Broadcast128", opLen1(ssa.OpBroadcast128Float32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x2.Broadcast128", opLen1(ssa.OpBroadcast128Float64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x16.Broadcast128", opLen1(ssa.OpBroadcast128Int8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x8.Broadcast128", opLen1(ssa.OpBroadcast128Int16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x4.Broadcast128", opLen1(ssa.OpBroadcast128Int32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int64x2.Broadcast128", opLen1(ssa.OpBroadcast128Int64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x16.Broadcast128", opLen1(ssa.OpBroadcast128Uint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x8.Broadcast128", opLen1(ssa.OpBroadcast128Uint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x4.Broadcast128", opLen1(ssa.OpBroadcast128Uint32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint64x2.Broadcast128", opLen1(ssa.OpBroadcast128Uint64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x4.Broadcast256", opLen1(ssa.OpBroadcast256Float32x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float64x2.Broadcast256", opLen1(ssa.OpBroadcast256Float64x2, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x16.Broadcast256", opLen1(ssa.OpBroadcast256Int8x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x8.Broadcast256", opLen1(ssa.OpBroadcast256Int16x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int32x4.Broadcast256", opLen1(ssa.OpBroadcast256Int32x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int64x2.Broadcast256", opLen1(ssa.OpBroadcast256Int64x2, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint8x16.Broadcast256", opLen1(ssa.OpBroadcast256Uint8x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x8.Broadcast256", opLen1(ssa.OpBroadcast256Uint16x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint32x4.Broadcast256", opLen1(ssa.OpBroadcast256Uint32x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint64x2.Broadcast256", opLen1(ssa.OpBroadcast256Uint64x2, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x4.Broadcast512", opLen1(ssa.OpBroadcast512Float32x4, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float64x2.Broadcast512", opLen1(ssa.OpBroadcast512Float64x2, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int8x16.Broadcast512", opLen1(ssa.OpBroadcast512Int8x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.Broadcast512", opLen1(ssa.OpBroadcast512Int16x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int32x4.Broadcast512", opLen1(ssa.OpBroadcast512Int32x4, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int64x2.Broadcast512", opLen1(ssa.OpBroadcast512Int64x2, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint8x16.Broadcast512", opLen1(ssa.OpBroadcast512Uint8x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint16x8.Broadcast512", opLen1(ssa.OpBroadcast512Uint16x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint32x4.Broadcast512", opLen1(ssa.OpBroadcast512Uint32x4, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint64x2.Broadcast512", opLen1(ssa.OpBroadcast512Uint64x2, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float64x2, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int64x2, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float32x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float64x2, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int32x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int64x2, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Float32x4, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int16x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int32x4, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int8x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int16x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Int8x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
index 38bc9374cc..3cba01ef95 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
@@ -69,21 +69,36 @@
   documentation: !string |-
     // NAME performs an expansion on a vector x whose elements are packed to lower parts.
     // The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-- go: Broadcast128
+- go: Broadcast1To2
   commutative: false
   documentation: !string |-
-    // NAME copies element zero of its (128-bit) input to all elements of
-    // the 128-bit output vector.
-- go: Broadcast256
+    // NAME copies the lowest element of its input to all 2 elements of
+    // the output vector.
+- go: Broadcast1To4
   commutative: false
   documentation: !string |-
-    // NAME copies element zero of its (128-bit) input to all elements of
-    // the 256-bit output vector.
-- go: Broadcast512
+    // NAME copies the lowest element of its input to all 4 elements of
+    // the output vector.
+- go: Broadcast1To8
   commutative: false
   documentation: !string |-
-    // NAME copies element zero of its (128-bit) input to all elements of
-    // the 512-bit output vector.
+    // NAME copies the lowest element of its input to all 8 elements of
+    // the output vector.
+- go: Broadcast1To16
+  commutative: false
+  documentation: !string |-
+    // NAME copies the lowest element of its input to all 16 elements of
+    // the output vector.
+- go: Broadcast1To32
+  commutative: false
+  documentation: !string |-
+    // NAME copies the lowest element of its input to all 32 elements of
+    // the output vector.
+- go: Broadcast1To64
+  commutative: false
+  documentation: !string |-
+    // NAME copies the lowest element of its input to all 64 elements of
+    // the output vector.
 - go: PermuteOrZeroGrouped
   commutative: false
   documentation: !string |- # Detailed documentation will rely on the specific ops.
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
index e1fd184ed7..02daa2ea1e 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -376,21 +376,21 @@
   out:
   - *any
 
-- go: Broadcast128
-  asm: VPBROADCAST[BWDQ]
+- go: Broadcast1To2
+  asm: VPBROADCASTQ
   in:
   - class: vreg
     bits: 128
-    elemBits: $e
+    elemBits: 64
     base: $b
   out:
   - class: vreg
     bits: 128
-    elemBits: $e
+    elemBits: 64
     base: $b
 
 # weirdly, this one case on AVX2 is memory-operand-only
-- go: Broadcast128
+- go: Broadcast1To2
   asm: VPBROADCASTQ
   in:
   - class: vreg
@@ -405,71 +405,94 @@
     base: int
     OverwriteBase: float
 
-- go: Broadcast256
+- go: Broadcast1To4
   asm: VPBROADCAST[BWDQ]
   in:
   - class: vreg
     bits: 128
-    elemBits: $e
     base: $b
   out:
   - class: vreg
-    bits: 256
-    elemBits: $e
+    lanes: 4
     base: $b
 
-- go: Broadcast512
+- go: Broadcast1To8
   asm: VPBROADCAST[BWDQ]
   in:
   - class: vreg
     bits: 128
-    elemBits: $e
     base: $b
   out:
   - class: vreg
-    bits: 512
-    elemBits: $e
+    lanes: 8
     base: $b
 
-- go: Broadcast128
-  asm: VBROADCASTS[SD]
+- go: Broadcast1To16
+  asm: VPBROADCAST[BWDQ]
   in:
   - class: vreg
     bits: 128
-    elemBits: $e
     base: $b
   out:
   - class: vreg
-    bits: 128
-    elemBits: $e
+    lanes: 16
     base: $b
 
-- go: Broadcast256
-  asm: VBROADCASTS[SD]
+- go: Broadcast1To32
+  asm: VPBROADCAST[BWDQ]
   in:
   - class: vreg
     bits: 128
-    elemBits: $e
     base: $b
   out:
   - class: vreg
-    bits: 256
-    elemBits: $e
+    lanes: 32
     base: $b
 
-- go: Broadcast512
-  asm: VBROADCASTS[SD]
+- go: Broadcast1To64
+  asm: VPBROADCASTB
   in:
   - class: vreg
     bits: 128
-    elemBits: $e
     base: $b
   out:
   - class: vreg
-    bits: 512
-    elemBits: $e
+    lanes: 64
     base: $b
 
+- go: Broadcast1To4
+  asm: VBROADCASTS[SD]
+  in:
+  - class: vreg
+    bits: 128
+    base: float
+  out:
+  - class: vreg
+    lanes: 4
+    base: float
+
+- go: Broadcast1To8
+  asm: VBROADCASTS[SD]
+  in:
+  - class: vreg
+    bits: 128
+    base: float
+  out:
+  - class: vreg
+    lanes: 8
+    base: float
+
+- go: Broadcast1To16
+  asm: VBROADCASTS[SD]
+  in:
+  - class: vreg
+    bits: 128
+    base: float
+  out:
+  - class: vreg
+    lanes: 16
+    base: float
+
 # VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
 - go: PermuteOrZero
   asm: VPSHUFB
diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go
index 8db185e1e0..45338b765d 100644
--- a/src/simd/archsimd/_gen/tmplgen/main.go
+++ b/src/simd/archsimd/_gen/tmplgen/main.go
@@ -873,7 +873,7 @@ var broadcastTemplate = templateOf("Broadcast functions", `
 // Emulated, CPU Feature: {{.CPUfeatureBC}}
 func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
 	var z {{.As128BitVec }}
-	return z.SetElem(0, x).Broadcast{{.Vwidth}}()
+	return z.SetElem(0, x).Broadcast1To{{.Count}}()
 }
 `)
 
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index eba340c793..bb162c4ff9 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -805,191 +805,197 @@ func (x Uint16x16) Average(y Uint16x16) Uint16x16
 // Asm: VPAVGW, CPU Feature: AVX512
 func (x Uint16x32) Average(y Uint16x32) Uint16x32
 
-/* Broadcast128 */
+/* Broadcast1To2 */
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
 //
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast128() Float32x4
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Float64x2) Broadcast1To2() Float64x2
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Float64x2) Broadcast128() Float64x2
+func (x Int64x2) Broadcast1To2() Int64x2
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast128() Int8x16
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) Broadcast1To2() Uint64x2
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast128() Int16x8
+/* Broadcast1To4 */
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast128() Int32x4
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) Broadcast1To4() Float32x4
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast128() Int64x2
+// Asm: VBROADCASTSD, CPU Feature: AVX2
+func (x Float64x2) Broadcast1To4() Float64x4
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast128() Uint8x16
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) Broadcast1To4() Int32x4
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast128() Uint16x8
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) Broadcast1To4() Int64x4
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast128() Uint32x4
+func (x Uint32x4) Broadcast1To4() Uint32x4
 
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast128() Uint64x2
+func (x Uint64x2) Broadcast1To4() Uint64x4
 
-/* Broadcast256 */
+/* Broadcast1To8 */
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
 // Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast256() Float32x8
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX2
-func (x Float64x2) Broadcast256() Float64x4
+func (x Float32x4) Broadcast1To8() Float32x8
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast256() Int8x32
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) Broadcast1To8() Float64x8
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast256() Int16x16
+func (x Int16x8) Broadcast1To8() Int16x8
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast256() Int32x8
+func (x Int32x4) Broadcast1To8() Int32x8
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast256() Int64x4
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast256() Uint8x32
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) Broadcast1To8() Int64x8
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast256() Uint16x16
+func (x Uint16x8) Broadcast1To8() Uint16x8
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast256() Uint32x8
+func (x Uint32x4) Broadcast1To8() Uint32x8
 
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast256() Uint64x4
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) Broadcast1To8() Uint64x8
 
-/* Broadcast512 */
+/* Broadcast1To16 */
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
 //
 // Asm: VBROADCASTSS, CPU Feature: AVX512
-func (x Float32x4) Broadcast512() Float32x16
+func (x Float32x4) Broadcast1To16() Float32x16
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
 //
-// Asm: VBROADCASTSD, CPU Feature: AVX512
-func (x Float64x2) Broadcast512() Float64x8
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast1To16() Int8x16
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Int8x16) Broadcast512() Int8x64
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) Broadcast1To16() Int16x16
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Int16x8) Broadcast512() Int16x32
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) Broadcast1To16() Int32x16
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast1To16() Uint8x16
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) Broadcast1To16() Uint16x16
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Int32x4) Broadcast512() Int32x16
+func (x Uint32x4) Broadcast1To16() Uint32x16
+
+/* Broadcast1To32 */
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Int64x2) Broadcast512() Int64x8
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast1To32() Int8x32
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Uint8x16) Broadcast512() Uint8x64
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) Broadcast1To32() Int16x32
+
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast1To32() Uint8x32
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
 //
 // Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Uint16x8) Broadcast512() Uint16x32
+func (x Uint16x8) Broadcast1To32() Uint16x32
+
+/* Broadcast1To64 */
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Uint32x4) Broadcast512() Uint32x16
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) Broadcast1To64() Int8x64
 
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
+// Broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
 //
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Uint64x2) Broadcast512() Uint64x8
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) Broadcast1To64() Uint8x64
 
 /* Ceil */
 
diff --git a/src/simd/archsimd/other_gen_amd64.go b/src/simd/archsimd/other_gen_amd64.go
index 647001acce..c250dc2436 100644
--- a/src/simd/archsimd/other_gen_amd64.go
+++ b/src/simd/archsimd/other_gen_amd64.go
@@ -10,7 +10,7 @@ package archsimd
 // Emulated, CPU Feature: AVX2
 func BroadcastInt8x16(x int8) Int8x16 {
 	var z Int8x16
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To16()
 }
 
 // BroadcastInt16x8 returns a vector with the input
@@ -19,7 +19,7 @@ func BroadcastInt8x16(x int8) Int8x16 {
 // Emulated, CPU Feature: AVX2
 func BroadcastInt16x8(x int16) Int16x8 {
 	var z Int16x8
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // BroadcastInt32x4 returns a vector with the input
@@ -28,7 +28,7 @@ func BroadcastInt16x8(x int16) Int16x8 {
 // Emulated, CPU Feature: AVX2
 func BroadcastInt32x4(x int32) Int32x4 {
 	var z Int32x4
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To4()
 }
 
 // BroadcastInt64x2 returns a vector with the input
@@ -37,7 +37,7 @@ func BroadcastInt32x4(x int32) Int32x4 {
 // Emulated, CPU Feature: AVX2
 func BroadcastInt64x2(x int64) Int64x2 {
 	var z Int64x2
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To2()
 }
 
 // BroadcastUint8x16 returns a vector with the input
@@ -46,7 +46,7 @@ func BroadcastInt64x2(x int64) Int64x2 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint8x16(x uint8) Uint8x16 {
 	var z Uint8x16
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To16()
 }
 
 // BroadcastUint16x8 returns a vector with the input
@@ -55,7 +55,7 @@ func BroadcastUint8x16(x uint8) Uint8x16 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint16x8(x uint16) Uint16x8 {
 	var z Uint16x8
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // BroadcastUint32x4 returns a vector with the input
@@ -64,7 +64,7 @@ func BroadcastUint16x8(x uint16) Uint16x8 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint32x4(x uint32) Uint32x4 {
 	var z Uint32x4
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To4()
 }
 
 // BroadcastUint64x2 returns a vector with the input
@@ -73,7 +73,7 @@ func BroadcastUint32x4(x uint32) Uint32x4 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint64x2(x uint64) Uint64x2 {
 	var z Uint64x2
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To2()
 }
 
 // BroadcastFloat32x4 returns a vector with the input
@@ -82,7 +82,7 @@ func BroadcastUint64x2(x uint64) Uint64x2 {
 // Emulated, CPU Feature: AVX2
 func BroadcastFloat32x4(x float32) Float32x4 {
 	var z Float32x4
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To4()
 }
 
 // BroadcastFloat64x2 returns a vector with the input
@@ -91,7 +91,7 @@ func BroadcastFloat32x4(x float32) Float32x4 {
 // Emulated, CPU Feature: AVX2
 func BroadcastFloat64x2(x float64) Float64x2 {
 	var z Float64x2
-	return z.SetElem(0, x).Broadcast128()
+	return z.SetElem(0, x).Broadcast1To2()
 }
 
 // BroadcastInt8x32 returns a vector with the input
@@ -100,7 +100,7 @@ func BroadcastFloat64x2(x float64) Float64x2 {
 // Emulated, CPU Feature: AVX2
 func BroadcastInt8x32(x int8) Int8x32 {
 	var z Int8x16
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To32()
 }
 
 // BroadcastInt16x16 returns a vector with the input
@@ -109,7 +109,7 @@ func BroadcastInt8x32(x int8) Int8x32 {
 // Emulated, CPU Feature: AVX2
 func BroadcastInt16x16(x int16) Int16x16 {
 	var z Int16x8
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To16()
 }
 
 // BroadcastInt32x8 returns a vector with the input
@@ -118,7 +118,7 @@ func BroadcastInt16x16(x int16) Int16x16 {
 // Emulated, CPU Feature: AVX2
 func BroadcastInt32x8(x int32) Int32x8 {
 	var z Int32x4
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // BroadcastInt64x4 returns a vector with the input
@@ -127,7 +127,7 @@ func BroadcastInt32x8(x int32) Int32x8 {
 // Emulated, CPU Feature: AVX2
 func BroadcastInt64x4(x int64) Int64x4 {
 	var z Int64x2
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To4()
 }
 
 // BroadcastUint8x32 returns a vector with the input
@@ -136,7 +136,7 @@ func BroadcastInt64x4(x int64) Int64x4 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint8x32(x uint8) Uint8x32 {
 	var z Uint8x16
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To32()
 }
 
 // BroadcastUint16x16 returns a vector with the input
@@ -145,7 +145,7 @@ func BroadcastUint8x32(x uint8) Uint8x32 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint16x16(x uint16) Uint16x16 {
 	var z Uint16x8
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To16()
 }
 
 // BroadcastUint32x8 returns a vector with the input
@@ -154,7 +154,7 @@ func BroadcastUint16x16(x uint16) Uint16x16 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint32x8(x uint32) Uint32x8 {
 	var z Uint32x4
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // BroadcastUint64x4 returns a vector with the input
@@ -163,7 +163,7 @@ func BroadcastUint32x8(x uint32) Uint32x8 {
 // Emulated, CPU Feature: AVX2
 func BroadcastUint64x4(x uint64) Uint64x4 {
 	var z Uint64x2
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To4()
 }
 
 // BroadcastFloat32x8 returns a vector with the input
@@ -172,7 +172,7 @@ func BroadcastUint64x4(x uint64) Uint64x4 {
 // Emulated, CPU Feature: AVX2
 func BroadcastFloat32x8(x float32) Float32x8 {
 	var z Float32x4
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // BroadcastFloat64x4 returns a vector with the input
@@ -181,7 +181,7 @@ func BroadcastFloat32x8(x float32) Float32x8 {
 // Emulated, CPU Feature: AVX2
 func BroadcastFloat64x4(x float64) Float64x4 {
 	var z Float64x2
-	return z.SetElem(0, x).Broadcast256()
+	return z.SetElem(0, x).Broadcast1To4()
 }
 
 // BroadcastInt8x64 returns a vector with the input
@@ -190,7 +190,7 @@ func BroadcastFloat64x4(x float64) Float64x4 {
 // Emulated, CPU Feature: AVX512BW
 func BroadcastInt8x64(x int8) Int8x64 {
 	var z Int8x16
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To64()
 }
 
 // BroadcastInt16x32 returns a vector with the input
@@ -199,7 +199,7 @@ func BroadcastInt8x64(x int8) Int8x64 {
 // Emulated, CPU Feature: AVX512BW
 func BroadcastInt16x32(x int16) Int16x32 {
 	var z Int16x8
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To32()
 }
 
 // BroadcastInt32x16 returns a vector with the input
@@ -208,7 +208,7 @@ func BroadcastInt16x32(x int16) Int16x32 {
 // Emulated, CPU Feature: AVX512F
 func BroadcastInt32x16(x int32) Int32x16 {
 	var z Int32x4
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To16()
 }
 
 // BroadcastInt64x8 returns a vector with the input
@@ -217,7 +217,7 @@ func BroadcastInt32x16(x int32) Int32x16 {
 // Emulated, CPU Feature: AVX512F
 func BroadcastInt64x8(x int64) Int64x8 {
 	var z Int64x2
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // BroadcastUint8x64 returns a vector with the input
@@ -226,7 +226,7 @@ func BroadcastInt64x8(x int64) Int64x8 {
 // Emulated, CPU Feature: AVX512BW
 func BroadcastUint8x64(x uint8) Uint8x64 {
 	var z Uint8x16
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To64()
 }
 
 // BroadcastUint16x32 returns a vector with the input
@@ -235,7 +235,7 @@ func BroadcastUint8x64(x uint8) Uint8x64 {
 // Emulated, CPU Feature: AVX512BW
 func BroadcastUint16x32(x uint16) Uint16x32 {
 	var z Uint16x8
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To32()
 }
 
 // BroadcastUint32x16 returns a vector with the input
@@ -244,7 +244,7 @@ func BroadcastUint16x32(x uint16) Uint16x32 {
 // Emulated, CPU Feature: AVX512F
 func BroadcastUint32x16(x uint32) Uint32x16 {
 	var z Uint32x4
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To16()
 }
 
 // BroadcastUint64x8 returns a vector with the input
@@ -253,7 +253,7 @@ func BroadcastUint32x16(x uint32) Uint32x16 {
 // Emulated, CPU Feature: AVX512F
 func BroadcastUint64x8(x uint64) Uint64x8 {
 	var z Uint64x2
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // BroadcastFloat32x16 returns a vector with the input
@@ -262,7 +262,7 @@ func BroadcastUint64x8(x uint64) Uint64x8 {
 // Emulated, CPU Feature: AVX512F
 func BroadcastFloat32x16(x float32) Float32x16 {
 	var z Float32x4
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To16()
 }
 
 // BroadcastFloat64x8 returns a vector with the input
@@ -271,7 +271,7 @@ func BroadcastFloat32x16(x float32) Float32x16 {
 // Emulated, CPU Feature: AVX512F
 func BroadcastFloat64x8(x float64) Float64x8 {
 	var z Float64x2
-	return z.SetElem(0, x).Broadcast512()
+	return z.SetElem(0, x).Broadcast1To8()
 }
 
 // ToMask converts from Int8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero.
-- 
cgit v1.3


From 4b89bcb8b7141c7e4ef1a7dbb4c3f17f589d89c0 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo@golang.org>
Date: Fri, 19 Dec 2025 23:14:36 +0100
Subject: lib/fips140: freeze v1.26.0 FIPS 140-3 module

Fixes #76770

Change-Id: Ia617f01ea9be0d1759147b6cca0403c56a6a6964
Reviewed-on: https://go-review.googlesource.com/c/go/+/731840
Reviewed-by: Roland Shoemaker <roland@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
---
 lib/fips140/fips140.sum    |   2 +-
 lib/fips140/v1.1.0-rc1.zip | Bin 678896 -> 0 bytes
 lib/fips140/v1.26.0.zip    | Bin 0 -> 676132 bytes
 3 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100644 lib/fips140/v1.1.0-rc1.zip
 create mode 100644 lib/fips140/v1.26.0.zip

diff --git a/lib/fips140/fips140.sum b/lib/fips140/fips140.sum
index c4d185da73..050957af60 100644
--- a/lib/fips140/fips140.sum
+++ b/lib/fips140/fips140.sum
@@ -10,4 +10,4 @@
 #	go test cmd/go/internal/fips140 -update
 #
 v1.0.0-c2097c7c.zip daf3614e0406f67ae6323c902db3f953a1effb199142362a039e7526dfb9368b
-v1.1.0-rc1.zip ea94f8c3885294c9efe1bd8f9b6e86daeb25b6aff2aeb20707cd9a5101f6f54e
+v1.26.0.zip 9b28f847fdf1db4a36cb2b2f8ec09443c039383f085630a03ecfaddf6db7ea23
diff --git a/lib/fips140/v1.1.0-rc1.zip b/lib/fips140/v1.1.0-rc1.zip
deleted file mode 100644
index d4264bdb2e..0000000000
Binary files a/lib/fips140/v1.1.0-rc1.zip and /dev/null differ
diff --git a/lib/fips140/v1.26.0.zip b/lib/fips140/v1.26.0.zip
new file mode 100644
index 0000000000..f53ade8036
Binary files /dev/null and b/lib/fips140/v1.26.0.zip differ
-- 
cgit v1.3


From 6b2505c79cb3838c6e27cf47ac09980fe51c83c2 Mon Sep 17 00:00:00 2001
From: Neal Patel <nealpatel@google.com>
Date: Tue, 6 Jan 2026 16:09:19 -0500
Subject: cmd/go: remove user-content from doc strings in cgo ASTs.

Thank you to RyotaK (https://ryotak.net) of GMO Flatt Security Inc. for reporting this issue.

Updates golang/go#76697
Fixes CVE-2025-61732

Change-Id: I1121502f1bf1e91309eb4bd41cc3a09c39366d36
Reviewed-on: https://go-review.googlesource.com/c/go/+/734220
Reviewed-by: Agustin Hernandez <garisol1982@gmail.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Robert Griesemer <gri@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---
 src/cmd/cgo/ast.go | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/cmd/cgo/ast.go b/src/cmd/cgo/ast.go
index 2da6ca5a30..df0552f525 100644
--- a/src/cmd/cgo/ast.go
+++ b/src/cmd/cgo/ast.go
@@ -301,17 +301,12 @@ func (f *File) saveExport(x any, context astContext) {
 			error_(c.Pos(), "export comment has wrong name %q, want %q", name, n.Name.Name)
 		}
 
-		doc := ""
-		for _, c1 := range n.Doc.List {
-			if c1 != c {
-				doc += c1.Text + "\n"
-			}
-		}
-
 		f.ExpFunc = append(f.ExpFunc, &ExpFunc{
 			Func:    n,
 			ExpName: name,
-			Doc:     doc,
+			// Caution: the Doc field is deliberately left unset
+			// to ensure that there are no unintended artifacts
+			// in the binary. See https://go.dev/issue/76697.
 		})
 		break
 	}
-- 
cgit v1.3