diff options
| author | Junyang Shao <shaojunyang@google.com> | 2026-01-07 20:06:48 +0000 |
|---|---|---|
| committer | Junyang Shao <shaojunyang@google.com> | 2026-01-07 20:06:49 +0000 |
| commit | b8191a2f9893220bdbe52ecebb37e293847d98f5 (patch) | |
| tree | ffaec06811834d36737d182a65831d65cd8ce798 | |
| parent | c599a8f2385849a225d02843b3c6389dbfc5aa69 (diff) | |
| parent | f6ebd91129e13ef7f495550a4fc8fa74769f6a2d (diff) | |
| download | go-b8191a2f9893220bdbe52ecebb37e293847d98f5.tar.xz | |
[release-branch.go1.26] all: merge master (f6ebd91) into release-branch.go1.26
Merge List:
+ 2026-01-07 f6ebd91129 all: update vendored x/tools
+ 2026-01-06 d1d0fc7a97 os/exec: avoid atomic.Bool for Cmd.startCalled
+ 2026-01-05 9b2e3b9a02 simd/archsimd: use V(P)MOVMSK for mask ToBits if possible
+ 2026-01-02 f8ee0f8475 cmd/go/testdata/vcstest/git: use git commands that work on older git versions
+ 2026-01-02 b094749bad test/codegen: codify bit related code generation for arm64
+ 2026-01-02 e84983fa40 cmd/compile: optimize SIMD IsNaN.Or(IsNaN)
+ 2026-01-02 8244b85677 simd/archsimd: add tests for IsNaN
+ 2026-01-02 13440fb518 simd/archsimd: make IsNaN unary
+ 2026-01-02 c3550b3352 simd/archsimd: correct documentation of Mask types
+ 2026-01-02 34ad26341d net/rpc: correct comment for isExportedOrBuiltinType function
+ 2025-12-30 b28808d838 cmd/go/internal/modindex: fix obvious bug using failed type assertion
+ 2025-12-30 d64add4d60 simd/archsimd: adjust documentations slightly
+ 2025-12-30 1843cfbcd6 runtime/secret: make tests more sturdy
+ 2025-12-30 fd45d70799 all: fix some minor grammatical issues in the comments
+ 2025-12-30 df4e08ac65 test/codegen: fix a tab in comparisons.go to ensure pattern works
+ 2025-12-30 cd668d744f cmd/compile: disable inlining for functions using runtime.deferrangefunc
+ 2025-12-29 06eff0f7c3 simd/archsimd: add tests for Saturate-Concat operations
+ 2025-12-29 110aaf7137 simd/archsimd: add tests for Saturate operations
+ 2025-12-29 22e7b94e7f simd/archsimd: add tests for ExtendLo operations
+ 2025-12-29 76dddce293 simd/archsimd: remove redundant suffix of ExtendLo operations
+ 2025-12-29 6ecdd2fc6e simd/archsimd: add more tests for Convert operations
+ 2025-12-29 e0c99fe285 simd/archsimd: add more tests for Truncate operations
+ 2025-12-29 08369369e5 reflect: document Call/CallSlice panic when v is unexported field
+ 2025-12-29 ca8effbde1 internal/coverage/decodemeta: correct wording in unknown version error
+ 2025-12-29 0b06b68e21 encoding/gob: clarify docs about pointers to zero values not being sent
+ 2025-12-29 9cb3edbfe9 regexp: standardize error message format in find_test.go
+ 2025-12-29 b3ed0627ce tests: improve consistency and clarity of test diagnostics
+ 2025-12-29 3dcb48d298 test: follow got/want convention in uintptrescapes test
+ 2025-12-29 f7b7e94b0a test: clarify log message for surrogate UTF-8 check
+ 2025-12-29 e790d59674 simd/archsimd: add tests for Truncate operations
+ 2025-12-27 f4cec7917c cmd: fix unused errors reported by ineffassign
+ 2025-12-27 ca13fe02c4 simd/archsimd: add more tests for Convert operations
+ 2025-12-27 037c047f2c simd/archsimd: add more tests for Extend operations
+ 2025-12-26 7971fcdf53 test/codegen: tidy tests for bits
+ 2025-12-24 0f620776d7 simd/archsimd: fix "go generate" command
+ 2025-12-24 a5fe8c07ae simd/archsimd: guard test helpers with amd64 tag
+ 2025-12-23 a23d1a4ebe bytes: improve consistency in split test messages
+ 2025-12-23 866e461b96 cmd/go: update pkgsite doc command to v0.0.0-20251223195805-1a3bd3c788fe
+ 2025-12-23 08dc8393d7 time: skip test that will fail with GO111MODULE=off
+ 2025-12-23 43ebed88cc runtime: improve a log message in TestCleanupLost
+ 2025-12-23 81283ad339 runtime: fix nGsyscallNoP accounting
+ 2025-12-23 3e0e1667f6 test/codegen: codify bit related code generation for riscv64
+ 2025-12-23 3faf988f21 errors: add a test verifying join does not flatten errors
+ 2025-12-23 2485a0bc2c cmd/asm/internal/asm: run riscv64 end-to-end tests for each profile
+ 2025-12-23 8254d66eab cmd/asm/internal/asm: abort end to end test if assembly failed
+ 2025-12-23 1b3db48db7 Revert "errors: optimize errors.Join for single unwrappable errors"
+ 2025-12-23 b6b8b2fe6e cmd/compile: handle propagating an out-of-range jump table index
+ 2025-12-22 2cd0371a0a debug/pe: avoid panic in File.ImportedSymbols
+ 2025-12-22 91435be153 runtime: revert entry point on freebsd/arm64
+ 2025-12-22 c1efada1d2 simd/archsimd: correct documentation for pairwise operations
+ 2025-12-22 3d77a0b15e os/exec: second call to Cmd.Start is always an error
+ 2025-12-20 7ecb1f36ac simd/archsimd: add HasAVX2() guards to tests that need them
+ 2025-12-19 70c22e0ad7 simd/archsimd: delete DotProductQuadruple methods for now
+ 2025-12-19 42cda7c1df simd/archsimd: add Grouped for 256- and 512-bit SaturateTo(U)Int16Concat, and fix type
+ 2025-12-19 baa0ae3aaa simd/archsimd: correct type and instruction for SaturateToUint8
+ 2025-12-19 d46c58debb go/doc: link to struct fields in the same package
+ 2025-12-19 25ed6c7f9b cmd/go/internal/doc: update pkgsite version
+ 2025-12-19 4411edf972 simd/archsimd: reword documentation for some operations
+ 2025-12-19 7d9418a19c simd/archsimd: reword documentation of comparison operations
+ 2025-12-18 d00e96d3ae internal/cpu: repair VNNI feature check
+ 2025-12-18 cfc024daeb simd/archsimd: reword documentation for conversion ops
+ 2025-12-17 ad91f5d241 simd/archsimd: reword documentation of shfit operations
+ 2025-12-17 b8c4cc63e7 runtime: keep track of secret allocation size
+ 2025-12-17 8564fede89 cmd/go: remove reference to no longer existing -i flag
+ 2025-12-17 eecdb61eeb crypto: rename fips140v2.0 to fips140v1.26
+ 2025-12-17 05e41225f6 simd/archsimd: reword documentation of As methods
+ 2025-12-17 516699848b runtime/secret: warn users about allocations, loosen guarantees
+ 2025-12-16 8c28ab936a cmd/cgo: don't emit C local if it is not used
+ 2025-12-16 65b71c11d4 crypto/internal/fips140only: test fips140=only mode
+ 2025-12-16 ea1aa76554 go/doc: exclude examples with results
+ 2025-12-16 5046bdf8a6 crypto/tls: reject trailing messages after client/server hello
+ 2025-12-16 3f6eabdf09 cmd/compile: use unsigned constant when folding loads for SIMD ops with constants
+ 2025-12-16 a4b5b92055 cmd/dist: preserve existing GOEXPERIMENTs when running tests with additional experiments
Change-Id: I84ad4ceba344761142b98587c07d186cf2d638ff
143 files changed, 14940 insertions, 5317 deletions
diff --git a/src/archive/tar/reader_test.go b/src/archive/tar/reader_test.go index de3d365304..c7611ca044 100644 --- a/src/archive/tar/reader_test.go +++ b/src/archive/tar/reader_test.go @@ -787,7 +787,7 @@ type readBadSeeker struct{ io.ReadSeeker } func (rbs *readBadSeeker) Seek(int64, int) (int64, error) { return 0, fmt.Errorf("illegal seek") } -// TestReadTruncation test the ending condition on various truncated files and +// TestReadTruncation tests the ending condition on various truncated files and // that truncated files are still detected even if the underlying io.Reader // satisfies io.Seeker. func TestReadTruncation(t *testing.T) { diff --git a/src/archive/tar/stat_unix.go b/src/archive/tar/stat_unix.go index f999f56db6..891a1a3b4a 100644 --- a/src/archive/tar/stat_unix.go +++ b/src/archive/tar/stat_unix.go @@ -19,7 +19,7 @@ func init() { sysStat = statUnix } -// userMap and groupMap caches UID and GID lookups for performance reasons. +// userMap and groupMap cache UID and GID lookups for performance reasons. // The downside is that renaming uname or gname by the OS never takes effect. var userMap, groupMap sync.Map // map[int]string diff --git a/src/archive/tar/strconv.go b/src/archive/tar/strconv.go index 217efe9e2e..d3c28a8c4e 100644 --- a/src/archive/tar/strconv.go +++ b/src/archive/tar/strconv.go @@ -312,7 +312,7 @@ func formatPAXRecord(k, v string) (string, error) { // "%d %s=%s\n" % (size, key, value) // // Keys and values should be UTF-8, but the number of bad writers out there -// forces us to be a more liberal. +// forces us to be more liberal. // Thus, we only reject all keys with NUL, and only reject NULs in values // for the PAX version of the USTAR string fields. // The key must not contain an '=' character. 
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go index 9547ede312..891aef2c8b 100644 --- a/src/bytes/bytes_test.go +++ b/src/bytes/bytes_test.go @@ -961,7 +961,7 @@ func TestSplit(t *testing.T) { if tt.n < 0 { b := sliceOfString(Split([]byte(tt.s), []byte(tt.sep))) if !slices.Equal(result, b) { - t.Errorf("Split disagrees withSplitN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) + t.Errorf("Split disagrees with SplitN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) } } if len(a) > 0 { @@ -1023,7 +1023,7 @@ func TestSplitAfter(t *testing.T) { if tt.n < 0 { b := sliceOfString(SplitAfter([]byte(tt.s), []byte(tt.sep))) if !slices.Equal(result, b) { - t.Errorf("SplitAfter disagrees withSplitAfterN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) + t.Errorf("SplitAfter disagrees with SplitAfterN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) } } } diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go index e53263356d..28dce50d60 100644 --- a/src/cmd/asm/internal/asm/endtoend_test.go +++ b/src/cmd/asm/internal/asm/endtoend_test.go @@ -199,6 +199,11 @@ Diff: } obj.Flushplist(ctxt, pList, nil) + if !ok { + // If we've encountered errors, the output is unlikely to be sane. 
+ t.FailNow() + } + for p := top; p != nil; p = p.Link { if p.As == obj.ATEXT { text = p.From.Sym @@ -486,16 +491,35 @@ func TestPPC64EndToEnd(t *testing.T) { } } -func TestRISCVEndToEnd(t *testing.T) { - testEndToEnd(t, "riscv64", "riscv64") +func testRISCV64AllProfiles(t *testing.T, testFn func(t *testing.T)) { + t.Helper() + + defer func(orig int) { buildcfg.GORISCV64 = orig }(buildcfg.GORISCV64) + + for _, goriscv64 := range []int{20, 22, 23} { + t.Run(fmt.Sprintf("rva%vu64", goriscv64), func(t *testing.T) { + buildcfg.GORISCV64 = goriscv64 + testFn(t) + }) + } +} + +func TestRISCV64EndToEnd(t *testing.T) { + testRISCV64AllProfiles(t, func(t *testing.T) { + testEndToEnd(t, "riscv64", "riscv64") + }) } -func TestRISCVErrors(t *testing.T) { - testErrors(t, "riscv64", "riscv64error") +func TestRISCV64Errors(t *testing.T) { + testRISCV64AllProfiles(t, func(t *testing.T) { + testErrors(t, "riscv64", "riscv64error") + }) } -func TestRISCVValidation(t *testing.T) { - testErrors(t, "riscv64", "riscv64validation") +func TestRISCV64Validation(t *testing.T) { + testRISCV64AllProfiles(t, func(t *testing.T) { + testErrors(t, "riscv64", "riscv64validation") + }) } func TestS390XEndToEnd(t *testing.T) { diff --git a/src/cmd/cgo/internal/test/issue76861.go b/src/cmd/cgo/internal/test/issue76861.go new file mode 100644 index 0000000000..225e2acc3f --- /dev/null +++ b/src/cmd/cgo/internal/test/issue76861.go @@ -0,0 +1,12 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build cgo + +package cgotest + +// Issue 43639: No runtime test needed, make sure package +// cmd/cgo/internal/test/issue76861 compiles without error. 
+ +import _ "cmd/cgo/internal/test/issue76861" diff --git a/src/cmd/cgo/internal/test/issue76861/a.go b/src/cmd/cgo/internal/test/issue76861/a.go new file mode 100644 index 0000000000..18a7bda490 --- /dev/null +++ b/src/cmd/cgo/internal/test/issue76861/a.go @@ -0,0 +1,13 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package issue76861 + +// #cgo CFLAGS: -Wall -Werror +// void issue76861(void) {} +import "C" + +func Issue76861() { + C.issue76861() +} diff --git a/src/cmd/cgo/out.go b/src/cmd/cgo/out.go index dc1e5b29e5..ac2ce8fd0d 100644 --- a/src/cmd/cgo/out.go +++ b/src/cmd/cgo/out.go @@ -783,13 +783,13 @@ func (p *Package) writeOutputFunc(fgcc *os.File, n *Name) { // We're trying to write a gcc struct that matches gc's layout. // Use packed attribute to force no padding in this struct in case // gcc has different packing requirements. - fmt.Fprintf(fgcc, "\t%s %v *_cgo_a = v;\n", ctype, p.packedAttribute()) - if n.FuncType.Result != nil { - // Save the stack top for use below. - fmt.Fprintf(fgcc, "\tchar *_cgo_stktop = _cgo_topofstack();\n") - } tr := n.FuncType.Result + if (n.Kind != "macro" && len(n.FuncType.Params) > 0) || tr != nil { + fmt.Fprintf(fgcc, "\t%s %v *_cgo_a = v;\n", ctype, p.packedAttribute()) + } if tr != nil { + // Save the stack top for use below. + fmt.Fprintf(fgcc, "\tchar *_cgo_stktop = _cgo_topofstack();\n") fmt.Fprintf(fgcc, "\t__typeof__(_cgo_a->r) _cgo_r;\n") } fmt.Fprintf(fgcc, "\t_cgo_tsan_acquire();\n") @@ -819,7 +819,7 @@ func (p *Package) writeOutputFunc(fgcc *os.File, n *Name) { fmt.Fprintf(fgcc, "\t_cgo_errno = errno;\n") } fmt.Fprintf(fgcc, "\t_cgo_tsan_release();\n") - if n.FuncType.Result != nil { + if tr != nil { // The cgo call may have caused a stack copy (via a callback). // Adjust the return value pointer appropriately. 
fmt.Fprintf(fgcc, "\t_cgo_a = (void*)((char*)_cgo_a + (_cgo_topofstack() - _cgo_stktop));\n") diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index f6deba3ec1..c4d0fd69c6 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. package amd64 @@ -175,7 +175,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQD128_128, ssa.OpAMD64VPMOVSQD128_256, ssa.OpAMD64VPMOVSQD256, + ssa.OpAMD64VPMOVUSWB128_128, + ssa.OpAMD64VPMOVUSWB128_256, ssa.OpAMD64VPMOVUSWB256, + ssa.OpAMD64VPMOVUSDB128_128, + ssa.OpAMD64VPMOVUSDB128_256, + ssa.OpAMD64VPMOVUSDB128_512, + ssa.OpAMD64VPMOVUSQB128_128, + ssa.OpAMD64VPMOVUSQB128_256, + ssa.OpAMD64VPMOVUSQB128_512, ssa.OpAMD64VPMOVUSDW128_128, ssa.OpAMD64VPMOVUSDW128_256, ssa.OpAMD64VPMOVUSDW256, @@ -242,12 +250,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPADDQ256, ssa.OpAMD64VPADDQ512, ssa.OpAMD64VHADDPS128, - ssa.OpAMD64VHADDPS256, ssa.OpAMD64VHADDPD128, - ssa.OpAMD64VHADDPD256, ssa.OpAMD64VPHADDW128, - ssa.OpAMD64VPHADDW256, ssa.OpAMD64VPHADDD128, + ssa.OpAMD64VHADDPS256, + ssa.OpAMD64VHADDPD256, + ssa.OpAMD64VPHADDW256, ssa.OpAMD64VPHADDD256, ssa.OpAMD64VPHADDSW128, ssa.OpAMD64VPHADDSW256, @@ -512,12 +520,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSUBQ256, ssa.OpAMD64VPSUBQ512, ssa.OpAMD64VHSUBPS128, - ssa.OpAMD64VHSUBPS256, ssa.OpAMD64VHSUBPD128, - ssa.OpAMD64VHSUBPD256, ssa.OpAMD64VPHSUBW128, - ssa.OpAMD64VPHSUBW256, ssa.OpAMD64VPHSUBD128, + ssa.OpAMD64VHSUBPS256, + ssa.OpAMD64VHSUBPD256, + ssa.OpAMD64VPHSUBW256, ssa.OpAMD64VPHSUBD256, 
ssa.OpAMD64VPHSUBSW128, ssa.OpAMD64VPHSUBSW256, @@ -731,12 +739,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORVQMasked128, ssa.OpAMD64VPRORVQMasked256, ssa.OpAMD64VPRORVQMasked512, - ssa.OpAMD64VPACKSSDWMasked128, ssa.OpAMD64VPACKSSDWMasked256, ssa.OpAMD64VPACKSSDWMasked512, - ssa.OpAMD64VPACKUSDWMasked128, + ssa.OpAMD64VPACKSSDWMasked128, ssa.OpAMD64VPACKUSDWMasked256, ssa.OpAMD64VPACKUSDWMasked512, + ssa.OpAMD64VPACKUSDWMasked128, ssa.OpAMD64VSCALEFPSMasked128, ssa.OpAMD64VSCALEFPSMasked256, ssa.OpAMD64VSCALEFPSMasked512, @@ -1010,7 +1018,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQDMasked128_128, ssa.OpAMD64VPMOVSQDMasked128_256, ssa.OpAMD64VPMOVSQDMasked256, + ssa.OpAMD64VPMOVUSWBMasked128_128, + ssa.OpAMD64VPMOVUSWBMasked128_256, ssa.OpAMD64VPMOVUSWBMasked256, + ssa.OpAMD64VPMOVUSDBMasked128_128, + ssa.OpAMD64VPMOVUSDBMasked128_256, + ssa.OpAMD64VPMOVUSDBMasked128_512, + ssa.OpAMD64VPMOVUSQBMasked128_128, + ssa.OpAMD64VPMOVUSQBMasked128_256, + ssa.OpAMD64VPMOVUSQBMasked128_512, ssa.OpAMD64VPMOVUSDWMasked128_128, ssa.OpAMD64VPMOVUSDWMasked128_256, ssa.OpAMD64VPMOVUSDWMasked256, @@ -1308,12 +1324,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMI2Q256, ssa.OpAMD64VPERMI2PD512, ssa.OpAMD64VPERMI2Q512, - ssa.OpAMD64VPDPBUSD128, - ssa.OpAMD64VPDPBUSD256, - ssa.OpAMD64VPDPBUSD512, - ssa.OpAMD64VPDPBUSDS128, - ssa.OpAMD64VPDPBUSDS256, - ssa.OpAMD64VPDPBUSDS512, ssa.OpAMD64VFMADD213PS128, ssa.OpAMD64VFMADD213PS256, ssa.OpAMD64VFMADD213PS512, @@ -1430,12 +1440,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMADDUBSWMasked128Merging, ssa.OpAMD64VPMADDUBSWMasked256Merging, ssa.OpAMD64VPMADDUBSWMasked512Merging, - ssa.OpAMD64VPDPBUSDMasked128, - ssa.OpAMD64VPDPBUSDMasked256, - ssa.OpAMD64VPDPBUSDMasked512, - ssa.OpAMD64VPDPBUSDSMasked128, - ssa.OpAMD64VPDPBUSDSMasked256, - ssa.OpAMD64VPDPBUSDSMasked512, 
ssa.OpAMD64VGF2P8MULBMasked128Merging, ssa.OpAMD64VGF2P8MULBMasked256Merging, ssa.OpAMD64VGF2P8MULBMasked512Merging, @@ -1559,12 +1563,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORVQMasked128Merging, ssa.OpAMD64VPRORVQMasked256Merging, ssa.OpAMD64VPRORVQMasked512Merging, - ssa.OpAMD64VPACKSSDWMasked128Merging, ssa.OpAMD64VPACKSSDWMasked256Merging, ssa.OpAMD64VPACKSSDWMasked512Merging, - ssa.OpAMD64VPACKUSDWMasked128Merging, + ssa.OpAMD64VPACKSSDWMasked128Merging, ssa.OpAMD64VPACKUSDWMasked256Merging, ssa.OpAMD64VPACKUSDWMasked512Merging, + ssa.OpAMD64VPACKUSDWMasked128Merging, ssa.OpAMD64VSCALEFPSMasked128Merging, ssa.OpAMD64VSCALEFPSMasked256Merging, ssa.OpAMD64VSCALEFPSMasked512Merging, @@ -1955,8 +1959,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMI2Q256load, ssa.OpAMD64VPERMI2PD512load, ssa.OpAMD64VPERMI2Q512load, - ssa.OpAMD64VPDPBUSD512load, - ssa.OpAMD64VPDPBUSDS512load, ssa.OpAMD64VFMADD213PS128load, ssa.OpAMD64VFMADD213PS256load, ssa.OpAMD64VFMADD213PS512load, @@ -2004,12 +2006,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMI2QMasked256load, ssa.OpAMD64VPERMI2PDMasked512load, ssa.OpAMD64VPERMI2QMasked512load, - ssa.OpAMD64VPDPBUSDMasked128load, - ssa.OpAMD64VPDPBUSDMasked256load, - ssa.OpAMD64VPDPBUSDMasked512load, - ssa.OpAMD64VPDPBUSDSMasked128load, - ssa.OpAMD64VPDPBUSDSMasked256load, - ssa.OpAMD64VPDPBUSDSMasked512load, ssa.OpAMD64VFMADD213PSMasked128load, ssa.OpAMD64VFMADD213PSMasked256load, ssa.OpAMD64VFMADD213PSMasked512load, @@ -2146,12 +2142,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORVQMasked128load, ssa.OpAMD64VPRORVQMasked256load, ssa.OpAMD64VPRORVQMasked512load, - ssa.OpAMD64VPACKSSDWMasked128load, ssa.OpAMD64VPACKSSDWMasked256load, ssa.OpAMD64VPACKSSDWMasked512load, - ssa.OpAMD64VPACKUSDWMasked128load, + ssa.OpAMD64VPACKSSDWMasked128load, ssa.OpAMD64VPACKUSDWMasked256load, 
ssa.OpAMD64VPACKUSDWMasked512load, + ssa.OpAMD64VPACKUSDWMasked128load, ssa.OpAMD64VSCALEFPSMasked128load, ssa.OpAMD64VSCALEFPSMasked256load, ssa.OpAMD64VSCALEFPSMasked512load, @@ -2638,7 +2634,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQDMasked128_128Merging, ssa.OpAMD64VPMOVSQDMasked128_256Merging, ssa.OpAMD64VPMOVSQDMasked256Merging, + ssa.OpAMD64VPMOVUSWBMasked128_128Merging, + ssa.OpAMD64VPMOVUSWBMasked128_256Merging, ssa.OpAMD64VPMOVUSWBMasked256Merging, + ssa.OpAMD64VPMOVUSDBMasked128_128Merging, + ssa.OpAMD64VPMOVUSDBMasked128_256Merging, + ssa.OpAMD64VPMOVUSDBMasked128_512Merging, + ssa.OpAMD64VPMOVUSQBMasked128_128Merging, + ssa.OpAMD64VPMOVUSQBMasked128_256Merging, + ssa.OpAMD64VPMOVUSQBMasked128_512Merging, ssa.OpAMD64VPMOVUSDWMasked128_128Merging, ssa.OpAMD64VPMOVUSDWMasked128_256Merging, ssa.OpAMD64VPMOVUSDWMasked256Merging, @@ -3021,18 +3025,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMADDUBSWMasked128, ssa.OpAMD64VPMADDUBSWMasked256, ssa.OpAMD64VPMADDUBSWMasked512, - ssa.OpAMD64VPDPBUSDMasked128, - ssa.OpAMD64VPDPBUSDMasked128load, - ssa.OpAMD64VPDPBUSDMasked256, - ssa.OpAMD64VPDPBUSDMasked256load, - ssa.OpAMD64VPDPBUSDMasked512, - ssa.OpAMD64VPDPBUSDMasked512load, - ssa.OpAMD64VPDPBUSDSMasked128, - ssa.OpAMD64VPDPBUSDSMasked128load, - ssa.OpAMD64VPDPBUSDSMasked256, - ssa.OpAMD64VPDPBUSDSMasked256load, - ssa.OpAMD64VPDPBUSDSMasked512, - ssa.OpAMD64VPDPBUSDSMasked512load, ssa.OpAMD64VEXPANDPSMasked128, ssa.OpAMD64VEXPANDPSMasked256, ssa.OpAMD64VEXPANDPSMasked512, @@ -3415,12 +3407,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQBMasked128_128, ssa.OpAMD64VPMOVSQBMasked128_256, ssa.OpAMD64VPMOVSQBMasked128_512, - ssa.OpAMD64VPACKSSDWMasked128, - ssa.OpAMD64VPACKSSDWMasked128load, ssa.OpAMD64VPACKSSDWMasked256, ssa.OpAMD64VPACKSSDWMasked256load, ssa.OpAMD64VPACKSSDWMasked512, ssa.OpAMD64VPACKSSDWMasked512load, + ssa.OpAMD64VPACKSSDWMasked128, + 
ssa.OpAMD64VPACKSSDWMasked128load, ssa.OpAMD64VPMOVSDWMasked128_128, ssa.OpAMD64VPMOVSDWMasked128_256, ssa.OpAMD64VPMOVSDWMasked256, @@ -3430,13 +3422,21 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQDMasked128_128, ssa.OpAMD64VPMOVSQDMasked128_256, ssa.OpAMD64VPMOVSQDMasked256, + ssa.OpAMD64VPMOVUSWBMasked128_128, + ssa.OpAMD64VPMOVUSWBMasked128_256, ssa.OpAMD64VPMOVUSWBMasked256, - ssa.OpAMD64VPACKUSDWMasked128, - ssa.OpAMD64VPACKUSDWMasked128load, + ssa.OpAMD64VPMOVUSDBMasked128_128, + ssa.OpAMD64VPMOVUSDBMasked128_256, + ssa.OpAMD64VPMOVUSDBMasked128_512, + ssa.OpAMD64VPMOVUSQBMasked128_128, + ssa.OpAMD64VPMOVUSQBMasked128_256, + ssa.OpAMD64VPMOVUSQBMasked128_512, ssa.OpAMD64VPACKUSDWMasked256, ssa.OpAMD64VPACKUSDWMasked256load, ssa.OpAMD64VPACKUSDWMasked512, ssa.OpAMD64VPACKUSDWMasked512load, + ssa.OpAMD64VPACKUSDWMasked128, + ssa.OpAMD64VPACKUSDWMasked128load, ssa.OpAMD64VPMOVUSDWMasked128_128, ssa.OpAMD64VPMOVUSDWMasked128_256, ssa.OpAMD64VPMOVUSDWMasked256, diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 5ddcb84c59..e9a566d759 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -1845,7 +1845,13 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { ssa.OpAMD64VPMOVVec32x16ToM, ssa.OpAMD64VPMOVVec64x2ToM, ssa.OpAMD64VPMOVVec64x4ToM, - ssa.OpAMD64VPMOVVec64x8ToM: + ssa.OpAMD64VPMOVVec64x8ToM, + ssa.OpAMD64VPMOVMSKB128, + ssa.OpAMD64VPMOVMSKB256, + ssa.OpAMD64VMOVMSKPS128, + ssa.OpAMD64VMOVMSKPS256, + ssa.OpAMD64VMOVMSKPD128, + ssa.OpAMD64VMOVMSKPD256: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = simdReg(v.Args[0]) diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go index 33f9c325c3..4fa9cf07fb 100644 --- a/src/cmd/compile/internal/inline/inl.go +++ b/src/cmd/compile/internal/inline/inl.go @@ -516,6 +516,9 @@ opSwitch: break opSwitch case "panicrangestate": cheap = true + case 
"deferrangefunc": + v.reason = "defer call in range func" + return true } } } diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 353d272179..b49e85b53c 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1679,21 +1679,21 @@ (Cvt8toMask64x8 <t> x) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVBk <t> x)) // masks to integers -(CvtMask8x16to16 <t> x) => (KMOVWi <t> (VPMOVVec8x16ToM <types.TypeMask> x)) -(CvtMask8x32to32 <t> x) => (KMOVDi <t> (VPMOVVec8x32ToM <types.TypeMask> x)) -(CvtMask8x64to64 <t> x) => (KMOVQi <t> (VPMOVVec8x64ToM <types.TypeMask> x)) +(CvtMask8x16to16 ...) => (VPMOVMSKB128 ...) +(CvtMask8x32to32 ...) => (VPMOVMSKB256 ...) +(CvtMask8x64to64 x) => (KMOVQi (VPMOVVec8x64ToM <types.TypeMask> x)) -(CvtMask16x8to8 <t> x) => (KMOVBi <t> (VPMOVVec16x8ToM <types.TypeMask> x)) -(CvtMask16x16to16 <t> x) => (KMOVWi <t> (VPMOVVec16x16ToM <types.TypeMask> x)) -(CvtMask16x32to32 <t> x) => (KMOVDi <t> (VPMOVVec16x32ToM <types.TypeMask> x)) +(CvtMask16x8to8 x) => (KMOVBi (VPMOVVec16x8ToM <types.TypeMask> x)) +(CvtMask16x16to16 x) => (KMOVWi (VPMOVVec16x16ToM <types.TypeMask> x)) +(CvtMask16x32to32 x) => (KMOVDi (VPMOVVec16x32ToM <types.TypeMask> x)) -(CvtMask32x4to8 <t> x) => (KMOVBi <t> (VPMOVVec32x4ToM <types.TypeMask> x)) -(CvtMask32x8to8 <t> x) => (KMOVBi <t> (VPMOVVec32x8ToM <types.TypeMask> x)) -(CvtMask32x16to16 <t> x) => (KMOVWi <t> (VPMOVVec32x16ToM <types.TypeMask> x)) +(CvtMask32x4to8 ...) => (VMOVMSKPS128 ...) +(CvtMask32x8to8 ...) => (VMOVMSKPS256 ...) +(CvtMask32x16to16 x) => (KMOVWi (VPMOVVec32x16ToM <types.TypeMask> x)) -(CvtMask64x2to8 <t> x) => (KMOVBi <t> (VPMOVVec64x2ToM <types.TypeMask> x)) -(CvtMask64x4to8 <t> x) => (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x)) -(CvtMask64x8to8 <t> x) => (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x)) +(CvtMask64x2to8 ...) => (VMOVMSKPD128 ...) +(CvtMask64x4to8 ...) 
=> (VMOVMSKPD256 ...) +(CvtMask64x8to8 x) => (KMOVBi (VPMOVVec64x8ToM <types.TypeMask> x)) // optimizations (MOVBstore [off] {sym} ptr (KMOVBi mask) mem) => (KMOVBstore [off] {sym} ptr mask mem) @@ -1730,6 +1730,13 @@ // Misc (IsZeroVec x) => (SETEQ (VPTEST x x)) +(IsNaNFloat32x4 x) => (VCMPPS128 [3] x x) +(IsNaNFloat32x8 x) => (VCMPPS256 [3] x x) +(IsNaNFloat32x16 x) => (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) +(IsNaNFloat64x2 x) => (VCMPPD128 [3] x x) +(IsNaNFloat64x4 x) => (VCMPPD256 [3] x x) +(IsNaNFloat64x8 x) => (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) + // SIMD vector K-masked loads and stores (LoadMasked64 <t> ptr mask mem) && t.Size() == 64 => (VPMASK64load512 ptr (VPMOVVec64x8ToM <types.TypeMask> mask) mem) @@ -1818,10 +1825,10 @@ (EQ (VPTEST x:(VPANDN(128|256) j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order (EQ (VPTEST x:(VPANDN(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order -// DotProductQuadruple optimizations -(VPADDD128 (VPDPBUSD128 (Zero128 <t>) x y) z) => (VPDPBUSD128 <t> z x y) -(VPADDD256 (VPDPBUSD256 (Zero256 <t>) x y) z) => (VPDPBUSD256 <t> z x y) -(VPADDD512 (VPDPBUSD512 (Zero512 <t>) x y) z) => (VPDPBUSD512 <t> z x y) -(VPADDD128 (VPDPBUSDS128 (Zero128 <t>) x y) z) => (VPDPBUSDS128 <t> z x y) -(VPADDD256 (VPDPBUSDS256 (Zero256 <t>) x y) z) => (VPDPBUSDS256 <t> z x y) -(VPADDD512 (VPDPBUSDS512 (Zero512 <t>) x y) z) => (VPDPBUSDS512 <t> z x y)
\ No newline at end of file +// optimize x.IsNaN().Or(y.IsNaN()) +(VPOR128 (VCMPP(S|D)128 [3] x x) (VCMPP(S|D)128 [3] y y)) => (VCMPP(S|D)128 [3] x y) +(VPOR256 (VCMPP(S|D)256 [3] x x) (VCMPP(S|D)256 [3] y y)) => (VCMPP(S|D)256 [3] x y) +(VPORD512 (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) (VPMOVMToVec32x16 (VCMPPS512 [3] y y))) => + (VPMOVMToVec32x16 (VCMPPS512 [3] x y)) +(VPORD512 (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) (VPMOVMToVec64x8 (VCMPPD512 [3] y y))) => + (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index 2fb4fdfc96..b13eb5aa21 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -1368,6 +1368,7 @@ func init() { {name: "VPMASK64load512", argLength: 3, reg: vloadk, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=k mask, arg2 = mem {name: "VPMASK64store512", argLength: 4, reg: vstorek, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem + // AVX512 moves between int-vector and mask registers {name: "VPMOVMToVec8x16", argLength: 1, reg: kv, asm: "VPMOVM2B"}, {name: "VPMOVMToVec8x32", argLength: 1, reg: kv, asm: "VPMOVM2B"}, {name: "VPMOVMToVec8x64", argLength: 1, reg: kw, asm: "VPMOVM2B"}, @@ -1400,6 +1401,14 @@ func init() { {name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"}, {name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"}, + // AVX1/2 moves from int-vector to bitmask (extracting sign bits) + {name: "VPMOVMSKB128", argLength: 1, reg: vgp, asm: "VPMOVMSKB"}, + {name: "VPMOVMSKB256", argLength: 1, reg: vgp, asm: "VPMOVMSKB"}, + {name: "VMOVMSKPS128", argLength: 1, reg: vgp, asm: "VMOVMSKPS"}, + {name: "VMOVMSKPS256", argLength: 1, reg: vgp, asm: "VMOVMSKPS"}, + {name: "VMOVMSKPD128", argLength: 1, reg: vgp, asm: "VMOVMSKPD"}, + 
{name: "VMOVMSKPD256", argLength: 1, reg: vgp, asm: "VMOVMSKPD"}, + // X15 is the zero register up to 128-bit. For larger values, we zero it on the fly. {name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true}, {name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"}, diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index 8637133e5f..85bde1aab2 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -715,6 +715,14 @@ var genericOps = []opData{ // Returns true if arg0 is all zero. {name: "IsZeroVec", argLength: 1}, + + // Returns a mask indicating whether arg0's elements are NaN. + {name: "IsNaNFloat32x4", argLength: 1}, + {name: "IsNaNFloat32x8", argLength: 1}, + {name: "IsNaNFloat32x16", argLength: 1}, + {name: "IsNaNFloat64x2", argLength: 1}, + {name: "IsNaNFloat64x4", argLength: 1}, + {name: "IsNaNFloat64x8", argLength: 1}, } // kind controls successors implicit exit diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 649940497c..5c83f39a1f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. (AESDecryptLastRoundUint8x16 ...) => (VAESDECLAST128 ...) (AESDecryptLastRoundUint8x32 ...) => (VAESDECLAST256 ...) @@ -57,19 +57,19 @@ (AddUint64x4 ...) => (VPADDQ256 ...) (AddUint64x8 ...) => (VPADDQ512 ...) (AddPairsFloat32x4 ...) => (VHADDPS128 ...) -(AddPairsFloat32x8 ...) => (VHADDPS256 ...) (AddPairsFloat64x2 ...) => (VHADDPD128 ...) -(AddPairsFloat64x4 ...) => (VHADDPD256 ...) (AddPairsInt16x8 ...) 
=> (VPHADDW128 ...) -(AddPairsInt16x16 ...) => (VPHADDW256 ...) (AddPairsInt32x4 ...) => (VPHADDD128 ...) -(AddPairsInt32x8 ...) => (VPHADDD256 ...) (AddPairsUint16x8 ...) => (VPHADDW128 ...) -(AddPairsUint16x16 ...) => (VPHADDW256 ...) (AddPairsUint32x4 ...) => (VPHADDD128 ...) -(AddPairsUint32x8 ...) => (VPHADDD256 ...) +(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...) +(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...) +(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...) +(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...) +(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...) +(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...) (AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...) -(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...) +(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...) (AddSaturatedInt8x16 ...) => (VPADDSB128 ...) (AddSaturatedInt8x32 ...) => (VPADDSB256 ...) (AddSaturatedInt8x64 ...) => (VPADDSB512 ...) @@ -316,12 +316,6 @@ (DotProductPairsSaturatedUint8x16 ...) => (VPMADDUBSW128 ...) (DotProductPairsSaturatedUint8x32 ...) => (VPMADDUBSW256 ...) (DotProductPairsSaturatedUint8x64 ...) => (VPMADDUBSW512 ...) -(DotProductQuadrupleInt32x4 ...) => (VPDPBUSD128 ...) -(DotProductQuadrupleInt32x8 ...) => (VPDPBUSD256 ...) -(DotProductQuadrupleInt32x16 ...) => (VPDPBUSD512 ...) -(DotProductQuadrupleSaturatedInt32x4 ...) => (VPDPBUSDS128 ...) -(DotProductQuadrupleSaturatedInt32x8 ...) => (VPDPBUSDS256 ...) -(DotProductQuadrupleSaturatedInt32x16 ...) => (VPDPBUSDS512 ...) (EqualFloat32x4 x y) => (VCMPPS128 [0] x y) (EqualFloat32x8 x y) => (VCMPPS256 [0] x y) (EqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [0] x y)) @@ -382,26 +376,26 @@ (ExpandUint64x2 x mask) => (VPEXPANDQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask)) (ExpandUint64x4 x mask) => (VPEXPANDQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask)) (ExpandUint64x8 x mask) => (VPEXPANDQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask)) -(ExtendLo2ToInt64x2Int8x16 ...) 
=> (VPMOVSXBQ128 ...) -(ExtendLo2ToInt64x2Int16x8 ...) => (VPMOVSXWQ128 ...) -(ExtendLo2ToInt64x2Int32x4 ...) => (VPMOVSXDQ128 ...) -(ExtendLo2ToUint64x2Uint8x16 ...) => (VPMOVZXBQ128 ...) -(ExtendLo2ToUint64x2Uint16x8 ...) => (VPMOVZXWQ128 ...) -(ExtendLo2ToUint64x2Uint32x4 ...) => (VPMOVZXDQ128 ...) -(ExtendLo4ToInt32x4Int8x16 ...) => (VPMOVSXBD128 ...) -(ExtendLo4ToInt32x4Int16x8 ...) => (VPMOVSXWD128 ...) -(ExtendLo4ToInt64x4Int8x16 ...) => (VPMOVSXBQ256 ...) -(ExtendLo4ToInt64x4Int16x8 ...) => (VPMOVSXWQ256 ...) -(ExtendLo4ToUint32x4Uint8x16 ...) => (VPMOVZXBD128 ...) -(ExtendLo4ToUint32x4Uint16x8 ...) => (VPMOVZXWD128 ...) -(ExtendLo4ToUint64x4Uint8x16 ...) => (VPMOVZXBQ256 ...) -(ExtendLo4ToUint64x4Uint16x8 ...) => (VPMOVZXWQ256 ...) -(ExtendLo8ToInt16x8Int8x16 ...) => (VPMOVSXBW128 ...) -(ExtendLo8ToInt32x8Int8x16 ...) => (VPMOVSXBD256 ...) -(ExtendLo8ToInt64x8Int8x16 ...) => (VPMOVSXBQ512 ...) -(ExtendLo8ToUint16x8Uint8x16 ...) => (VPMOVZXBW128 ...) -(ExtendLo8ToUint32x8Uint8x16 ...) => (VPMOVZXBD256 ...) -(ExtendLo8ToUint64x8Uint8x16 ...) => (VPMOVZXBQ512 ...) +(ExtendLo2ToInt64Int8x16 ...) => (VPMOVSXBQ128 ...) +(ExtendLo2ToInt64Int16x8 ...) => (VPMOVSXWQ128 ...) +(ExtendLo2ToInt64Int32x4 ...) => (VPMOVSXDQ128 ...) +(ExtendLo2ToUint64Uint8x16 ...) => (VPMOVZXBQ128 ...) +(ExtendLo2ToUint64Uint16x8 ...) => (VPMOVZXWQ128 ...) +(ExtendLo2ToUint64Uint32x4 ...) => (VPMOVZXDQ128 ...) +(ExtendLo4ToInt32Int8x16 ...) => (VPMOVSXBD128 ...) +(ExtendLo4ToInt32Int16x8 ...) => (VPMOVSXWD128 ...) +(ExtendLo4ToInt64Int8x16 ...) => (VPMOVSXBQ256 ...) +(ExtendLo4ToInt64Int16x8 ...) => (VPMOVSXWQ256 ...) +(ExtendLo4ToUint32Uint8x16 ...) => (VPMOVZXBD128 ...) +(ExtendLo4ToUint32Uint16x8 ...) => (VPMOVZXWD128 ...) +(ExtendLo4ToUint64Uint8x16 ...) => (VPMOVZXBQ256 ...) +(ExtendLo4ToUint64Uint16x8 ...) => (VPMOVZXWQ256 ...) +(ExtendLo8ToInt16Int8x16 ...) => (VPMOVSXBW128 ...) +(ExtendLo8ToInt32Int8x16 ...) => (VPMOVSXBD256 ...) +(ExtendLo8ToInt64Int8x16 ...) 
=> (VPMOVSXBQ512 ...) +(ExtendLo8ToUint16Uint8x16 ...) => (VPMOVZXBW128 ...) +(ExtendLo8ToUint32Uint8x16 ...) => (VPMOVZXBD256 ...) +(ExtendLo8ToUint64Uint8x16 ...) => (VPMOVZXBQ512 ...) (ExtendToInt16Int8x16 ...) => (VPMOVSXBW256 ...) (ExtendToInt16Int8x32 ...) => (VPMOVSXBW512 ...) (ExtendToInt32Int8x16 ...) => (VPMOVSXBD512 ...) @@ -565,12 +559,6 @@ (InterleaveLoGroupedUint32x16 ...) => (VPUNPCKLDQ512 ...) (InterleaveLoGroupedUint64x4 ...) => (VPUNPCKLQDQ256 ...) (InterleaveLoGroupedUint64x8 ...) => (VPUNPCKLQDQ512 ...) -(IsNanFloat32x4 x y) => (VCMPPS128 [3] x y) -(IsNanFloat32x8 x y) => (VCMPPS256 [3] x y) -(IsNanFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [3] x y)) -(IsNanFloat64x2 x y) => (VCMPPD128 [3] x y) -(IsNanFloat64x4 x y) => (VCMPPD256 [3] x y) -(IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) (LeadingZerosInt32x4 ...) => (VPLZCNTD128 ...) (LeadingZerosInt32x8 ...) => (VPLZCNTD256 ...) (LeadingZerosInt32x16 ...) => (VPLZCNTD512 ...) @@ -914,29 +902,29 @@ (SaturateToInt16Int64x4 ...) => (VPMOVSQW128_256 ...) (SaturateToInt16Int64x8 ...) => (VPMOVSQW128_512 ...) (SaturateToInt16ConcatInt32x4 ...) => (VPACKSSDW128 ...) -(SaturateToInt16ConcatInt32x8 ...) => (VPACKSSDW256 ...) -(SaturateToInt16ConcatInt32x16 ...) => (VPACKSSDW512 ...) +(SaturateToInt16ConcatGroupedInt32x8 ...) => (VPACKSSDW256 ...) +(SaturateToInt16ConcatGroupedInt32x16 ...) => (VPACKSSDW512 ...) (SaturateToInt32Int64x2 ...) => (VPMOVSQD128_128 ...) (SaturateToInt32Int64x4 ...) => (VPMOVSQD128_256 ...) (SaturateToInt32Int64x8 ...) => (VPMOVSQD256 ...) -(SaturateToUint8Int16x8 ...) => (VPMOVSWB128_128 ...) -(SaturateToUint8Int16x16 ...) => (VPMOVSWB128_256 ...) -(SaturateToUint8Int32x4 ...) => (VPMOVSDB128_128 ...) -(SaturateToUint8Int32x8 ...) => (VPMOVSDB128_256 ...) -(SaturateToUint8Int32x16 ...) => (VPMOVSDB128_512 ...) -(SaturateToUint8Int64x2 ...) => (VPMOVSQB128_128 ...) -(SaturateToUint8Int64x4 ...) => (VPMOVSQB128_256 ...) -(SaturateToUint8Int64x8 ...) 
=> (VPMOVSQB128_512 ...) +(SaturateToUint8Uint16x8 ...) => (VPMOVUSWB128_128 ...) +(SaturateToUint8Uint16x16 ...) => (VPMOVUSWB128_256 ...) (SaturateToUint8Uint16x32 ...) => (VPMOVUSWB256 ...) +(SaturateToUint8Uint32x4 ...) => (VPMOVUSDB128_128 ...) +(SaturateToUint8Uint32x8 ...) => (VPMOVUSDB128_256 ...) +(SaturateToUint8Uint32x16 ...) => (VPMOVUSDB128_512 ...) +(SaturateToUint8Uint64x2 ...) => (VPMOVUSQB128_128 ...) +(SaturateToUint8Uint64x4 ...) => (VPMOVUSQB128_256 ...) +(SaturateToUint8Uint64x8 ...) => (VPMOVUSQB128_512 ...) (SaturateToUint16Uint32x4 ...) => (VPMOVUSDW128_128 ...) (SaturateToUint16Uint32x8 ...) => (VPMOVUSDW128_256 ...) (SaturateToUint16Uint32x16 ...) => (VPMOVUSDW256 ...) (SaturateToUint16Uint64x2 ...) => (VPMOVUSQW128_128 ...) (SaturateToUint16Uint64x4 ...) => (VPMOVUSQW128_256 ...) (SaturateToUint16Uint64x8 ...) => (VPMOVUSQW128_512 ...) -(SaturateToUint16ConcatUint32x4 ...) => (VPACKUSDW128 ...) -(SaturateToUint16ConcatUint32x8 ...) => (VPACKUSDW256 ...) -(SaturateToUint16ConcatUint32x16 ...) => (VPACKUSDW512 ...) +(SaturateToUint16ConcatInt32x4 ...) => (VPACKUSDW128 ...) +(SaturateToUint16ConcatGroupedInt32x8 ...) => (VPACKUSDW256 ...) +(SaturateToUint16ConcatGroupedInt32x16 ...) => (VPACKUSDW512 ...) (SaturateToUint32Uint64x2 ...) => (VPMOVUSQD128_128 ...) (SaturateToUint32Uint64x4 ...) => (VPMOVUSQD128_256 ...) (SaturateToUint32Uint64x8 ...) => (VPMOVUSQD256 ...) @@ -1223,19 +1211,19 @@ (SubUint64x4 ...) => (VPSUBQ256 ...) (SubUint64x8 ...) => (VPSUBQ512 ...) (SubPairsFloat32x4 ...) => (VHSUBPS128 ...) -(SubPairsFloat32x8 ...) => (VHSUBPS256 ...) (SubPairsFloat64x2 ...) => (VHSUBPD128 ...) -(SubPairsFloat64x4 ...) => (VHSUBPD256 ...) (SubPairsInt16x8 ...) => (VPHSUBW128 ...) -(SubPairsInt16x16 ...) => (VPHSUBW256 ...) (SubPairsInt32x4 ...) => (VPHSUBD128 ...) -(SubPairsInt32x8 ...) => (VPHSUBD256 ...) (SubPairsUint16x8 ...) => (VPHSUBW128 ...) -(SubPairsUint16x16 ...) => (VPHSUBW256 ...) (SubPairsUint32x4 ...) => (VPHSUBD128 ...) 
-(SubPairsUint32x8 ...) => (VPHSUBD256 ...) +(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...) +(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...) +(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...) +(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...) +(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...) +(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...) (SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...) -(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...) +(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...) (SubSaturatedInt8x16 ...) => (VPSUBSB128 ...) (SubSaturatedInt8x32 ...) => (VPSUBSB256 ...) (SubSaturatedInt8x64 ...) => (VPSUBSB512 ...) @@ -1547,12 +1535,6 @@ (VMOVDQU16Masked128 (VPMADDUBSW128 x y) mask) => (VPMADDUBSWMasked128 x y mask) (VMOVDQU16Masked256 (VPMADDUBSW256 x y) mask) => (VPMADDUBSWMasked256 x y mask) (VMOVDQU16Masked512 (VPMADDUBSW512 x y) mask) => (VPMADDUBSWMasked512 x y mask) -(VMOVDQU32Masked128 (VPDPBUSD128 x y z) mask) => (VPDPBUSDMasked128 x y z mask) -(VMOVDQU32Masked256 (VPDPBUSD256 x y z) mask) => (VPDPBUSDMasked256 x y z mask) -(VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) => (VPDPBUSDMasked512 x y z mask) -(VMOVDQU32Masked128 (VPDPBUSDS128 x y z) mask) => (VPDPBUSDSMasked128 x y z mask) -(VMOVDQU32Masked256 (VPDPBUSDS256 x y z) mask) => (VPDPBUSDSMasked256 x y z mask) -(VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) => (VPDPBUSDSMasked512 x y z mask) (VMOVDQU8Masked128 (VPMOVSXBQ128 x) mask) => (VPMOVSXBQMasked128 x mask) (VMOVDQU16Masked128 (VPMOVSXWQ128 x) mask) => (VPMOVSXWQMasked128 x mask) (VMOVDQU32Masked128 (VPMOVSXDQ128 x) mask) => (VPMOVSXDQMasked128 x mask) @@ -1775,9 +1757,9 @@ (VMOVDQU64Masked128 (VPMOVSQB128_128 x) mask) => (VPMOVSQBMasked128_128 x mask) (VMOVDQU64Masked256 (VPMOVSQB128_256 x) mask) => (VPMOVSQBMasked128_256 x mask) (VMOVDQU64Masked512 (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512 x mask) -(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask) (VMOVDQU32Masked256 
(VPACKSSDW256 x y) mask) => (VPACKSSDWMasked256 x y mask) (VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) => (VPACKSSDWMasked512 x y mask) +(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask) (VMOVDQU32Masked128 (VPMOVSDW128_128 x) mask) => (VPMOVSDWMasked128_128 x mask) (VMOVDQU32Masked256 (VPMOVSDW128_256 x) mask) => (VPMOVSDWMasked128_256 x mask) (VMOVDQU32Masked256 (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256 x mask) @@ -1787,10 +1769,18 @@ (VMOVDQU64Masked128 (VPMOVSQD128_128 x) mask) => (VPMOVSQDMasked128_128 x mask) (VMOVDQU64Masked256 (VPMOVSQD128_256 x) mask) => (VPMOVSQDMasked128_256 x mask) (VMOVDQU64Masked256 (VPMOVSQD256 x) mask) => (VPMOVSQDMasked256 x mask) +(VMOVDQU16Masked128 (VPMOVUSWB128_128 x) mask) => (VPMOVUSWBMasked128_128 x mask) +(VMOVDQU16Masked256 (VPMOVUSWB128_256 x) mask) => (VPMOVUSWBMasked128_256 x mask) (VMOVDQU16Masked256 (VPMOVUSWB256 x) mask) => (VPMOVUSWBMasked256 x mask) -(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask) +(VMOVDQU32Masked128 (VPMOVUSDB128_128 x) mask) => (VPMOVUSDBMasked128_128 x mask) +(VMOVDQU32Masked256 (VPMOVUSDB128_256 x) mask) => (VPMOVUSDBMasked128_256 x mask) +(VMOVDQU32Masked512 (VPMOVUSDB128_512 x) mask) => (VPMOVUSDBMasked128_512 x mask) +(VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) => (VPMOVUSQBMasked128_128 x mask) +(VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) => (VPMOVUSQBMasked128_256 x mask) +(VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512 x mask) (VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) => (VPACKUSDWMasked256 x y mask) (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask) +(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask) (VMOVDQU32Masked128 (VPMOVUSDW128_128 x) mask) => (VPMOVUSDWMasked128_128 x mask) (VMOVDQU32Masked256 (VPMOVUSDW128_256 x) mask) => (VPMOVUSDWMasked128_256 x mask) (VMOVDQU32Masked256 (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256 x mask) @@ 
-2018,6 +2008,7 @@ (VPBLENDMDMasked512 dst (VPMOVDW256 x) mask) => (VPMOVDWMasked256Merging dst x mask) (VPBLENDMDMasked512 dst (VPMOVSDB128_512 x) mask) => (VPMOVSDBMasked128_512Merging dst x mask) (VPBLENDMDMasked512 dst (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256Merging dst x mask) +(VPBLENDMDMasked512 dst (VPMOVUSDB128_512 x) mask) => (VPMOVUSDBMasked128_512Merging dst x mask) (VPBLENDMDMasked512 dst (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256Merging dst x mask) (VPBLENDMDMasked512 dst (VPMULLD512 x y) mask) => (VPMULLDMasked512Merging dst x y mask) (VPBLENDMDMasked512 dst (VPOPCNTD512 x) mask) => (VPOPCNTDMasked512Merging dst x mask) @@ -2071,6 +2062,7 @@ (VPBLENDMQMasked512 dst (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVSQD256 x) mask) => (VPMOVSQDMasked256Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVSQW128_512 x) mask) => (VPMOVSQWMasked128_512Merging dst x mask) +(VPBLENDMQMasked512 dst (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVUSQD256 x) mask) => (VPMOVUSQDMasked256Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVUSQW128_512 x) mask) => (VPMOVUSQWMasked128_512Merging dst x mask) (VPBLENDMQMasked512 dst (VPMULLQ512 x y) mask) => (VPMULLQMasked512Merging dst x y mask) @@ -2235,9 +2227,12 @@ (VPBLENDVB128 dst (VPMOVSXWQ128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPMOVSXWQ256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked256Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPMOVSXWQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked512Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask)) +(VPBLENDVB128 dst (VPMOVUSDB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDBMasked128_128Merging dst x (VPMOVVec32x4ToM <types.TypeMask> 
mask)) (VPBLENDVB128 dst (VPMOVUSDW128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDWMasked128_128Merging dst x (VPMOVVec32x4ToM <types.TypeMask> mask)) +(VPBLENDVB128 dst (VPMOVUSQB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQBMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPMOVUSQD128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQDMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPMOVUSQW128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQWMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask)) +(VPBLENDVB128 dst (VPMOVUSWB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSWBMasked128_128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPMOVWB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVWBMasked128_128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPMOVZXBD128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBDMasked128Merging dst x (VPMOVVec8x16ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPMOVZXBD256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBDMasked256Merging dst x (VPMOVVec8x16ToM <types.TypeMask> mask)) @@ -2396,9 +2391,12 @@ (VPBLENDVB256 dst (VPMOVSXBW512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXBWMasked512Merging dst x (VPMOVVec8x32ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVSXDQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXDQMasked512Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVSXWD512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWDMasked512Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask)) +(VPBLENDVB256 dst (VPMOVUSDB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDBMasked128_256Merging dst 
x (VPMOVVec32x8ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVUSDW128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDWMasked128_256Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask)) +(VPBLENDVB256 dst (VPMOVUSQB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQBMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVUSQD128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQDMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVUSQW128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQWMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask)) +(VPBLENDVB256 dst (VPMOVUSWB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSWBMasked128_256Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVWB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVWBMasked128_256Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVZXBW512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBWMasked512Merging dst x (VPMOVVec8x32ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPMOVZXDQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXDQMasked512Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask)) @@ -2511,30 +2509,30 @@ (VPANDNQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked128load {sym} [off] x ptr mask mem) (VPANDNQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked256load {sym} [off] x ptr mask mem) (VPANDNQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked512load {sym} [off] x ptr mask mem) -(VRNDSCALEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => 
(VRNDSCALEPS128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VRNDSCALEPS256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VRNDSCALEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VRNDSCALEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VRNDSCALEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VRNDSCALEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VRNDSCALEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VRNDSCALEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VRNDSCALEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VRNDSCALEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VRNDSCALEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VRNDSCALEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) 
=> (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VREDUCEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VREDUCEPS256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VREDUCEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VREDUCEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VREDUCEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VREDUCEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VREDUCEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VREDUCEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VREDUCEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} 
[makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VRNDSCALEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VRNDSCALEPS256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VRNDSCALEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VRNDSCALEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VRNDSCALEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VRNDSCALEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VRNDSCALEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VRNDSCALEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VRNDSCALEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VRNDSCALEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked128load {sym} 
[makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VRNDSCALEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VRNDSCALEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VREDUCEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VREDUCEPS256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VREDUCEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VREDUCEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VREDUCEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VREDUCEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VREDUCEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VREDUCEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VREDUCEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked512load {sym} 
[makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) (VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem) (VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem) (VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem) @@ -2655,54 +2653,46 @@ (VDIVPDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VDIVPDMasked128load {sym} [off] x ptr mask mem) (VDIVPDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VDIVPDMasked256load {sym} [off] x ptr mask mem) (VDIVPDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VDIVPDMasked512load {sym} [off] x ptr mask mem) -(VPDPBUSD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSD512load {sym} [off] x y ptr mem) -(VPDPBUSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked128load {sym} [off] x y ptr mask mem) -(VPDPBUSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked256load {sym} [off] x y ptr mask mem) -(VPDPBUSDMasked512 
x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked512load {sym} [off] x y ptr mask mem) -(VPDPBUSDS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDS512load {sym} [off] x y ptr mem) -(VPDPBUSDSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDSMasked128load {sym} [off] x y ptr mask mem) -(VPDPBUSDSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDSMasked256load {sym} [off] x y ptr mask mem) -(VPDPBUSDSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDSMasked512load {sym} [off] x y ptr mask mem) (VPCMPEQD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPEQD512load {sym} [off] x ptr mem) (VPCMPEQQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPEQQ512load {sym} [off] x ptr mem) -(VCMPPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VCMPPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VCMPPSMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VCMPPSMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VCMPPSMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VCMPPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) 
&& canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VCMPPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VCMPPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPUDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPUDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) 
-(VPCMPUDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPUQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPUQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPCMPUQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VGF2P8AFFINEQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VGF2P8AFFINEQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VGF2P8AFFINEQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VGF2P8AFFINEINVQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VGF2P8AFFINEINVQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VGF2P8AFFINEINVQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VGF2P8AFFINEINVQBMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => 
(VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VGF2P8AFFINEINVQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VGF2P8AFFINEINVQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VGF2P8AFFINEQBMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VGF2P8AFFINEQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VGF2P8AFFINEQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) +(VCMPPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VCMPPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VCMPPSMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VCMPPSMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VCMPPSMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x 
ptr mask mem) +(VCMPPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VCMPPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VCMPPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPUDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPUDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && 
clobber(l) => (VPCMPUDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPUDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPUQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPUQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPCMPUQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VGF2P8AFFINEQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VGF2P8AFFINEQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VGF2P8AFFINEQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VGF2P8AFFINEINVQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VGF2P8AFFINEINVQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VGF2P8AFFINEINVQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VGF2P8AFFINEINVQBMasked128 [c] x 
l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VGF2P8AFFINEINVQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VGF2P8AFFINEINVQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VGF2P8AFFINEQBMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VGF2P8AFFINEQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VGF2P8AFFINEQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) (VPCMPGTD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPGTD512load {sym} [off] x ptr mem) (VPCMPGTQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPGTQ512load {sym} [off] x ptr mem) -(VPCMPUD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPCMPUQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPCMPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPCMPQ512 [c] x 
l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) +(VPCMPUD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPCMPUQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPCMPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPCMPQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) (VPUNPCKHDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKHDQ512load {sym} [off] x ptr mem) (VPUNPCKHQDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKHQDQ512load {sym} [off] x ptr mem) (VPUNPCKLDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKLDQ512load {sym} [off] x ptr mem) @@ -2883,30 +2873,30 @@ (VRSQRT14PDMasked128 l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked128load {sym} [off] ptr mask mem) (VRSQRT14PDMasked256 l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked256load {sym} [off] ptr mask mem) (VRSQRT14PDMasked512 l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked512load {sym} [off] ptr mask mem) -(VPROLD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPROLD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD256load {sym} 
[makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPROLD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPROLQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPROLQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPROLQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPROLDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPROLDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPROLDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPROLQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPROLQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPROLQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPRORD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPRORD256 [c] l:(VMOVDQUload256 {sym} [off] ptr 
mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPRORD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPRORQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPRORQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPRORQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPRORDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPRORDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPRORDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPRORQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPRORQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPRORQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPROLD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD128load {sym} 
[makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPROLD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPROLD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPROLQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPROLQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPROLQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPROLDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPROLDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPROLDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPROLQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPROLQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPROLQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPRORD128 [c] l:(VMOVDQUload128 {sym} 
[off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPRORD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPRORD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPRORQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPRORQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPRORQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPRORDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPRORDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPRORDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPRORQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPRORQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPRORQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked512load {sym} 
[makeValAndOff(int32(uint8(c)),off)] ptr mask mem) (VPROLVD128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD128load {sym} [off] x ptr mem) (VPROLVD256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD256load {sym} [off] x ptr mem) (VPROLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD512load {sym} [off] x ptr mem) @@ -2932,13 +2922,13 @@ (VPRORVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked256load {sym} [off] x ptr mask mem) (VPRORVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked512load {sym} [off] x ptr mask mem) (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem) -(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem) (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem) (VPACKSSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked512load {sym} [off] x ptr mask mem) +(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem) (VPACKUSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDW512load {sym} [off] x ptr mem) -(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem) (VPACKUSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked256load {sym} [off] x ptr mask 
mem) (VPACKUSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked512load {sym} [off] x ptr mask mem) +(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem) (VSCALEFPS128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS128load {sym} [off] x ptr mem) (VSCALEFPS256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS256load {sym} [off] x ptr mem) (VSCALEFPS512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS512load {sym} [off] x ptr mem) @@ -2951,30 +2941,30 @@ (VSCALEFPDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked128load {sym} [off] x ptr mask mem) (VSCALEFPDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked256load {sym} [off] x ptr mask mem) (VSCALEFPDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked512load {sym} [off] x ptr mask mem) -(VPSHLDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHLDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHLDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHLDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHLDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => 
(VPSHLDQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHLDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHLDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHLDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHLDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHLDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHLDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHLDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHRDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHRDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHRDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHRDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ128load {sym} 
[makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHRDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHRDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHRDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHRDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHRDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHRDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHRDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) -(VPSHRDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) +(VPSHLDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHLDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHLDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD512load {sym} 
[makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHLDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHLDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHLDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHLDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHLDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHLDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHLDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHLDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHLDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHRDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHRDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD256load {sym} 
[makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHRDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHRDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHRDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHRDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHRDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHRDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHRDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHRDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHRDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) +(VPSHRDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) (VPSLLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLVD512load {sym} [off] x ptr mem) 
(VPSLLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLVQ512load {sym} [off] x ptr mem) (VPSHLDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVD128load {sym} [off] x y ptr mem) @@ -3059,41 +3049,41 @@ (VPXORQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPXORQMasked512load {sym} [off] x ptr mask mem) (VPBLENDMDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMDMasked512load {sym} [off] x ptr mask mem) (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem) -(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) -(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) 
-(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSLLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSLLDMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSLLQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSLLQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSLLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSRLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSRAD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSRAQ128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ128constload {sym} 
[makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSRAQ256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSRAQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSRLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRLDMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRLQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRLQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRADMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRADMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRADMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) 
mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRAQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRAQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSRAQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPTERNLOGD128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) -(VPTERNLOGD256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) -(VPTERNLOGD512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) -(VPTERNLOGQ128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) -(VPTERNLOGQ256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) -(VPTERNLOGQ512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) +(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && 
canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) +(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSLLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSLLDMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSLLQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSLLQMasked256const [c] l:(VMOVDQUload256 {sym} 
[off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSLLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLD512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSRLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSRAD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAD512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSRAQ128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSRAQ256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSRAQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) +(VPSRLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRLDMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRLQMasked128const [c] l:(VMOVDQUload128 
{sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRLQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRADMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRADMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRADMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRAQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRAQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPSRAQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) +(VPTERNLOGD128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) +(VPTERNLOGD256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => 
(VPTERNLOGD256load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) +(VPTERNLOGD512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) +(VPTERNLOGQ128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) +(VPTERNLOGQ256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) +(VPTERNLOGQ512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index f38d24fde7..648e372fb4 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. 
package main @@ -452,18 +452,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPCOMPRESSWMasked128", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPCOMPRESSWMasked256", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false}, - {name: "VPDPBUSD128", argLength: 3, reg: v31, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true}, - {name: "VPDPBUSD256", argLength: 3, reg: v31, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true}, - {name: "VPDPBUSD512", argLength: 3, reg: w31, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true}, - {name: "VPDPBUSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true}, - {name: "VPDPBUSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true}, - {name: "VPDPBUSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true}, - {name: "VPDPBUSDS128", argLength: 3, reg: v31, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true}, - {name: "VPDPBUSDS256", argLength: 3, reg: v31, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true}, - {name: "VPDPBUSDS512", argLength: 3, reg: w31, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", resultInArg0: true}, - {name: "VPDPBUSDSMasked128", argLength: 4, reg: w3kw, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true}, - {name: "VPDPBUSDSMasked256", argLength: 4, reg: w3kw, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true}, - {name: "VPDPBUSDSMasked512", argLength: 4, reg: w3kw, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", 
resultInArg0: true}, {name: "VPDPWSSD128", argLength: 3, reg: v31, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPDPWSSD256", argLength: 3, reg: v31, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPDPWSSD512", argLength: 3, reg: w31, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, @@ -780,12 +768,24 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMOVSXWQMasked128", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVSXWQMasked256", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPMOVSXWQMasked512", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPMOVUSDB128_128", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDB128_256", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDB128_512", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDBMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDW128_128", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDW128_256", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDW256", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: 
"Vec256", resultInArg0: false}, {name: "VPMOVUSDWMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDWMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDWMasked256", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPMOVUSQB128_128", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQB128_256", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQB128_512", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQBMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQD128_128", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQD128_256", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQD256", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -798,7 +798,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMOVUSQWMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQWMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQWMasked128_512", argLength: 2, 
reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSWB128_128", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSWB128_256", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSWB256", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPMOVUSWBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSWBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSWBMasked256", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPMOVWB128_128", argLength: 1, reg: w11, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVWB128_256", argLength: 1, reg: w11, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -1698,14 +1702,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPCMPEQQ512load", argLength: 3, reg: w2kload, asm: "VPCMPEQQ", commutative: false, typ: "Mask", aux: "SymOff", symEffect: "Read", resultInArg0: false}, {name: "VPCMPGTD512load", argLength: 3, reg: w2kload, asm: "VPCMPGTD", commutative: false, typ: "Mask", aux: "SymOff", symEffect: "Read", resultInArg0: false}, {name: "VPCMPGTQ512load", argLength: 3, reg: w2kload, asm: "VPCMPGTQ", commutative: false, typ: "Mask", aux: "SymOff", symEffect: "Read", resultInArg0: false}, - {name: "VPDPBUSD512load", argLength: 4, reg: w31load, asm: "VPDPBUSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPBUSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", 
resultInArg0: true}, - {name: "VPDPBUSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPBUSDMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPBUSDS512load", argLength: 4, reg: w31load, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPBUSDSMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPBUSDSMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPBUSDSMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSD512load", argLength: 4, reg: w31load, asm: "VPDPWSSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, @@ -2382,15 +2378,23 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMOVSXWQMasked128Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVSXWQMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPMOVSXWQMasked512Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", 
commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPMOVUSDBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSDBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSDBMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSDWMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSDWMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSDWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPMOVUSQBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSQBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSQBMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQDMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQDMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQDMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPMOVUSQWMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQWMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: 
true}, {name: "VPMOVUSQWMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSWBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSWBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSWBMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPMOVWBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVWBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: true}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index a68d8c4122..889ab0d84f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. 
package main @@ -48,19 +48,19 @@ func simdGenericOps() []opData { {name: "AddInt64x4", argLength: 2, commutative: true}, {name: "AddInt64x8", argLength: 2, commutative: true}, {name: "AddPairsFloat32x4", argLength: 2, commutative: false}, - {name: "AddPairsFloat32x8", argLength: 2, commutative: false}, {name: "AddPairsFloat64x2", argLength: 2, commutative: false}, - {name: "AddPairsFloat64x4", argLength: 2, commutative: false}, + {name: "AddPairsGroupedFloat32x8", argLength: 2, commutative: false}, + {name: "AddPairsGroupedFloat64x4", argLength: 2, commutative: false}, + {name: "AddPairsGroupedInt16x16", argLength: 2, commutative: false}, + {name: "AddPairsGroupedInt32x8", argLength: 2, commutative: false}, + {name: "AddPairsGroupedUint16x16", argLength: 2, commutative: false}, + {name: "AddPairsGroupedUint32x8", argLength: 2, commutative: false}, {name: "AddPairsInt16x8", argLength: 2, commutative: false}, - {name: "AddPairsInt16x16", argLength: 2, commutative: false}, {name: "AddPairsInt32x4", argLength: 2, commutative: false}, - {name: "AddPairsInt32x8", argLength: 2, commutative: false}, + {name: "AddPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false}, {name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false}, - {name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false}, {name: "AddPairsUint16x8", argLength: 2, commutative: false}, - {name: "AddPairsUint16x16", argLength: 2, commutative: false}, {name: "AddPairsUint32x4", argLength: 2, commutative: false}, - {name: "AddPairsUint32x8", argLength: 2, commutative: false}, {name: "AddSaturatedInt8x16", argLength: 2, commutative: true}, {name: "AddSaturatedInt8x32", argLength: 2, commutative: true}, {name: "AddSaturatedInt8x64", argLength: 2, commutative: true}, @@ -304,12 +304,6 @@ func simdGenericOps() []opData { {name: "DotProductPairsSaturatedUint8x16", argLength: 2, commutative: false}, {name: "DotProductPairsSaturatedUint8x32", argLength: 2, commutative: false}, {name: 
"DotProductPairsSaturatedUint8x64", argLength: 2, commutative: false}, - {name: "DotProductQuadrupleInt32x4", argLength: 3, commutative: false}, - {name: "DotProductQuadrupleInt32x8", argLength: 3, commutative: false}, - {name: "DotProductQuadrupleInt32x16", argLength: 3, commutative: false}, - {name: "DotProductQuadrupleSaturatedInt32x4", argLength: 3, commutative: false}, - {name: "DotProductQuadrupleSaturatedInt32x8", argLength: 3, commutative: false}, - {name: "DotProductQuadrupleSaturatedInt32x16", argLength: 3, commutative: false}, {name: "EqualFloat32x4", argLength: 2, commutative: true}, {name: "EqualFloat32x8", argLength: 2, commutative: true}, {name: "EqualFloat32x16", argLength: 2, commutative: true}, @@ -370,26 +364,26 @@ func simdGenericOps() []opData { {name: "ExpandUint64x2", argLength: 2, commutative: false}, {name: "ExpandUint64x4", argLength: 2, commutative: false}, {name: "ExpandUint64x8", argLength: 2, commutative: false}, - {name: "ExtendLo2ToInt64x2Int8x16", argLength: 1, commutative: false}, - {name: "ExtendLo2ToInt64x2Int16x8", argLength: 1, commutative: false}, - {name: "ExtendLo2ToInt64x2Int32x4", argLength: 1, commutative: false}, - {name: "ExtendLo2ToUint64x2Uint8x16", argLength: 1, commutative: false}, - {name: "ExtendLo2ToUint64x2Uint16x8", argLength: 1, commutative: false}, - {name: "ExtendLo2ToUint64x2Uint32x4", argLength: 1, commutative: false}, - {name: "ExtendLo4ToInt32x4Int8x16", argLength: 1, commutative: false}, - {name: "ExtendLo4ToInt32x4Int16x8", argLength: 1, commutative: false}, - {name: "ExtendLo4ToInt64x4Int8x16", argLength: 1, commutative: false}, - {name: "ExtendLo4ToInt64x4Int16x8", argLength: 1, commutative: false}, - {name: "ExtendLo4ToUint32x4Uint8x16", argLength: 1, commutative: false}, - {name: "ExtendLo4ToUint32x4Uint16x8", argLength: 1, commutative: false}, - {name: "ExtendLo4ToUint64x4Uint8x16", argLength: 1, commutative: false}, - {name: "ExtendLo4ToUint64x4Uint16x8", argLength: 1, commutative: false}, - 
{name: "ExtendLo8ToInt16x8Int8x16", argLength: 1, commutative: false}, - {name: "ExtendLo8ToInt32x8Int8x16", argLength: 1, commutative: false}, - {name: "ExtendLo8ToInt64x8Int8x16", argLength: 1, commutative: false}, - {name: "ExtendLo8ToUint16x8Uint8x16", argLength: 1, commutative: false}, - {name: "ExtendLo8ToUint32x8Uint8x16", argLength: 1, commutative: false}, - {name: "ExtendLo8ToUint64x8Uint8x16", argLength: 1, commutative: false}, + {name: "ExtendLo2ToInt64Int8x16", argLength: 1, commutative: false}, + {name: "ExtendLo2ToInt64Int16x8", argLength: 1, commutative: false}, + {name: "ExtendLo2ToInt64Int32x4", argLength: 1, commutative: false}, + {name: "ExtendLo2ToUint64Uint8x16", argLength: 1, commutative: false}, + {name: "ExtendLo2ToUint64Uint16x8", argLength: 1, commutative: false}, + {name: "ExtendLo2ToUint64Uint32x4", argLength: 1, commutative: false}, + {name: "ExtendLo4ToInt32Int8x16", argLength: 1, commutative: false}, + {name: "ExtendLo4ToInt32Int16x8", argLength: 1, commutative: false}, + {name: "ExtendLo4ToInt64Int8x16", argLength: 1, commutative: false}, + {name: "ExtendLo4ToInt64Int16x8", argLength: 1, commutative: false}, + {name: "ExtendLo4ToUint32Uint8x16", argLength: 1, commutative: false}, + {name: "ExtendLo4ToUint32Uint16x8", argLength: 1, commutative: false}, + {name: "ExtendLo4ToUint64Uint8x16", argLength: 1, commutative: false}, + {name: "ExtendLo4ToUint64Uint16x8", argLength: 1, commutative: false}, + {name: "ExtendLo8ToInt16Int8x16", argLength: 1, commutative: false}, + {name: "ExtendLo8ToInt32Int8x16", argLength: 1, commutative: false}, + {name: "ExtendLo8ToInt64Int8x16", argLength: 1, commutative: false}, + {name: "ExtendLo8ToUint16Uint8x16", argLength: 1, commutative: false}, + {name: "ExtendLo8ToUint32Uint8x16", argLength: 1, commutative: false}, + {name: "ExtendLo8ToUint64Uint8x16", argLength: 1, commutative: false}, {name: "ExtendToInt16Int8x16", argLength: 1, commutative: false}, {name: "ExtendToInt16Int8x32", argLength: 1, 
commutative: false}, {name: "ExtendToInt32Int8x16", argLength: 1, commutative: false}, @@ -525,12 +519,6 @@ func simdGenericOps() []opData { {name: "InterleaveLoUint16x8", argLength: 2, commutative: false}, {name: "InterleaveLoUint32x4", argLength: 2, commutative: false}, {name: "InterleaveLoUint64x2", argLength: 2, commutative: false}, - {name: "IsNanFloat32x4", argLength: 2, commutative: true}, - {name: "IsNanFloat32x8", argLength: 2, commutative: true}, - {name: "IsNanFloat32x16", argLength: 2, commutative: true}, - {name: "IsNanFloat64x2", argLength: 2, commutative: true}, - {name: "IsNanFloat64x4", argLength: 2, commutative: true}, - {name: "IsNanFloat64x8", argLength: 2, commutative: true}, {name: "LeadingZerosInt32x4", argLength: 1, commutative: false}, {name: "LeadingZerosInt32x8", argLength: 1, commutative: false}, {name: "LeadingZerosInt32x16", argLength: 1, commutative: false}, @@ -830,9 +818,9 @@ func simdGenericOps() []opData { {name: "SaturateToInt8Int64x2", argLength: 1, commutative: false}, {name: "SaturateToInt8Int64x4", argLength: 1, commutative: false}, {name: "SaturateToInt8Int64x8", argLength: 1, commutative: false}, + {name: "SaturateToInt16ConcatGroupedInt32x8", argLength: 2, commutative: false}, + {name: "SaturateToInt16ConcatGroupedInt32x16", argLength: 2, commutative: false}, {name: "SaturateToInt16ConcatInt32x4", argLength: 2, commutative: false}, - {name: "SaturateToInt16ConcatInt32x8", argLength: 2, commutative: false}, - {name: "SaturateToInt16ConcatInt32x16", argLength: 2, commutative: false}, {name: "SaturateToInt16Int32x4", argLength: 1, commutative: false}, {name: "SaturateToInt16Int32x8", argLength: 1, commutative: false}, {name: "SaturateToInt16Int32x16", argLength: 1, commutative: false}, @@ -842,18 +830,18 @@ func simdGenericOps() []opData { {name: "SaturateToInt32Int64x2", argLength: 1, commutative: false}, {name: "SaturateToInt32Int64x4", argLength: 1, commutative: false}, {name: "SaturateToInt32Int64x8", argLength: 1, 
commutative: false}, - {name: "SaturateToUint8Int16x8", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int16x16", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int32x4", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int32x8", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int32x16", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int64x2", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int64x4", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int64x8", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint16x8", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint16x16", argLength: 1, commutative: false}, {name: "SaturateToUint8Uint16x32", argLength: 1, commutative: false}, - {name: "SaturateToUint16ConcatUint32x4", argLength: 2, commutative: false}, - {name: "SaturateToUint16ConcatUint32x8", argLength: 2, commutative: false}, - {name: "SaturateToUint16ConcatUint32x16", argLength: 2, commutative: false}, + {name: "SaturateToUint8Uint32x4", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint32x8", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint32x16", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint64x2", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint64x4", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint64x8", argLength: 1, commutative: false}, + {name: "SaturateToUint16ConcatGroupedInt32x8", argLength: 2, commutative: false}, + {name: "SaturateToUint16ConcatGroupedInt32x16", argLength: 2, commutative: false}, + {name: "SaturateToUint16ConcatInt32x4", argLength: 2, commutative: false}, {name: "SaturateToUint16Uint32x4", argLength: 1, commutative: false}, {name: "SaturateToUint16Uint32x8", argLength: 1, commutative: false}, {name: "SaturateToUint16Uint32x16", argLength: 1, commutative: false}, @@ -1042,19 +1030,19 @@ func simdGenericOps() []opData { {name: "SubInt64x4", 
argLength: 2, commutative: false}, {name: "SubInt64x8", argLength: 2, commutative: false}, {name: "SubPairsFloat32x4", argLength: 2, commutative: false}, - {name: "SubPairsFloat32x8", argLength: 2, commutative: false}, {name: "SubPairsFloat64x2", argLength: 2, commutative: false}, - {name: "SubPairsFloat64x4", argLength: 2, commutative: false}, + {name: "SubPairsGroupedFloat32x8", argLength: 2, commutative: false}, + {name: "SubPairsGroupedFloat64x4", argLength: 2, commutative: false}, + {name: "SubPairsGroupedInt16x16", argLength: 2, commutative: false}, + {name: "SubPairsGroupedInt32x8", argLength: 2, commutative: false}, + {name: "SubPairsGroupedUint16x16", argLength: 2, commutative: false}, + {name: "SubPairsGroupedUint32x8", argLength: 2, commutative: false}, {name: "SubPairsInt16x8", argLength: 2, commutative: false}, - {name: "SubPairsInt16x16", argLength: 2, commutative: false}, {name: "SubPairsInt32x4", argLength: 2, commutative: false}, - {name: "SubPairsInt32x8", argLength: 2, commutative: false}, + {name: "SubPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false}, {name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false}, - {name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false}, {name: "SubPairsUint16x8", argLength: 2, commutative: false}, - {name: "SubPairsUint16x16", argLength: 2, commutative: false}, {name: "SubPairsUint32x4", argLength: 2, commutative: false}, - {name: "SubPairsUint32x8", argLength: 2, commutative: false}, {name: "SubSaturatedInt8x16", argLength: 2, commutative: false}, {name: "SubSaturatedInt8x32", argLength: 2, commutative: false}, {name: "SubSaturatedInt8x64", argLength: 2, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 00d581ec9a..7b70dc2686 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1214,6 +1214,12 @@ const ( OpAMD64VPMOVVec64x2ToM OpAMD64VPMOVVec64x4ToM 
OpAMD64VPMOVVec64x8ToM + OpAMD64VPMOVMSKB128 + OpAMD64VPMOVMSKB256 + OpAMD64VMOVMSKPS128 + OpAMD64VMOVMSKPS256 + OpAMD64VMOVMSKPD128 + OpAMD64VMOVMSKPD256 OpAMD64Zero128 OpAMD64Zero256 OpAMD64Zero512 @@ -1693,18 +1699,6 @@ const ( OpAMD64VPCOMPRESSWMasked128 OpAMD64VPCOMPRESSWMasked256 OpAMD64VPCOMPRESSWMasked512 - OpAMD64VPDPBUSD128 - OpAMD64VPDPBUSD256 - OpAMD64VPDPBUSD512 - OpAMD64VPDPBUSDMasked128 - OpAMD64VPDPBUSDMasked256 - OpAMD64VPDPBUSDMasked512 - OpAMD64VPDPBUSDS128 - OpAMD64VPDPBUSDS256 - OpAMD64VPDPBUSDS512 - OpAMD64VPDPBUSDSMasked128 - OpAMD64VPDPBUSDSMasked256 - OpAMD64VPDPBUSDSMasked512 OpAMD64VPDPWSSD128 OpAMD64VPDPWSSD256 OpAMD64VPDPWSSD512 @@ -2021,12 +2015,24 @@ const ( OpAMD64VPMOVSXWQMasked128 OpAMD64VPMOVSXWQMasked256 OpAMD64VPMOVSXWQMasked512 + OpAMD64VPMOVUSDB128_128 + OpAMD64VPMOVUSDB128_256 + OpAMD64VPMOVUSDB128_512 + OpAMD64VPMOVUSDBMasked128_128 + OpAMD64VPMOVUSDBMasked128_256 + OpAMD64VPMOVUSDBMasked128_512 OpAMD64VPMOVUSDW128_128 OpAMD64VPMOVUSDW128_256 OpAMD64VPMOVUSDW256 OpAMD64VPMOVUSDWMasked128_128 OpAMD64VPMOVUSDWMasked128_256 OpAMD64VPMOVUSDWMasked256 + OpAMD64VPMOVUSQB128_128 + OpAMD64VPMOVUSQB128_256 + OpAMD64VPMOVUSQB128_512 + OpAMD64VPMOVUSQBMasked128_128 + OpAMD64VPMOVUSQBMasked128_256 + OpAMD64VPMOVUSQBMasked128_512 OpAMD64VPMOVUSQD128_128 OpAMD64VPMOVUSQD128_256 OpAMD64VPMOVUSQD256 @@ -2039,7 +2045,11 @@ const ( OpAMD64VPMOVUSQWMasked128_128 OpAMD64VPMOVUSQWMasked128_256 OpAMD64VPMOVUSQWMasked128_512 + OpAMD64VPMOVUSWB128_128 + OpAMD64VPMOVUSWB128_256 OpAMD64VPMOVUSWB256 + OpAMD64VPMOVUSWBMasked128_128 + OpAMD64VPMOVUSWBMasked128_256 OpAMD64VPMOVUSWBMasked256 OpAMD64VPMOVWB128_128 OpAMD64VPMOVWB128_256 @@ -2939,14 +2949,6 @@ const ( OpAMD64VPCMPEQQ512load OpAMD64VPCMPGTD512load OpAMD64VPCMPGTQ512load - OpAMD64VPDPBUSD512load - OpAMD64VPDPBUSDMasked128load - OpAMD64VPDPBUSDMasked256load - OpAMD64VPDPBUSDMasked512load - OpAMD64VPDPBUSDS512load - OpAMD64VPDPBUSDSMasked128load - OpAMD64VPDPBUSDSMasked256load - 
OpAMD64VPDPBUSDSMasked512load OpAMD64VPDPWSSD512load OpAMD64VPDPWSSDMasked128load OpAMD64VPDPWSSDMasked256load @@ -3623,15 +3625,23 @@ const ( OpAMD64VPMOVSXWQMasked128Merging OpAMD64VPMOVSXWQMasked256Merging OpAMD64VPMOVSXWQMasked512Merging + OpAMD64VPMOVUSDBMasked128_128Merging + OpAMD64VPMOVUSDBMasked128_256Merging + OpAMD64VPMOVUSDBMasked128_512Merging OpAMD64VPMOVUSDWMasked128_128Merging OpAMD64VPMOVUSDWMasked128_256Merging OpAMD64VPMOVUSDWMasked256Merging + OpAMD64VPMOVUSQBMasked128_128Merging + OpAMD64VPMOVUSQBMasked128_256Merging + OpAMD64VPMOVUSQBMasked128_512Merging OpAMD64VPMOVUSQDMasked128_128Merging OpAMD64VPMOVUSQDMasked128_256Merging OpAMD64VPMOVUSQDMasked256Merging OpAMD64VPMOVUSQWMasked128_128Merging OpAMD64VPMOVUSQWMasked128_256Merging OpAMD64VPMOVUSQWMasked128_512Merging + OpAMD64VPMOVUSWBMasked128_128Merging + OpAMD64VPMOVUSWBMasked128_256Merging OpAMD64VPMOVUSWBMasked256Merging OpAMD64VPMOVWBMasked128_128Merging OpAMD64VPMOVWBMasked128_256Merging @@ -6154,6 +6164,12 @@ const ( OpCvtMask64x4to8 OpCvtMask64x8to8 OpIsZeroVec + OpIsNaNFloat32x4 + OpIsNaNFloat32x8 + OpIsNaNFloat32x16 + OpIsNaNFloat64x2 + OpIsNaNFloat64x4 + OpIsNaNFloat64x8 OpAESDecryptLastRoundUint8x16 OpAESDecryptLastRoundUint8x32 OpAESDecryptLastRoundUint8x64 @@ -6198,19 +6214,19 @@ const ( OpAddInt64x4 OpAddInt64x8 OpAddPairsFloat32x4 - OpAddPairsFloat32x8 OpAddPairsFloat64x2 - OpAddPairsFloat64x4 + OpAddPairsGroupedFloat32x8 + OpAddPairsGroupedFloat64x4 + OpAddPairsGroupedInt16x16 + OpAddPairsGroupedInt32x8 + OpAddPairsGroupedUint16x16 + OpAddPairsGroupedUint32x8 OpAddPairsInt16x8 - OpAddPairsInt16x16 OpAddPairsInt32x4 - OpAddPairsInt32x8 + OpAddPairsSaturatedGroupedInt16x16 OpAddPairsSaturatedInt16x8 - OpAddPairsSaturatedInt16x16 OpAddPairsUint16x8 - OpAddPairsUint16x16 OpAddPairsUint32x4 - OpAddPairsUint32x8 OpAddSaturatedInt8x16 OpAddSaturatedInt8x32 OpAddSaturatedInt8x64 @@ -6454,12 +6470,6 @@ const ( OpDotProductPairsSaturatedUint8x16 OpDotProductPairsSaturatedUint8x32 
OpDotProductPairsSaturatedUint8x64 - OpDotProductQuadrupleInt32x4 - OpDotProductQuadrupleInt32x8 - OpDotProductQuadrupleInt32x16 - OpDotProductQuadrupleSaturatedInt32x4 - OpDotProductQuadrupleSaturatedInt32x8 - OpDotProductQuadrupleSaturatedInt32x16 OpEqualFloat32x4 OpEqualFloat32x8 OpEqualFloat32x16 @@ -6520,26 +6530,26 @@ const ( OpExpandUint64x2 OpExpandUint64x4 OpExpandUint64x8 - OpExtendLo2ToInt64x2Int8x16 - OpExtendLo2ToInt64x2Int16x8 - OpExtendLo2ToInt64x2Int32x4 - OpExtendLo2ToUint64x2Uint8x16 - OpExtendLo2ToUint64x2Uint16x8 - OpExtendLo2ToUint64x2Uint32x4 - OpExtendLo4ToInt32x4Int8x16 - OpExtendLo4ToInt32x4Int16x8 - OpExtendLo4ToInt64x4Int8x16 - OpExtendLo4ToInt64x4Int16x8 - OpExtendLo4ToUint32x4Uint8x16 - OpExtendLo4ToUint32x4Uint16x8 - OpExtendLo4ToUint64x4Uint8x16 - OpExtendLo4ToUint64x4Uint16x8 - OpExtendLo8ToInt16x8Int8x16 - OpExtendLo8ToInt32x8Int8x16 - OpExtendLo8ToInt64x8Int8x16 - OpExtendLo8ToUint16x8Uint8x16 - OpExtendLo8ToUint32x8Uint8x16 - OpExtendLo8ToUint64x8Uint8x16 + OpExtendLo2ToInt64Int8x16 + OpExtendLo2ToInt64Int16x8 + OpExtendLo2ToInt64Int32x4 + OpExtendLo2ToUint64Uint8x16 + OpExtendLo2ToUint64Uint16x8 + OpExtendLo2ToUint64Uint32x4 + OpExtendLo4ToInt32Int8x16 + OpExtendLo4ToInt32Int16x8 + OpExtendLo4ToInt64Int8x16 + OpExtendLo4ToInt64Int16x8 + OpExtendLo4ToUint32Uint8x16 + OpExtendLo4ToUint32Uint16x8 + OpExtendLo4ToUint64Uint8x16 + OpExtendLo4ToUint64Uint16x8 + OpExtendLo8ToInt16Int8x16 + OpExtendLo8ToInt32Int8x16 + OpExtendLo8ToInt64Int8x16 + OpExtendLo8ToUint16Uint8x16 + OpExtendLo8ToUint32Uint8x16 + OpExtendLo8ToUint64Uint8x16 OpExtendToInt16Int8x16 OpExtendToInt16Int8x32 OpExtendToInt32Int8x16 @@ -6675,12 +6685,6 @@ const ( OpInterleaveLoUint16x8 OpInterleaveLoUint32x4 OpInterleaveLoUint64x2 - OpIsNanFloat32x4 - OpIsNanFloat32x8 - OpIsNanFloat32x16 - OpIsNanFloat64x2 - OpIsNanFloat64x4 - OpIsNanFloat64x8 OpLeadingZerosInt32x4 OpLeadingZerosInt32x8 OpLeadingZerosInt32x16 @@ -6980,9 +6984,9 @@ const ( OpSaturateToInt8Int64x2 
OpSaturateToInt8Int64x4 OpSaturateToInt8Int64x8 + OpSaturateToInt16ConcatGroupedInt32x8 + OpSaturateToInt16ConcatGroupedInt32x16 OpSaturateToInt16ConcatInt32x4 - OpSaturateToInt16ConcatInt32x8 - OpSaturateToInt16ConcatInt32x16 OpSaturateToInt16Int32x4 OpSaturateToInt16Int32x8 OpSaturateToInt16Int32x16 @@ -6992,18 +6996,18 @@ const ( OpSaturateToInt32Int64x2 OpSaturateToInt32Int64x4 OpSaturateToInt32Int64x8 - OpSaturateToUint8Int16x8 - OpSaturateToUint8Int16x16 - OpSaturateToUint8Int32x4 - OpSaturateToUint8Int32x8 - OpSaturateToUint8Int32x16 - OpSaturateToUint8Int64x2 - OpSaturateToUint8Int64x4 - OpSaturateToUint8Int64x8 + OpSaturateToUint8Uint16x8 + OpSaturateToUint8Uint16x16 OpSaturateToUint8Uint16x32 - OpSaturateToUint16ConcatUint32x4 - OpSaturateToUint16ConcatUint32x8 - OpSaturateToUint16ConcatUint32x16 + OpSaturateToUint8Uint32x4 + OpSaturateToUint8Uint32x8 + OpSaturateToUint8Uint32x16 + OpSaturateToUint8Uint64x2 + OpSaturateToUint8Uint64x4 + OpSaturateToUint8Uint64x8 + OpSaturateToUint16ConcatGroupedInt32x8 + OpSaturateToUint16ConcatGroupedInt32x16 + OpSaturateToUint16ConcatInt32x4 OpSaturateToUint16Uint32x4 OpSaturateToUint16Uint32x8 OpSaturateToUint16Uint32x16 @@ -7192,19 +7196,19 @@ const ( OpSubInt64x4 OpSubInt64x8 OpSubPairsFloat32x4 - OpSubPairsFloat32x8 OpSubPairsFloat64x2 - OpSubPairsFloat64x4 + OpSubPairsGroupedFloat32x8 + OpSubPairsGroupedFloat64x4 + OpSubPairsGroupedInt16x16 + OpSubPairsGroupedInt32x8 + OpSubPairsGroupedUint16x16 + OpSubPairsGroupedUint32x8 OpSubPairsInt16x8 - OpSubPairsInt16x16 OpSubPairsInt32x4 - OpSubPairsInt32x8 + OpSubPairsSaturatedGroupedInt16x16 OpSubPairsSaturatedInt16x8 - OpSubPairsSaturatedInt16x16 OpSubPairsUint16x8 - OpSubPairsUint16x16 OpSubPairsUint32x4 - OpSubPairsUint32x8 OpSubSaturatedInt8x16 OpSubSaturatedInt8x32 OpSubSaturatedInt8x64 @@ -20354,6 +20358,84 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVMSKB128", + argLen: 1, + asm: x86.AVPMOVMSKB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, 
// X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VPMOVMSKB256", + argLen: 1, + asm: x86.AVPMOVMSKB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPS128", + argLen: 1, + asm: x86.AVMOVMSKPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPS256", + argLen: 1, + asm: x86.AVMOVMSKPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPD128", + argLen: 1, + asm: x86.AVMOVMSKPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPD256", + argLen: 1, + asm: x86.AVMOVMSKPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { name: "Zero128", argLen: 0, zeroWidth: true, @@ -27270,204 +27352,6 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPDPBUSD128", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - {2, 4294901760}, // X0 X1 X2 X3 X4 
X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPDPBUSD256", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - {2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPDPBUSD512", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDMasked128", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ 
- {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDMasked256", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDMasked512", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDS128", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 
X10 X11 X12 X13 X14 X15 - {2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPDPBUSDS256", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - {2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPDPBUSDS512", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDSMasked128", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 
X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDSMasked256", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDSMasked512", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { name: "VPDPWSSD128", argLen: 3, resultInArg0: true, @@ -32104,6 +31988,87 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVUSDB128_128", + argLen: 1, + asm: 
x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDB128_256", + argLen: 1, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDB128_512", + argLen: 1, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_128", + argLen: 2, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_256", + argLen: 2, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 
X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_512", + argLen: 2, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { name: "VPMOVUSDW128_128", argLen: 1, asm: x86.AVPMOVUSDW, @@ -32185,6 +32150,87 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVUSQB128_128", + argLen: 1, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQB128_256", + argLen: 1, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQB128_512", + argLen: 1, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ 
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_128", + argLen: 2, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_256", + argLen: 2, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_512", + argLen: 2, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { name: "VPMOVUSQD128_128", argLen: 1, asm: x86.AVPMOVUSQD, @@ -32347,6 +32393,32 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVUSWB128_128", + argLen: 1, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 
X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSWB128_256", + argLen: 1, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { name: "VPMOVUSWB256", argLen: 1, asm: x86.AVPMOVUSWB, @@ -32360,6 +32432,34 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVUSWBMasked128_128", + argLen: 2, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSWBMasked128_256", + argLen: 2, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { name: "VPMOVUSWBMasked256", argLen: 2, asm: x86.AVPMOVUSWB, @@ -45952,156 +46052,6 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPDPBUSD512load", - auxType: auxSymOff, - argLen: 4, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSD, - reg: 
regInfo{ - inputs: []inputInfo{ - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDMasked128load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDMasked256load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 
281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDMasked512load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSD, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDS512load", - auxType: auxSymOff, - argLen: 4, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDSMasked128load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDSMasked256load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPBUSDSMasked512load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPBUSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - 
}, - }, - { name: "VPDPWSSD512load", auxType: auxSymOff, argLen: 4, @@ -57269,6 +57219,54 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVUSDBMasked128_128Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_256Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_512Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 
X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { name: "VPMOVUSDWMasked128_128Merging", argLen: 3, resultInArg0: true, @@ -57317,6 +57315,54 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVUSQBMasked128_128Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_256Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_512Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 
X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { name: "VPMOVUSQDMasked128_128Merging", argLen: 3, resultInArg0: true, @@ -57413,6 +57459,38 @@ var opcodeTable = [...]opInfo{ }, }, { + name: "VPMOVUSWBMasked128_128Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSWBMasked128_256Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { name: "VPMOVUSWBMasked256Merging", argLen: 3, resultInArg0: true, @@ -89000,6 +89078,36 @@ var opcodeTable = [...]opInfo{ generic: true, }, { + name: "IsNaNFloat32x4", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat32x8", + argLen: 1, + generic: true, + }, + { + name: 
"IsNaNFloat32x16", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat64x2", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat64x4", + argLen: 1, + generic: true, + }, + { + name: "IsNaNFloat64x8", + argLen: 1, + generic: true, + }, + { name: "AESDecryptLastRoundUint8x16", argLen: 2, generic: true, @@ -89238,67 +89346,67 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "AddPairsFloat32x8", + name: "AddPairsFloat64x2", argLen: 2, generic: true, }, { - name: "AddPairsFloat64x2", + name: "AddPairsGroupedFloat32x8", argLen: 2, generic: true, }, { - name: "AddPairsFloat64x4", + name: "AddPairsGroupedFloat64x4", argLen: 2, generic: true, }, { - name: "AddPairsInt16x8", + name: "AddPairsGroupedInt16x16", argLen: 2, generic: true, }, { - name: "AddPairsInt16x16", + name: "AddPairsGroupedInt32x8", argLen: 2, generic: true, }, { - name: "AddPairsInt32x4", + name: "AddPairsGroupedUint16x16", argLen: 2, generic: true, }, { - name: "AddPairsInt32x8", + name: "AddPairsGroupedUint32x8", argLen: 2, generic: true, }, { - name: "AddPairsSaturatedInt16x8", + name: "AddPairsInt16x8", argLen: 2, generic: true, }, { - name: "AddPairsSaturatedInt16x16", + name: "AddPairsInt32x4", argLen: 2, generic: true, }, { - name: "AddPairsUint16x8", + name: "AddPairsSaturatedGroupedInt16x16", argLen: 2, generic: true, }, { - name: "AddPairsUint16x16", + name: "AddPairsSaturatedInt16x8", argLen: 2, generic: true, }, { - name: "AddPairsUint32x4", + name: "AddPairsUint16x8", argLen: 2, generic: true, }, { - name: "AddPairsUint32x8", + name: "AddPairsUint32x4", argLen: 2, generic: true, }, @@ -90572,36 +90680,6 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "DotProductQuadrupleInt32x4", - argLen: 3, - generic: true, - }, - { - name: "DotProductQuadrupleInt32x8", - argLen: 3, - generic: true, - }, - { - name: "DotProductQuadrupleInt32x16", - argLen: 3, - generic: true, - }, - { - name: "DotProductQuadrupleSaturatedInt32x4", - argLen: 3, - generic: true, - }, - 
{ - name: "DotProductQuadrupleSaturatedInt32x8", - argLen: 3, - generic: true, - }, - { - name: "DotProductQuadrupleSaturatedInt32x16", - argLen: 3, - generic: true, - }, - { name: "EqualFloat32x4", argLen: 2, commutative: true, @@ -90932,102 +91010,102 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "ExtendLo2ToInt64x2Int8x16", + name: "ExtendLo2ToInt64Int8x16", argLen: 1, generic: true, }, { - name: "ExtendLo2ToInt64x2Int16x8", + name: "ExtendLo2ToInt64Int16x8", argLen: 1, generic: true, }, { - name: "ExtendLo2ToInt64x2Int32x4", + name: "ExtendLo2ToInt64Int32x4", argLen: 1, generic: true, }, { - name: "ExtendLo2ToUint64x2Uint8x16", + name: "ExtendLo2ToUint64Uint8x16", argLen: 1, generic: true, }, { - name: "ExtendLo2ToUint64x2Uint16x8", + name: "ExtendLo2ToUint64Uint16x8", argLen: 1, generic: true, }, { - name: "ExtendLo2ToUint64x2Uint32x4", + name: "ExtendLo2ToUint64Uint32x4", argLen: 1, generic: true, }, { - name: "ExtendLo4ToInt32x4Int8x16", + name: "ExtendLo4ToInt32Int8x16", argLen: 1, generic: true, }, { - name: "ExtendLo4ToInt32x4Int16x8", + name: "ExtendLo4ToInt32Int16x8", argLen: 1, generic: true, }, { - name: "ExtendLo4ToInt64x4Int8x16", + name: "ExtendLo4ToInt64Int8x16", argLen: 1, generic: true, }, { - name: "ExtendLo4ToInt64x4Int16x8", + name: "ExtendLo4ToInt64Int16x8", argLen: 1, generic: true, }, { - name: "ExtendLo4ToUint32x4Uint8x16", + name: "ExtendLo4ToUint32Uint8x16", argLen: 1, generic: true, }, { - name: "ExtendLo4ToUint32x4Uint16x8", + name: "ExtendLo4ToUint32Uint16x8", argLen: 1, generic: true, }, { - name: "ExtendLo4ToUint64x4Uint8x16", + name: "ExtendLo4ToUint64Uint8x16", argLen: 1, generic: true, }, { - name: "ExtendLo4ToUint64x4Uint16x8", + name: "ExtendLo4ToUint64Uint16x8", argLen: 1, generic: true, }, { - name: "ExtendLo8ToInt16x8Int8x16", + name: "ExtendLo8ToInt16Int8x16", argLen: 1, generic: true, }, { - name: "ExtendLo8ToInt32x8Int8x16", + name: "ExtendLo8ToInt32Int8x16", argLen: 1, generic: true, }, { - name: 
"ExtendLo8ToInt64x8Int8x16", + name: "ExtendLo8ToInt64Int8x16", argLen: 1, generic: true, }, { - name: "ExtendLo8ToUint16x8Uint8x16", + name: "ExtendLo8ToUint16Uint8x16", argLen: 1, generic: true, }, { - name: "ExtendLo8ToUint32x8Uint8x16", + name: "ExtendLo8ToUint32Uint8x16", argLen: 1, generic: true, }, { - name: "ExtendLo8ToUint64x8Uint8x16", + name: "ExtendLo8ToUint64Uint8x16", argLen: 1, generic: true, }, @@ -91707,42 +91785,6 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "IsNanFloat32x4", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat32x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat32x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat64x2", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat64x4", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "IsNanFloat64x8", - argLen: 2, - commutative: true, - generic: true, - }, - { name: "LeadingZerosInt32x4", argLen: 1, generic: true, @@ -93370,17 +93412,17 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "SaturateToInt16ConcatInt32x4", + name: "SaturateToInt16ConcatGroupedInt32x8", argLen: 2, generic: true, }, { - name: "SaturateToInt16ConcatInt32x8", + name: "SaturateToInt16ConcatGroupedInt32x16", argLen: 2, generic: true, }, { - name: "SaturateToInt16ConcatInt32x16", + name: "SaturateToInt16ConcatInt32x4", argLen: 2, generic: true, }, @@ -93430,62 +93472,62 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "SaturateToUint8Int16x8", + name: "SaturateToUint8Uint16x8", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int16x16", + name: "SaturateToUint8Uint16x16", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int32x4", + name: "SaturateToUint8Uint16x32", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int32x8", + name: "SaturateToUint8Uint32x4", argLen: 1, generic: true, }, { - name: 
"SaturateToUint8Int32x16", + name: "SaturateToUint8Uint32x8", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int64x2", + name: "SaturateToUint8Uint32x16", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int64x4", + name: "SaturateToUint8Uint64x2", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int64x8", + name: "SaturateToUint8Uint64x4", argLen: 1, generic: true, }, { - name: "SaturateToUint8Uint16x32", + name: "SaturateToUint8Uint64x8", argLen: 1, generic: true, }, { - name: "SaturateToUint16ConcatUint32x4", + name: "SaturateToUint16ConcatGroupedInt32x8", argLen: 2, generic: true, }, { - name: "SaturateToUint16ConcatUint32x8", + name: "SaturateToUint16ConcatGroupedInt32x16", argLen: 2, generic: true, }, { - name: "SaturateToUint16ConcatUint32x16", + name: "SaturateToUint16ConcatInt32x4", argLen: 2, generic: true, }, @@ -94430,67 +94472,67 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "SubPairsFloat32x8", + name: "SubPairsFloat64x2", argLen: 2, generic: true, }, { - name: "SubPairsFloat64x2", + name: "SubPairsGroupedFloat32x8", argLen: 2, generic: true, }, { - name: "SubPairsFloat64x4", + name: "SubPairsGroupedFloat64x4", argLen: 2, generic: true, }, { - name: "SubPairsInt16x8", + name: "SubPairsGroupedInt16x16", argLen: 2, generic: true, }, { - name: "SubPairsInt16x16", + name: "SubPairsGroupedInt32x8", argLen: 2, generic: true, }, { - name: "SubPairsInt32x4", + name: "SubPairsGroupedUint16x16", argLen: 2, generic: true, }, { - name: "SubPairsInt32x8", + name: "SubPairsGroupedUint32x8", argLen: 2, generic: true, }, { - name: "SubPairsSaturatedInt16x8", + name: "SubPairsInt16x8", argLen: 2, generic: true, }, { - name: "SubPairsSaturatedInt16x16", + name: "SubPairsInt32x4", argLen: 2, generic: true, }, { - name: "SubPairsUint16x8", + name: "SubPairsSaturatedGroupedInt16x16", argLen: 2, generic: true, }, { - name: "SubPairsUint16x16", + name: "SubPairsSaturatedInt16x8", argLen: 2, generic: true, }, { - name: "SubPairsUint32x4", 
+ name: "SubPairsUint16x8", argLen: 2, generic: true, }, { - name: "SubPairsUint32x8", + name: "SubPairsUint32x4", argLen: 2, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 19f16e1cbb..e84bf19c83 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1006,10 +1006,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VPACKUSDWMasked256(v) case OpAMD64VPACKUSDWMasked512: return rewriteValueAMD64_OpAMD64VPACKUSDWMasked512(v) - case OpAMD64VPADDD128: - return rewriteValueAMD64_OpAMD64VPADDD128(v) - case OpAMD64VPADDD256: - return rewriteValueAMD64_OpAMD64VPADDD256(v) case OpAMD64VPADDD512: return rewriteValueAMD64_OpAMD64VPADDD512(v) case OpAMD64VPADDDMasked128: @@ -1126,22 +1122,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VPCMPUQMasked256(v) case OpAMD64VPCMPUQMasked512: return rewriteValueAMD64_OpAMD64VPCMPUQMasked512(v) - case OpAMD64VPDPBUSD512: - return rewriteValueAMD64_OpAMD64VPDPBUSD512(v) - case OpAMD64VPDPBUSDMasked128: - return rewriteValueAMD64_OpAMD64VPDPBUSDMasked128(v) - case OpAMD64VPDPBUSDMasked256: - return rewriteValueAMD64_OpAMD64VPDPBUSDMasked256(v) - case OpAMD64VPDPBUSDMasked512: - return rewriteValueAMD64_OpAMD64VPDPBUSDMasked512(v) - case OpAMD64VPDPBUSDS512: - return rewriteValueAMD64_OpAMD64VPDPBUSDS512(v) - case OpAMD64VPDPBUSDSMasked128: - return rewriteValueAMD64_OpAMD64VPDPBUSDSMasked128(v) - case OpAMD64VPDPBUSDSMasked256: - return rewriteValueAMD64_OpAMD64VPDPBUSDSMasked256(v) - case OpAMD64VPDPBUSDSMasked512: - return rewriteValueAMD64_OpAMD64VPDPBUSDSMasked512(v) case OpAMD64VPDPWSSD512: return rewriteValueAMD64_OpAMD64VPDPWSSD512(v) case OpAMD64VPDPWSSDMasked128: @@ -1402,6 +1382,10 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VPOPCNTQMasked256(v) case OpAMD64VPOPCNTQMasked512: return 
rewriteValueAMD64_OpAMD64VPOPCNTQMasked512(v) + case OpAMD64VPOR128: + return rewriteValueAMD64_OpAMD64VPOR128(v) + case OpAMD64VPOR256: + return rewriteValueAMD64_OpAMD64VPOR256(v) case OpAMD64VPORD512: return rewriteValueAMD64_OpAMD64VPORD512(v) case OpAMD64VPORDMasked128: @@ -2133,45 +2117,45 @@ func rewriteValueAMD64(v *Value) bool { case OpAddPairsFloat32x4: v.Op = OpAMD64VHADDPS128 return true - case OpAddPairsFloat32x8: - v.Op = OpAMD64VHADDPS256 - return true case OpAddPairsFloat64x2: v.Op = OpAMD64VHADDPD128 return true - case OpAddPairsFloat64x4: + case OpAddPairsGroupedFloat32x8: + v.Op = OpAMD64VHADDPS256 + return true + case OpAddPairsGroupedFloat64x4: v.Op = OpAMD64VHADDPD256 return true - case OpAddPairsInt16x16: + case OpAddPairsGroupedInt16x16: + v.Op = OpAMD64VPHADDW256 + return true + case OpAddPairsGroupedInt32x8: + v.Op = OpAMD64VPHADDD256 + return true + case OpAddPairsGroupedUint16x16: v.Op = OpAMD64VPHADDW256 return true + case OpAddPairsGroupedUint32x8: + v.Op = OpAMD64VPHADDD256 + return true case OpAddPairsInt16x8: v.Op = OpAMD64VPHADDW128 return true case OpAddPairsInt32x4: v.Op = OpAMD64VPHADDD128 return true - case OpAddPairsInt32x8: - v.Op = OpAMD64VPHADDD256 - return true - case OpAddPairsSaturatedInt16x16: + case OpAddPairsSaturatedGroupedInt16x16: v.Op = OpAMD64VPHADDSW256 return true case OpAddPairsSaturatedInt16x8: v.Op = OpAMD64VPHADDSW128 return true - case OpAddPairsUint16x16: - v.Op = OpAMD64VPHADDW256 - return true case OpAddPairsUint16x8: v.Op = OpAMD64VPHADDW128 return true case OpAddPairsUint32x4: v.Op = OpAMD64VPHADDD128 return true - case OpAddPairsUint32x8: - v.Op = OpAMD64VPHADDD256 - return true case OpAddPtr: v.Op = OpAMD64ADDQ return true @@ -3066,19 +3050,25 @@ func rewriteValueAMD64(v *Value) bool { case OpCvtMask32x16to16: return rewriteValueAMD64_OpCvtMask32x16to16(v) case OpCvtMask32x4to8: - return rewriteValueAMD64_OpCvtMask32x4to8(v) + v.Op = OpAMD64VMOVMSKPS128 + return true case OpCvtMask32x8to8: - return 
rewriteValueAMD64_OpCvtMask32x8to8(v) + v.Op = OpAMD64VMOVMSKPS256 + return true case OpCvtMask64x2to8: - return rewriteValueAMD64_OpCvtMask64x2to8(v) + v.Op = OpAMD64VMOVMSKPD128 + return true case OpCvtMask64x4to8: - return rewriteValueAMD64_OpCvtMask64x4to8(v) + v.Op = OpAMD64VMOVMSKPD256 + return true case OpCvtMask64x8to8: return rewriteValueAMD64_OpCvtMask64x8to8(v) case OpCvtMask8x16to16: - return rewriteValueAMD64_OpCvtMask8x16to16(v) + v.Op = OpAMD64VPMOVMSKB128 + return true case OpCvtMask8x32to32: - return rewriteValueAMD64_OpCvtMask8x32to32(v) + v.Op = OpAMD64VPMOVMSKB256 + return true case OpCvtMask8x64to64: return rewriteValueAMD64_OpCvtMask8x64to64(v) case OpDiv128u: @@ -3142,24 +3132,6 @@ func rewriteValueAMD64(v *Value) bool { case OpDotProductPairsSaturatedUint8x64: v.Op = OpAMD64VPMADDUBSW512 return true - case OpDotProductQuadrupleInt32x16: - v.Op = OpAMD64VPDPBUSD512 - return true - case OpDotProductQuadrupleInt32x4: - v.Op = OpAMD64VPDPBUSD128 - return true - case OpDotProductQuadrupleInt32x8: - v.Op = OpAMD64VPDPBUSD256 - return true - case OpDotProductQuadrupleSaturatedInt32x16: - v.Op = OpAMD64VPDPBUSDS512 - return true - case OpDotProductQuadrupleSaturatedInt32x4: - v.Op = OpAMD64VPDPBUSDS128 - return true - case OpDotProductQuadrupleSaturatedInt32x8: - v.Op = OpAMD64VPDPBUSDS256 - return true case OpEq16: return rewriteValueAMD64_OpEq16(v) case OpEq32: @@ -3312,64 +3284,64 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpExpandUint8x32(v) case OpExpandUint8x64: return rewriteValueAMD64_OpExpandUint8x64(v) - case OpExtendLo2ToInt64x2Int16x8: + case OpExtendLo2ToInt64Int16x8: v.Op = OpAMD64VPMOVSXWQ128 return true - case OpExtendLo2ToInt64x2Int32x4: + case OpExtendLo2ToInt64Int32x4: v.Op = OpAMD64VPMOVSXDQ128 return true - case OpExtendLo2ToInt64x2Int8x16: + case OpExtendLo2ToInt64Int8x16: v.Op = OpAMD64VPMOVSXBQ128 return true - case OpExtendLo2ToUint64x2Uint16x8: + case OpExtendLo2ToUint64Uint16x8: v.Op = 
OpAMD64VPMOVZXWQ128 return true - case OpExtendLo2ToUint64x2Uint32x4: + case OpExtendLo2ToUint64Uint32x4: v.Op = OpAMD64VPMOVZXDQ128 return true - case OpExtendLo2ToUint64x2Uint8x16: + case OpExtendLo2ToUint64Uint8x16: v.Op = OpAMD64VPMOVZXBQ128 return true - case OpExtendLo4ToInt32x4Int16x8: + case OpExtendLo4ToInt32Int16x8: v.Op = OpAMD64VPMOVSXWD128 return true - case OpExtendLo4ToInt32x4Int8x16: + case OpExtendLo4ToInt32Int8x16: v.Op = OpAMD64VPMOVSXBD128 return true - case OpExtendLo4ToInt64x4Int16x8: + case OpExtendLo4ToInt64Int16x8: v.Op = OpAMD64VPMOVSXWQ256 return true - case OpExtendLo4ToInt64x4Int8x16: + case OpExtendLo4ToInt64Int8x16: v.Op = OpAMD64VPMOVSXBQ256 return true - case OpExtendLo4ToUint32x4Uint16x8: + case OpExtendLo4ToUint32Uint16x8: v.Op = OpAMD64VPMOVZXWD128 return true - case OpExtendLo4ToUint32x4Uint8x16: + case OpExtendLo4ToUint32Uint8x16: v.Op = OpAMD64VPMOVZXBD128 return true - case OpExtendLo4ToUint64x4Uint16x8: + case OpExtendLo4ToUint64Uint16x8: v.Op = OpAMD64VPMOVZXWQ256 return true - case OpExtendLo4ToUint64x4Uint8x16: + case OpExtendLo4ToUint64Uint8x16: v.Op = OpAMD64VPMOVZXBQ256 return true - case OpExtendLo8ToInt16x8Int8x16: + case OpExtendLo8ToInt16Int8x16: v.Op = OpAMD64VPMOVSXBW128 return true - case OpExtendLo8ToInt32x8Int8x16: + case OpExtendLo8ToInt32Int8x16: v.Op = OpAMD64VPMOVSXBD256 return true - case OpExtendLo8ToInt64x8Int8x16: + case OpExtendLo8ToInt64Int8x16: v.Op = OpAMD64VPMOVSXBQ512 return true - case OpExtendLo8ToUint16x8Uint8x16: + case OpExtendLo8ToUint16Uint8x16: v.Op = OpAMD64VPMOVZXBW128 return true - case OpExtendLo8ToUint32x8Uint8x16: + case OpExtendLo8ToUint32Uint8x16: v.Op = OpAMD64VPMOVZXBD256 return true - case OpExtendLo8ToUint64x8Uint8x16: + case OpExtendLo8ToUint64Uint8x16: v.Op = OpAMD64VPMOVZXBQ512 return true case OpExtendToInt16Int8x16: @@ -3811,18 +3783,18 @@ func rewriteValueAMD64(v *Value) bool { return true case OpIsInBounds: return rewriteValueAMD64_OpIsInBounds(v) - case 
OpIsNanFloat32x16: - return rewriteValueAMD64_OpIsNanFloat32x16(v) - case OpIsNanFloat32x4: - return rewriteValueAMD64_OpIsNanFloat32x4(v) - case OpIsNanFloat32x8: - return rewriteValueAMD64_OpIsNanFloat32x8(v) - case OpIsNanFloat64x2: - return rewriteValueAMD64_OpIsNanFloat64x2(v) - case OpIsNanFloat64x4: - return rewriteValueAMD64_OpIsNanFloat64x4(v) - case OpIsNanFloat64x8: - return rewriteValueAMD64_OpIsNanFloat64x8(v) + case OpIsNaNFloat32x16: + return rewriteValueAMD64_OpIsNaNFloat32x16(v) + case OpIsNaNFloat32x4: + return rewriteValueAMD64_OpIsNaNFloat32x4(v) + case OpIsNaNFloat32x8: + return rewriteValueAMD64_OpIsNaNFloat32x8(v) + case OpIsNaNFloat64x2: + return rewriteValueAMD64_OpIsNaNFloat64x2(v) + case OpIsNaNFloat64x4: + return rewriteValueAMD64_OpIsNaNFloat64x4(v) + case OpIsNaNFloat64x8: + return rewriteValueAMD64_OpIsNaNFloat64x8(v) case OpIsNonNil: return rewriteValueAMD64_OpIsNonNil(v) case OpIsSliceInBounds: @@ -5040,15 +5012,15 @@ func rewriteValueAMD64(v *Value) bool { case OpSHA256TwoRoundsUint32x4: v.Op = OpAMD64SHA256RNDS2128 return true - case OpSaturateToInt16ConcatInt32x16: + case OpSaturateToInt16ConcatGroupedInt32x16: v.Op = OpAMD64VPACKSSDW512 return true + case OpSaturateToInt16ConcatGroupedInt32x8: + v.Op = OpAMD64VPACKSSDW256 + return true case OpSaturateToInt16ConcatInt32x4: v.Op = OpAMD64VPACKSSDW128 return true - case OpSaturateToInt16ConcatInt32x8: - v.Op = OpAMD64VPACKSSDW256 - return true case OpSaturateToInt16Int32x16: v.Op = OpAMD64VPMOVSDW256 return true @@ -5103,15 +5075,15 @@ func rewriteValueAMD64(v *Value) bool { case OpSaturateToInt8Int64x8: v.Op = OpAMD64VPMOVSQB128_512 return true - case OpSaturateToUint16ConcatUint32x16: + case OpSaturateToUint16ConcatGroupedInt32x16: v.Op = OpAMD64VPACKUSDW512 return true - case OpSaturateToUint16ConcatUint32x4: - v.Op = OpAMD64VPACKUSDW128 - return true - case OpSaturateToUint16ConcatUint32x8: + case OpSaturateToUint16ConcatGroupedInt32x8: v.Op = OpAMD64VPACKUSDW256 return true + 
case OpSaturateToUint16ConcatInt32x4: + v.Op = OpAMD64VPACKUSDW128 + return true case OpSaturateToUint16Uint32x16: v.Op = OpAMD64VPMOVUSDW256 return true @@ -5139,32 +5111,32 @@ func rewriteValueAMD64(v *Value) bool { case OpSaturateToUint32Uint64x8: v.Op = OpAMD64VPMOVUSQD256 return true - case OpSaturateToUint8Int16x16: - v.Op = OpAMD64VPMOVSWB128_256 + case OpSaturateToUint8Uint16x16: + v.Op = OpAMD64VPMOVUSWB128_256 return true - case OpSaturateToUint8Int16x8: - v.Op = OpAMD64VPMOVSWB128_128 + case OpSaturateToUint8Uint16x32: + v.Op = OpAMD64VPMOVUSWB256 return true - case OpSaturateToUint8Int32x16: - v.Op = OpAMD64VPMOVSDB128_512 + case OpSaturateToUint8Uint16x8: + v.Op = OpAMD64VPMOVUSWB128_128 return true - case OpSaturateToUint8Int32x4: - v.Op = OpAMD64VPMOVSDB128_128 + case OpSaturateToUint8Uint32x16: + v.Op = OpAMD64VPMOVUSDB128_512 return true - case OpSaturateToUint8Int32x8: - v.Op = OpAMD64VPMOVSDB128_256 + case OpSaturateToUint8Uint32x4: + v.Op = OpAMD64VPMOVUSDB128_128 return true - case OpSaturateToUint8Int64x2: - v.Op = OpAMD64VPMOVSQB128_128 + case OpSaturateToUint8Uint32x8: + v.Op = OpAMD64VPMOVUSDB128_256 return true - case OpSaturateToUint8Int64x4: - v.Op = OpAMD64VPMOVSQB128_256 + case OpSaturateToUint8Uint64x2: + v.Op = OpAMD64VPMOVUSQB128_128 return true - case OpSaturateToUint8Int64x8: - v.Op = OpAMD64VPMOVSQB128_512 + case OpSaturateToUint8Uint64x4: + v.Op = OpAMD64VPMOVUSQB128_256 return true - case OpSaturateToUint8Uint16x32: - v.Op = OpAMD64VPMOVUSWB256 + case OpSaturateToUint8Uint64x8: + v.Op = OpAMD64VPMOVUSQB128_512 return true case OpScaleFloat32x16: v.Op = OpAMD64VSCALEFPS512 @@ -5898,45 +5870,45 @@ func rewriteValueAMD64(v *Value) bool { case OpSubPairsFloat32x4: v.Op = OpAMD64VHSUBPS128 return true - case OpSubPairsFloat32x8: - v.Op = OpAMD64VHSUBPS256 - return true case OpSubPairsFloat64x2: v.Op = OpAMD64VHSUBPD128 return true - case OpSubPairsFloat64x4: + case OpSubPairsGroupedFloat32x8: + v.Op = OpAMD64VHSUBPS256 + return true 
+ case OpSubPairsGroupedFloat64x4: v.Op = OpAMD64VHSUBPD256 return true - case OpSubPairsInt16x16: + case OpSubPairsGroupedInt16x16: + v.Op = OpAMD64VPHSUBW256 + return true + case OpSubPairsGroupedInt32x8: + v.Op = OpAMD64VPHSUBD256 + return true + case OpSubPairsGroupedUint16x16: v.Op = OpAMD64VPHSUBW256 return true + case OpSubPairsGroupedUint32x8: + v.Op = OpAMD64VPHSUBD256 + return true case OpSubPairsInt16x8: v.Op = OpAMD64VPHSUBW128 return true case OpSubPairsInt32x4: v.Op = OpAMD64VPHSUBD128 return true - case OpSubPairsInt32x8: - v.Op = OpAMD64VPHSUBD256 - return true - case OpSubPairsSaturatedInt16x16: + case OpSubPairsSaturatedGroupedInt16x16: v.Op = OpAMD64VPHSUBSW256 return true case OpSubPairsSaturatedInt16x8: v.Op = OpAMD64VPHSUBSW128 return true - case OpSubPairsUint16x16: - v.Op = OpAMD64VPHSUBW256 - return true case OpSubPairsUint16x8: v.Op = OpAMD64VPHSUBW128 return true case OpSubPairsUint32x4: v.Op = OpAMD64VPHSUBD128 return true - case OpSubPairsUint32x8: - v.Op = OpAMD64VPHSUBD256 - return true case OpSubPtr: v.Op = OpAMD64SUBQ return true @@ -28763,7 +28735,7 @@ func rewriteValueAMD64_OpAMD64VCMPPD512(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VCMPPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VCMPPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28779,7 +28751,7 @@ func rewriteValueAMD64_OpAMD64VCMPPD512(v *Value) bool { break } v.reset(OpAMD64VCMPPD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -28792,7 +28764,7 @@ func rewriteValueAMD64_OpAMD64VCMPPDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: 
canMergeLoad(v, l) && clobber(l) - // result: (VCMPPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VCMPPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28809,7 +28781,7 @@ func rewriteValueAMD64_OpAMD64VCMPPDMasked128(v *Value) bool { break } v.reset(OpAMD64VCMPPDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -28822,7 +28794,7 @@ func rewriteValueAMD64_OpAMD64VCMPPDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VCMPPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VCMPPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28839,7 +28811,7 @@ func rewriteValueAMD64_OpAMD64VCMPPDMasked256(v *Value) bool { break } v.reset(OpAMD64VCMPPDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -28852,7 +28824,7 @@ func rewriteValueAMD64_OpAMD64VCMPPDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VCMPPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VCMPPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28869,7 +28841,7 @@ func rewriteValueAMD64_OpAMD64VCMPPDMasked512(v *Value) bool { break } v.reset(OpAMD64VCMPPDMasked512load) - v.AuxInt = 
valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -28881,7 +28853,7 @@ func rewriteValueAMD64_OpAMD64VCMPPS512(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VCMPPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VCMPPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28897,7 +28869,7 @@ func rewriteValueAMD64_OpAMD64VCMPPS512(v *Value) bool { break } v.reset(OpAMD64VCMPPS512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -28910,7 +28882,7 @@ func rewriteValueAMD64_OpAMD64VCMPPSMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPSMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VCMPPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VCMPPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28927,7 +28899,7 @@ func rewriteValueAMD64_OpAMD64VCMPPSMasked128(v *Value) bool { break } v.reset(OpAMD64VCMPPSMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -28940,7 +28912,7 @@ func rewriteValueAMD64_OpAMD64VCMPPSMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPSMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VCMPPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr 
mask mem) + // result: (VCMPPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28957,7 +28929,7 @@ func rewriteValueAMD64_OpAMD64VCMPPSMasked256(v *Value) bool { break } v.reset(OpAMD64VCMPPSMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -28970,7 +28942,7 @@ func rewriteValueAMD64_OpAMD64VCMPPSMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VCMPPSMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VCMPPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VCMPPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -28987,7 +28959,7 @@ func rewriteValueAMD64_OpAMD64VCMPPSMasked512(v *Value) bool { break } v.reset(OpAMD64VCMPPSMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -32605,7 +32577,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB128(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEINVQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32621,7 +32593,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB128(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEINVQB128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = 
symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -32633,7 +32605,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB256(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEINVQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32649,7 +32621,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB256(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEINVQB256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -32661,7 +32633,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB512(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEINVQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32677,7 +32649,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB512(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEINVQB512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -32690,7 +32662,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEINVQBMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: 
(VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32707,7 +32679,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked128(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEINVQBMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -32720,7 +32692,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEINVQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32737,7 +32709,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked256(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEINVQBMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -32750,7 +32722,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEINVQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32767,7 +32739,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked512(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEINVQBMasked512load) - v.AuxInt = 
valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -32779,7 +32751,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB128(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32795,7 +32767,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB128(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEQB128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -32807,7 +32779,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB256(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32823,7 +32795,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB256(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEQB256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -32835,7 +32807,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB512(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEQB512load {sym} 
[makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VGF2P8AFFINEQB512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32851,7 +32823,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB512(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEQB512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -32864,7 +32836,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEQBMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32881,7 +32853,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked128(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEQBMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -32894,7 +32866,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32911,7 +32883,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked256(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEQBMasked256load) - v.AuxInt = 
valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -32924,7 +32896,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VGF2P8AFFINEQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -32941,7 +32913,7 @@ func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked512(v *Value) bool { break } v.reset(OpAMD64VGF2P8AFFINEQBMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -33775,6 +33747,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked128 (VPMOVUSWB128_128 x) mask) + // result: (VPMOVUSWBMasked128_128 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSWB128_128 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSWBMasked128_128) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked128 (VPSHLDW128 [a] x y) mask) // result: (VPSHLDWMasked128 [a] x y mask) for { @@ -34327,6 +34311,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked256 (VPMOVUSWB128_256 x) mask) + // result: (VPMOVUSWBMasked128_256 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSWB128_256 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSWBMasked128_256) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked256 (VPMOVUSWB256 x) mask) // result: 
(VPMOVUSWBMasked256 x mask) for { @@ -35294,34 +35290,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU32Masked128 (VPDPBUSD128 x y z) mask) - // result: (VPDPBUSDMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPDPBUSD128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPDPBUSDMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked128 (VPDPBUSDS128 x y z) mask) - // result: (VPDPBUSDSMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPDPBUSDS128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPDPBUSDSMasked128) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU32Masked128 (VPMOVSXDQ128 x) mask) // result: (VPMOVSXDQMasked128 x mask) for { @@ -35607,6 +35575,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked128 (VPMOVUSDB128_128 x) mask) + // result: (VPMOVUSDBMasked128_128 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSDB128_128 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSDBMasked128_128) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) // result: (VPACKUSDWMasked128 x y mask) for { @@ -36129,34 +36109,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU32Masked256 (VPDPBUSD256 x y z) mask) - // result: (VPDPBUSDMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPDPBUSD256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPDPBUSDMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked256 (VPDPBUSDS256 x y z) mask) - // result: (VPDPBUSDSMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPDPBUSDS256 { - break - } - z := 
v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPDPBUSDSMasked256) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU32Masked256 (VPMOVSXDQ256 x) mask) // result: (VPMOVSXDQMasked256 x mask) for { @@ -36480,6 +36432,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked256 (VPMOVUSDB128_256 x) mask) + // result: (VPMOVUSDBMasked128_256 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSDB128_256 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSDBMasked128_256) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) // result: (VPACKUSDWMasked256 x y mask) for { @@ -37052,34 +37016,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) - // result: (VPDPBUSDMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPDPBUSD512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPDPBUSDMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) - // result: (VPDPBUSDSMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPDPBUSDS512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPDPBUSDSMasked512) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU32Masked512 (VPMOVSXDQ512 x) mask) // result: (VPMOVSXDQMasked512 x mask) for { @@ -37416,6 +37352,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked512 (VPMOVUSDB128_512 x) mask) + // result: (VPMOVUSDBMasked128_512 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSDB128_512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSDBMasked128_512) + v.AddArg2(x, mask) + return true + 
} // match: (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) // result: (VPACKUSDWMasked512 x y mask) for { @@ -38259,6 +38207,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) + // result: (VPMOVUSQBMasked128_128 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSQB128_128 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSQBMasked128_128) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU64Masked128 (VPMOVUSQW128_128 x) mask) // result: (VPMOVUSQWMasked128_128 x mask) for { @@ -39100,6 +39060,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) + // result: (VPMOVUSQBMasked128_256 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSQB128_256 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSQBMasked128_256) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU64Masked256 (VPMOVUSQW128_256 x) mask) // result: (VPMOVUSQWMasked128_256 x mask) for { @@ -39920,6 +39892,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) + // result: (VPMOVUSQBMasked128_512 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSQB128_512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSQBMasked128_512) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU64Masked512 (VPMOVUSQW128_512 x) mask) // result: (VPMOVUSQWMasked128_512 x mask) for { @@ -42407,151 +42391,9 @@ func rewriteValueAMD64_OpAMD64VPACKUSDWMasked512(v *Value) bool { } return false } -func rewriteValueAMD64_OpAMD64VPADDD128(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPADDD128 (VPDPBUSD128 (Zero128 <t>) x y) z) - // result: (VPDPBUSD128 <t> z x y) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - 
if v_0.Op != OpAMD64VPDPBUSD128 { - continue - } - y := v_0.Args[2] - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpAMD64Zero128 { - continue - } - t := v_0_0.Type - x := v_0.Args[1] - z := v_1 - v.reset(OpAMD64VPDPBUSD128) - v.Type = t - v.AddArg3(z, x, y) - return true - } - break - } - // match: (VPADDD128 (VPDPBUSDS128 (Zero128 <t>) x y) z) - // result: (VPDPBUSDS128 <t> z x y) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpAMD64VPDPBUSDS128 { - continue - } - y := v_0.Args[2] - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpAMD64Zero128 { - continue - } - t := v_0_0.Type - x := v_0.Args[1] - z := v_1 - v.reset(OpAMD64VPDPBUSDS128) - v.Type = t - v.AddArg3(z, x, y) - return true - } - break - } - return false -} -func rewriteValueAMD64_OpAMD64VPADDD256(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPADDD256 (VPDPBUSD256 (Zero256 <t>) x y) z) - // result: (VPDPBUSD256 <t> z x y) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpAMD64VPDPBUSD256 { - continue - } - y := v_0.Args[2] - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpAMD64Zero256 { - continue - } - t := v_0_0.Type - x := v_0.Args[1] - z := v_1 - v.reset(OpAMD64VPDPBUSD256) - v.Type = t - v.AddArg3(z, x, y) - return true - } - break - } - // match: (VPADDD256 (VPDPBUSDS256 (Zero256 <t>) x y) z) - // result: (VPDPBUSDS256 <t> z x y) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpAMD64VPDPBUSDS256 { - continue - } - y := v_0.Args[2] - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpAMD64Zero256 { - continue - } - t := v_0_0.Type - x := v_0.Args[1] - z := v_1 - v.reset(OpAMD64VPDPBUSDS256) - v.Type = t - v.AddArg3(z, x, y) - return true - } - break - } - return false -} func rewriteValueAMD64_OpAMD64VPADDD512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPADDD512 (VPDPBUSD512 (Zero512 <t>) x y) z) - // result: (VPDPBUSD512 <t> z x y) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = 
_i0+1, v_1, v_0 { - if v_0.Op != OpAMD64VPDPBUSD512 { - continue - } - y := v_0.Args[2] - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpAMD64Zero512 { - continue - } - t := v_0_0.Type - x := v_0.Args[1] - z := v_1 - v.reset(OpAMD64VPDPBUSD512) - v.Type = t - v.AddArg3(z, x, y) - return true - } - break - } - // match: (VPADDD512 (VPDPBUSDS512 (Zero512 <t>) x y) z) - // result: (VPDPBUSDS512 <t> z x y) - for { - for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { - if v_0.Op != OpAMD64VPDPBUSDS512 { - continue - } - y := v_0.Args[2] - v_0_0 := v_0.Args[0] - if v_0_0.Op != OpAMD64Zero512 { - continue - } - t := v_0_0.Type - x := v_0.Args[1] - z := v_1 - v.reset(OpAMD64VPDPBUSDS512) - v.Type = t - v.AddArg3(z, x, y) - return true - } - break - } // match: (VPADDD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) // result: (VPADDD512load {sym} [off] x ptr mem) @@ -44109,6 +43951,19 @@ func rewriteValueAMD64_OpAMD64VPBLENDMDMasked512(v *Value) bool { v.AddArg3(dst, x, mask) return true } + // match: (VPBLENDMDMasked512 dst (VPMOVUSDB128_512 x) mask) + // result: (VPMOVUSDBMasked128_512Merging dst x mask) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSDB128_512 { + break + } + x := v_1.Args[0] + mask := v_2 + v.reset(OpAMD64VPMOVUSDBMasked128_512Merging) + v.AddArg3(dst, x, mask) + return true + } // match: (VPBLENDMDMasked512 dst (VPMOVUSDW256 x) mask) // result: (VPMOVUSDWMasked256Merging dst x mask) for { @@ -44869,6 +44724,19 @@ func rewriteValueAMD64_OpAMD64VPBLENDMQMasked512(v *Value) bool { v.AddArg3(dst, x, mask) return true } + // match: (VPBLENDMQMasked512 dst (VPMOVUSQB128_512 x) mask) + // result: (VPMOVUSQBMasked128_512Merging dst x mask) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSQB128_512 { + break + } + x := v_1.Args[0] + mask := v_2 + v.reset(OpAMD64VPMOVUSQBMasked128_512Merging) + v.AddArg3(dst, x, mask) + return true + } // match: (VPBLENDMQMasked512 dst (VPMOVUSQD256 x) mask) // result: 
(VPMOVUSQDMasked256Merging dst x mask) for { @@ -47797,6 +47665,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPMOVUSDB128_128 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSDBMasked128_128Merging dst x (VPMOVVec32x4ToM <types.TypeMask> mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSDB128_128 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSDBMasked128_128Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPMOVUSDW128_128 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSDWMasked128_128Merging dst x (VPMOVVec32x4ToM <types.TypeMask> mask)) @@ -47816,6 +47703,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPMOVUSQB128_128 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSQBMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSQB128_128 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSQBMasked128_128Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPMOVUSQD128_128 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSQDMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask)) @@ -47854,6 +47760,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPMOVUSWB128_128 x) mask) + // cond: 
v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSWBMasked128_128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSWB128_128 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSWBMasked128_128Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPMOVWB128_128 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVWBMasked128_128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask)) @@ -50990,6 +50915,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPMOVUSDB128_256 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSDBMasked128_256Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSDB128_256 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSDBMasked128_256Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPMOVUSDW128_256 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSDWMasked128_256Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask)) @@ -51009,6 +50953,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPMOVUSQB128_256 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSQBMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSQB128_256 { + break + } + x := v_1.Args[0] + mask := v_2 + if 
!(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSQBMasked128_256Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPMOVUSQD128_256 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSQDMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask)) @@ -51047,6 +51010,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPMOVUSWB128_256 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSWBMasked128_256Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSWB128_256 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSWBMasked128_256Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPMOVWB128_256 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVWBMasked128_256Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask)) @@ -52553,7 +52535,7 @@ func rewriteValueAMD64_OpAMD64VPCMPD512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPCMPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52569,7 +52551,7 @@ func rewriteValueAMD64_OpAMD64VPCMPD512(v *Value) bool { break } v.reset(OpAMD64VPCMPD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, 
mem) return true @@ -52582,7 +52564,7 @@ func rewriteValueAMD64_OpAMD64VPCMPDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52599,7 +52581,7 @@ func rewriteValueAMD64_OpAMD64VPCMPDMasked128(v *Value) bool { break } v.reset(OpAMD64VPCMPDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52612,7 +52594,7 @@ func rewriteValueAMD64_OpAMD64VPCMPDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52629,7 +52611,7 @@ func rewriteValueAMD64_OpAMD64VPCMPDMasked256(v *Value) bool { break } v.reset(OpAMD64VPCMPDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52642,7 +52624,7 @@ func rewriteValueAMD64_OpAMD64VPCMPDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c 
:= auxIntToUint8(v.AuxInt) x := v_0 @@ -52659,7 +52641,7 @@ func rewriteValueAMD64_OpAMD64VPCMPDMasked512(v *Value) bool { break } v.reset(OpAMD64VPCMPDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52785,7 +52767,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQ512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPCMPQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52801,7 +52783,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQ512(v *Value) bool { break } v.reset(OpAMD64VPCMPQ512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -52814,7 +52796,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52831,7 +52813,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQMasked128(v *Value) bool { break } v.reset(OpAMD64VPCMPQMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52844,7 +52826,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQMasked256(v *Value) bool { v_0 := v.Args[0] // match: 
(VPCMPQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52861,7 +52843,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQMasked256(v *Value) bool { break } v.reset(OpAMD64VPCMPQMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52874,7 +52856,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52891,7 +52873,7 @@ func rewriteValueAMD64_OpAMD64VPCMPQMasked512(v *Value) bool { break } v.reset(OpAMD64VPCMPQMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52903,7 +52885,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUD512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPUD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPCMPUD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52919,7 +52901,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUD512(v *Value) bool { break } v.reset(OpAMD64VPCMPUD512load) - v.AuxInt 
= valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -52932,7 +52914,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPUDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPUDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52949,7 +52931,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUDMasked128(v *Value) bool { break } v.reset(OpAMD64VPCMPUDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52962,7 +52944,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPUDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPUDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -52979,7 +52961,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUDMasked256(v *Value) bool { break } v.reset(OpAMD64VPCMPUDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -52992,7 +52974,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: 
(VPCMPUDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPUDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -53009,7 +52991,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUDMasked512(v *Value) bool { break } v.reset(OpAMD64VPCMPUDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -53021,7 +53003,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUQ512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPUQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPCMPUQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -53037,7 +53019,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUQ512(v *Value) bool { break } v.reset(OpAMD64VPCMPUQ512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -53050,7 +53032,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUQMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPUQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPUQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -53067,7 +53049,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUQMasked128(v *Value) bool { break } v.reset(OpAMD64VPCMPUQMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), 
off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -53080,7 +53062,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUQMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPUQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPUQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -53097,7 +53079,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUQMasked256(v *Value) bool { break } v.reset(OpAMD64VPCMPUQMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -53110,7 +53092,7 @@ func rewriteValueAMD64_OpAMD64VPCMPUQMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPCMPUQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPCMPUQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPCMPUQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -53127,257 +53109,13 @@ func rewriteValueAMD64_OpAMD64VPCMPUQMasked512(v *Value) bool { break } v.reset(OpAMD64VPCMPUQMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true } return false } -func rewriteValueAMD64_OpAMD64VPDPBUSD512(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPBUSD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSD512load {sym} [off] x y ptr mem) - for { - x := v_0 - y := v_1 - l := v_2 - if 
l.Op != OpAMD64VMOVDQUload512 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSD512load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg4(x, y, ptr, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPBUSDMasked128(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPBUSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSDMasked128load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload128 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSDMasked128load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, ptr, mask, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPBUSDMasked256(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPBUSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSDMasked256load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload256 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSDMasked256load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, ptr, mask, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPBUSDMasked512(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - 
// match: (VPDPBUSDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSDMasked512load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload512 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSDMasked512load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, ptr, mask, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPBUSDS512(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPBUSDS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSDS512load {sym} [off] x y ptr mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload512 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSDS512load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg4(x, y, ptr, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPBUSDSMasked128(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPBUSDSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSDSMasked128load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload128 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSDSMasked128load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, 
ptr, mask, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPBUSDSMasked256(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPBUSDSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSDSMasked256load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload256 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSDSMasked256load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, ptr, mask, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPBUSDSMasked512(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPBUSDSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPBUSDSMasked512load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload512 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPBUSDSMasked512load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, ptr, mask, mem) - return true - } - return false -} func rewriteValueAMD64_OpAMD64VPDPWSSD512(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -57040,9 +56778,173 @@ func rewriteValueAMD64_OpAMD64VPOPCNTQMasked512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VPOR128(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPOR128 (VCMPPS128 [3] x x) (VCMPPS128 [3] y y)) + // result: (VCMPPS128 [3] x y) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, 
v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpAMD64VCMPPS128 || auxIntToUint8(v_0.AuxInt) != 3 { + continue + } + x := v_0.Args[1] + if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPS128 || auxIntToUint8(v_1.AuxInt) != 3 { + continue + } + y := v_1.Args[1] + if y != v_1.Args[0] { + continue + } + v.reset(OpAMD64VCMPPS128) + v.AuxInt = uint8ToAuxInt(3) + v.AddArg2(x, y) + return true + } + break + } + // match: (VPOR128 (VCMPPD128 [3] x x) (VCMPPD128 [3] y y)) + // result: (VCMPPD128 [3] x y) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpAMD64VCMPPD128 || auxIntToUint8(v_0.AuxInt) != 3 { + continue + } + x := v_0.Args[1] + if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPD128 || auxIntToUint8(v_1.AuxInt) != 3 { + continue + } + y := v_1.Args[1] + if y != v_1.Args[0] { + continue + } + v.reset(OpAMD64VCMPPD128) + v.AuxInt = uint8ToAuxInt(3) + v.AddArg2(x, y) + return true + } + break + } + return false +} +func rewriteValueAMD64_OpAMD64VPOR256(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPOR256 (VCMPPS256 [3] x x) (VCMPPS256 [3] y y)) + // result: (VCMPPS256 [3] x y) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpAMD64VCMPPS256 || auxIntToUint8(v_0.AuxInt) != 3 { + continue + } + x := v_0.Args[1] + if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPS256 || auxIntToUint8(v_1.AuxInt) != 3 { + continue + } + y := v_1.Args[1] + if y != v_1.Args[0] { + continue + } + v.reset(OpAMD64VCMPPS256) + v.AuxInt = uint8ToAuxInt(3) + v.AddArg2(x, y) + return true + } + break + } + // match: (VPOR256 (VCMPPD256 [3] x x) (VCMPPD256 [3] y y)) + // result: (VCMPPD256 [3] x y) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpAMD64VCMPPD256 || auxIntToUint8(v_0.AuxInt) != 3 { + continue + } + x := v_0.Args[1] + if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPD256 || auxIntToUint8(v_1.AuxInt) != 3 { + continue + } + y := v_1.Args[1] + if y != v_1.Args[0] { + continue + } + 
v.reset(OpAMD64VCMPPD256) + v.AuxInt = uint8ToAuxInt(3) + v.AddArg2(x, y) + return true + } + break + } + return false +} func rewriteValueAMD64_OpAMD64VPORD512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types + // match: (VPORD512 (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) (VPMOVMToVec32x16 (VCMPPS512 [3] y y))) + // result: (VPMOVMToVec32x16 (VCMPPS512 [3] x y)) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpAMD64VPMOVMToVec32x16 { + continue + } + v_0_0 := v_0.Args[0] + if v_0_0.Op != OpAMD64VCMPPS512 || auxIntToUint8(v_0_0.AuxInt) != 3 { + continue + } + x := v_0_0.Args[1] + if x != v_0_0.Args[0] || v_1.Op != OpAMD64VPMOVMToVec32x16 { + continue + } + v_1_0 := v_1.Args[0] + if v_1_0.Op != OpAMD64VCMPPS512 || auxIntToUint8(v_1_0.AuxInt) != 3 { + continue + } + y := v_1_0.Args[1] + if y != v_1_0.Args[0] { + continue + } + v.reset(OpAMD64VPMOVMToVec32x16) + v0 := b.NewValue0(v.Pos, OpAMD64VCMPPS512, typ.Mask) + v0.AuxInt = uint8ToAuxInt(3) + v0.AddArg2(x, y) + v.AddArg(v0) + return true + } + break + } + // match: (VPORD512 (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) (VPMOVMToVec64x8 (VCMPPD512 [3] y y))) + // result: (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpAMD64VPMOVMToVec64x8 { + continue + } + v_0_0 := v_0.Args[0] + if v_0_0.Op != OpAMD64VCMPPD512 || auxIntToUint8(v_0_0.AuxInt) != 3 { + continue + } + x := v_0_0.Args[1] + if x != v_0_0.Args[0] || v_1.Op != OpAMD64VPMOVMToVec64x8 { + continue + } + v_1_0 := v_1.Args[0] + if v_1_0.Op != OpAMD64VCMPPD512 || auxIntToUint8(v_1_0.AuxInt) != 3 { + continue + } + y := v_1_0.Args[1] + if y != v_1_0.Args[0] { + continue + } + v.reset(OpAMD64VPMOVMToVec64x8) + v0 := b.NewValue0(v.Pos, OpAMD64VCMPPD512, typ.Mask) + v0.AuxInt = uint8ToAuxInt(3) + v0.AddArg2(x, y) + v.AddArg(v0) + return true + } + break + } // match: (VPORD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) 
// cond: canMergeLoad(v, l) && clobber(l) // result: (VPORD512load {sym} [off] x ptr mem) @@ -57296,7 +57198,7 @@ func rewriteValueAMD64_OpAMD64VPROLD128(v *Value) bool { v_0 := v.Args[0] // match: (VPROLD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPROLD128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57311,7 +57213,7 @@ func rewriteValueAMD64_OpAMD64VPROLD128(v *Value) bool { break } v.reset(OpAMD64VPROLD128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -57322,7 +57224,7 @@ func rewriteValueAMD64_OpAMD64VPROLD256(v *Value) bool { v_0 := v.Args[0] // match: (VPROLD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPROLD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57337,7 +57239,7 @@ func rewriteValueAMD64_OpAMD64VPROLD256(v *Value) bool { break } v.reset(OpAMD64VPROLD256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -57348,7 +57250,7 @@ func rewriteValueAMD64_OpAMD64VPROLD512(v *Value) bool { v_0 := v.Args[0] // match: (VPROLD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPROLD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57363,7 +57265,7 @@ func rewriteValueAMD64_OpAMD64VPROLD512(v 
*Value) bool { break } v.reset(OpAMD64VPROLD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -57375,7 +57277,7 @@ func rewriteValueAMD64_OpAMD64VPROLDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPROLDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPROLDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57391,7 +57293,7 @@ func rewriteValueAMD64_OpAMD64VPROLDMasked128(v *Value) bool { break } v.reset(OpAMD64VPROLDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -57403,7 +57305,7 @@ func rewriteValueAMD64_OpAMD64VPROLDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPROLDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPROLDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57419,7 +57321,7 @@ func rewriteValueAMD64_OpAMD64VPROLDMasked256(v *Value) bool { break } v.reset(OpAMD64VPROLDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -57431,7 +57333,7 @@ func rewriteValueAMD64_OpAMD64VPROLDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPROLDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && 
clobber(l) - // result: (VPROLDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPROLDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57447,7 +57349,7 @@ func rewriteValueAMD64_OpAMD64VPROLDMasked512(v *Value) bool { break } v.reset(OpAMD64VPROLDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -57458,7 +57360,7 @@ func rewriteValueAMD64_OpAMD64VPROLQ128(v *Value) bool { v_0 := v.Args[0] // match: (VPROLQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLQ128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPROLQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57473,7 +57375,7 @@ func rewriteValueAMD64_OpAMD64VPROLQ128(v *Value) bool { break } v.reset(OpAMD64VPROLQ128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -57484,7 +57386,7 @@ func rewriteValueAMD64_OpAMD64VPROLQ256(v *Value) bool { v_0 := v.Args[0] // match: (VPROLQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLQ256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPROLQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57499,7 +57401,7 @@ func rewriteValueAMD64_OpAMD64VPROLQ256(v *Value) bool { break } v.reset(OpAMD64VPROLQ256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ 
-57510,7 +57412,7 @@ func rewriteValueAMD64_OpAMD64VPROLQ512(v *Value) bool { v_0 := v.Args[0] // match: (VPROLQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLQ512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPROLQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57525,7 +57427,7 @@ func rewriteValueAMD64_OpAMD64VPROLQ512(v *Value) bool { break } v.reset(OpAMD64VPROLQ512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -57537,7 +57439,7 @@ func rewriteValueAMD64_OpAMD64VPROLQMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPROLQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPROLQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57553,7 +57455,7 @@ func rewriteValueAMD64_OpAMD64VPROLQMasked128(v *Value) bool { break } v.reset(OpAMD64VPROLQMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -57565,7 +57467,7 @@ func rewriteValueAMD64_OpAMD64VPROLQMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPROLQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPROLQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57581,7 +57483,7 @@ func 
rewriteValueAMD64_OpAMD64VPROLQMasked256(v *Value) bool { break } v.reset(OpAMD64VPROLQMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -57593,7 +57495,7 @@ func rewriteValueAMD64_OpAMD64VPROLQMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPROLQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPROLQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPROLQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57609,7 +57511,7 @@ func rewriteValueAMD64_OpAMD64VPROLQMasked512(v *Value) bool { break } v.reset(OpAMD64VPROLQMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -57956,7 +57858,7 @@ func rewriteValueAMD64_OpAMD64VPRORD128(v *Value) bool { v_0 := v.Args[0] // match: (VPRORD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPRORD128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57971,7 +57873,7 @@ func rewriteValueAMD64_OpAMD64VPRORD128(v *Value) bool { break } v.reset(OpAMD64VPRORD128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -57982,7 +57884,7 @@ func rewriteValueAMD64_OpAMD64VPRORD256(v *Value) bool { v_0 := v.Args[0] // match: (VPRORD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // 
result: (VPRORD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPRORD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -57997,7 +57899,7 @@ func rewriteValueAMD64_OpAMD64VPRORD256(v *Value) bool { break } v.reset(OpAMD64VPRORD256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -58008,7 +57910,7 @@ func rewriteValueAMD64_OpAMD64VPRORD512(v *Value) bool { v_0 := v.Args[0] // match: (VPRORD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPRORD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58023,7 +57925,7 @@ func rewriteValueAMD64_OpAMD64VPRORD512(v *Value) bool { break } v.reset(OpAMD64VPRORD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -58035,7 +57937,7 @@ func rewriteValueAMD64_OpAMD64VPRORDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPRORDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPRORDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58051,7 +57953,7 @@ func rewriteValueAMD64_OpAMD64VPRORDMasked128(v *Value) bool { break } v.reset(OpAMD64VPRORDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ 
-58063,7 +57965,7 @@ func rewriteValueAMD64_OpAMD64VPRORDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPRORDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPRORDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58079,7 +57981,7 @@ func rewriteValueAMD64_OpAMD64VPRORDMasked256(v *Value) bool { break } v.reset(OpAMD64VPRORDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -58091,7 +57993,7 @@ func rewriteValueAMD64_OpAMD64VPRORDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPRORDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPRORDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58107,7 +58009,7 @@ func rewriteValueAMD64_OpAMD64VPRORDMasked512(v *Value) bool { break } v.reset(OpAMD64VPRORDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -58118,7 +58020,7 @@ func rewriteValueAMD64_OpAMD64VPRORQ128(v *Value) bool { v_0 := v.Args[0] // match: (VPRORQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORQ128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPRORQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58133,7 +58035,7 @@ func 
rewriteValueAMD64_OpAMD64VPRORQ128(v *Value) bool { break } v.reset(OpAMD64VPRORQ128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -58144,7 +58046,7 @@ func rewriteValueAMD64_OpAMD64VPRORQ256(v *Value) bool { v_0 := v.Args[0] // match: (VPRORQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORQ256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPRORQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58159,7 +58061,7 @@ func rewriteValueAMD64_OpAMD64VPRORQ256(v *Value) bool { break } v.reset(OpAMD64VPRORQ256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -58170,7 +58072,7 @@ func rewriteValueAMD64_OpAMD64VPRORQ512(v *Value) bool { v_0 := v.Args[0] // match: (VPRORQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORQ512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPRORQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58185,7 +58087,7 @@ func rewriteValueAMD64_OpAMD64VPRORQ512(v *Value) bool { break } v.reset(OpAMD64VPRORQ512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -58197,7 +58099,7 @@ func rewriteValueAMD64_OpAMD64VPRORQMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPRORQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORQMasked128load {sym} 
[makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPRORQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58213,7 +58115,7 @@ func rewriteValueAMD64_OpAMD64VPRORQMasked128(v *Value) bool { break } v.reset(OpAMD64VPRORQMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -58225,7 +58127,7 @@ func rewriteValueAMD64_OpAMD64VPRORQMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPRORQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPRORQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58241,7 +58143,7 @@ func rewriteValueAMD64_OpAMD64VPRORQMasked256(v *Value) bool { break } v.reset(OpAMD64VPRORQMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -58253,7 +58155,7 @@ func rewriteValueAMD64_OpAMD64VPRORQMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPRORQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPRORQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPRORQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -58269,7 +58171,7 @@ func rewriteValueAMD64_OpAMD64VPRORQMasked512(v *Value) bool { break } v.reset(OpAMD64VPRORQMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) 
v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -58617,7 +58519,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDD128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDD128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHLDD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58633,7 +58535,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDD128(v *Value) bool { break } v.reset(OpAMD64VPSHLDD128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -58645,7 +58547,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDD256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDD256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHLDD256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58661,7 +58563,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDD256(v *Value) bool { break } v.reset(OpAMD64VPSHLDD256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -58673,7 +58575,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDD512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHLDD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58689,7 +58591,7 @@ func 
rewriteValueAMD64_OpAMD64VPSHLDD512(v *Value) bool { break } v.reset(OpAMD64VPSHLDD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -58702,7 +58604,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHLDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58719,7 +58621,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDDMasked128(v *Value) bool { break } v.reset(OpAMD64VPSHLDDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -58732,7 +58634,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHLDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58749,7 +58651,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDDMasked256(v *Value) bool { break } v.reset(OpAMD64VPSHLDDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -58762,7 +58664,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDDMasked512 [c] x 
l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHLDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58779,7 +58681,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDDMasked512(v *Value) bool { break } v.reset(OpAMD64VPSHLDDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -58791,7 +58693,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQ128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHLDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58807,7 +58709,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQ128(v *Value) bool { break } v.reset(OpAMD64VPSHLDQ128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -58819,7 +58721,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQ256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHLDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58835,7 +58737,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQ256(v *Value) bool { break } v.reset(OpAMD64VPSHLDQ256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = 
valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -58847,7 +58749,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQ512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHLDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58863,7 +58765,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQ512(v *Value) bool { break } v.reset(OpAMD64VPSHLDQ512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -58876,7 +58778,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHLDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58893,7 +58795,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQMasked128(v *Value) bool { break } v.reset(OpAMD64VPSHLDQMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -58906,7 +58808,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHLDQMasked256load {sym} 
[makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58923,7 +58825,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQMasked256(v *Value) bool { break } v.reset(OpAMD64VPSHLDQMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -58936,7 +58838,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHLDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHLDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHLDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -58953,7 +58855,7 @@ func rewriteValueAMD64_OpAMD64VPSHLDQMasked512(v *Value) bool { break } v.reset(OpAMD64VPSHLDQMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -59325,7 +59227,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDD128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDD128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHRDD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59341,7 +59243,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDD128(v *Value) bool { break } v.reset(OpAMD64VPSHRDD128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -59353,7 +59255,7 @@ func 
rewriteValueAMD64_OpAMD64VPSHRDD256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDD256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHRDD256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59369,7 +59271,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDD256(v *Value) bool { break } v.reset(OpAMD64VPSHRDD256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -59381,7 +59283,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDD512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHRDD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59397,7 +59299,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDD512(v *Value) bool { break } v.reset(OpAMD64VPSHRDD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -59410,7 +59312,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHRDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59427,7 +59329,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDDMasked128(v *Value) bool { break } 
v.reset(OpAMD64VPSHRDDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -59440,7 +59342,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHRDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59457,7 +59359,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDDMasked256(v *Value) bool { break } v.reset(OpAMD64VPSHRDDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -59470,7 +59372,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHRDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59487,7 +59389,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDDMasked512(v *Value) bool { break } v.reset(OpAMD64VPSHRDDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -59499,7 +59401,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQ128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: 
canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHRDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59515,7 +59417,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQ128(v *Value) bool { break } v.reset(OpAMD64VPSHRDQ128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -59527,7 +59429,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQ256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHRDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59543,7 +59445,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQ256(v *Value) bool { break } v.reset(OpAMD64VPSHRDQ256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -59555,7 +59457,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQ512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VPSHRDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59571,7 +59473,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQ512(v *Value) bool { break } v.reset(OpAMD64VPSHRDQ512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, 
mem) return true @@ -59584,7 +59486,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHRDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59601,7 +59503,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQMasked128(v *Value) bool { break } v.reset(OpAMD64VPSHRDQMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -59614,7 +59516,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHRDQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59631,7 +59533,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQMasked256(v *Value) bool { break } v.reset(OpAMD64VPSHRDQMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -59644,7 +59546,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHRDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHRDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem) + // result: (VPSHRDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr 
mask mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -59661,7 +59563,7 @@ func rewriteValueAMD64_OpAMD64VPSHRDQMasked512(v *Value) bool { break } v.reset(OpAMD64VPSHRDQMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, ptr, mask, mem) return true @@ -60032,7 +59934,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFD512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSHUFD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60047,7 +59949,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFD512(v *Value) bool { break } v.reset(OpAMD64VPSHUFD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -60059,7 +59961,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSHUFDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60075,7 +59977,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFDMasked128(v *Value) bool { break } v.reset(OpAMD64VPSHUFDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60087,7 +59989,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFDMasked256(v *Value) bool { v_0 := v.Args[0] // 
match: (VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSHUFDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60103,7 +60005,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFDMasked256(v *Value) bool { break } v.reset(OpAMD64VPSHUFDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60115,7 +60017,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSHUFDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60131,7 +60033,7 @@ func rewriteValueAMD64_OpAMD64VPSHUFDMasked512(v *Value) bool { break } v.reset(OpAMD64VPSHUFDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60196,7 +60098,7 @@ func rewriteValueAMD64_OpAMD64VPSLLD512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSLLD512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60211,7 +60113,7 @@ func rewriteValueAMD64_OpAMD64VPSLLD512const(v *Value) bool { break } 
v.reset(OpAMD64VPSLLD512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -60243,7 +60145,7 @@ func rewriteValueAMD64_OpAMD64VPSLLDMasked128const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSLLDMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60259,7 +60161,7 @@ func rewriteValueAMD64_OpAMD64VPSLLDMasked128const(v *Value) bool { break } v.reset(OpAMD64VPSLLDMasked128constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60291,7 +60193,7 @@ func rewriteValueAMD64_OpAMD64VPSLLDMasked256const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSLLDMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60307,7 +60209,7 @@ func rewriteValueAMD64_OpAMD64VPSLLDMasked256const(v *Value) bool { break } v.reset(OpAMD64VPSLLDMasked256constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60339,7 +60241,7 @@ func rewriteValueAMD64_OpAMD64VPSLLDMasked512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLDMasked512const [c] l:(VMOVDQUload512 {sym} 
[off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSLLDMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60355,7 +60257,7 @@ func rewriteValueAMD64_OpAMD64VPSLLDMasked512const(v *Value) bool { break } v.reset(OpAMD64VPSLLDMasked512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60420,7 +60322,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQ512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSLLQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60435,7 +60337,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQ512const(v *Value) bool { break } v.reset(OpAMD64VPSLLQ512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -60467,7 +60369,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQMasked128const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSLLQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60483,7 +60385,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQMasked128const(v *Value) bool { break } v.reset(OpAMD64VPSLLQMasked128constload) - v.AuxInt 
= valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60515,7 +60417,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQMasked256const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSLLQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60531,7 +60433,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQMasked256const(v *Value) bool { break } v.reset(OpAMD64VPSLLQMasked256constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60563,7 +60465,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQMasked512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSLLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSLLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSLLQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -60579,7 +60481,7 @@ func rewriteValueAMD64_OpAMD64VPSLLQMasked512const(v *Value) bool { break } v.reset(OpAMD64VPSLLQMasked512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -60986,7 +60888,7 @@ func rewriteValueAMD64_OpAMD64VPSRAD512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRAD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && 
clobber(l) - // result: (VPSRAD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSRAD512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61001,7 +60903,7 @@ func rewriteValueAMD64_OpAMD64VPSRAD512const(v *Value) bool { break } v.reset(OpAMD64VPSRAD512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -61033,7 +60935,7 @@ func rewriteValueAMD64_OpAMD64VPSRADMasked128const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRADMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRADMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRADMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61049,7 +60951,7 @@ func rewriteValueAMD64_OpAMD64VPSRADMasked128const(v *Value) bool { break } v.reset(OpAMD64VPSRADMasked128constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61081,7 +60983,7 @@ func rewriteValueAMD64_OpAMD64VPSRADMasked256const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRADMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRADMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRADMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61097,7 +60999,7 @@ func rewriteValueAMD64_OpAMD64VPSRADMasked256const(v *Value) bool { break } v.reset(OpAMD64VPSRADMasked256constload) - v.AuxInt = 
valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61129,7 +61031,7 @@ func rewriteValueAMD64_OpAMD64VPSRADMasked512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRADMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRADMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRADMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61145,7 +61047,7 @@ func rewriteValueAMD64_OpAMD64VPSRADMasked512const(v *Value) bool { break } v.reset(OpAMD64VPSRADMasked512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61174,7 +61076,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQ128const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRAQ128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRAQ128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSRAQ128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61189,7 +61091,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQ128const(v *Value) bool { break } v.reset(OpAMD64VPSRAQ128constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -61218,7 +61120,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQ256const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRAQ256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRAQ256constload {sym} 
[makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSRAQ256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61233,7 +61135,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQ256const(v *Value) bool { break } v.reset(OpAMD64VPSRAQ256constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -61262,7 +61164,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQ512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRAQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRAQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSRAQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61277,7 +61179,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQ512const(v *Value) bool { break } v.reset(OpAMD64VPSRAQ512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -61309,7 +61211,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQMasked128const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRAQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRAQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRAQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61325,7 +61227,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQMasked128const(v *Value) bool { break } v.reset(OpAMD64VPSRAQMasked128constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = 
symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61357,7 +61259,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQMasked256const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRAQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRAQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRAQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61373,7 +61275,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQMasked256const(v *Value) bool { break } v.reset(OpAMD64VPSRAQMasked256constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61405,7 +61307,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQMasked512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRAQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRAQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRAQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61421,7 +61323,7 @@ func rewriteValueAMD64_OpAMD64VPSRAQMasked512const(v *Value) bool { break } v.reset(OpAMD64VPSRAQMasked512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61828,7 +61730,7 @@ func rewriteValueAMD64_OpAMD64VPSRLD512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSRLD512constload {sym} 
[makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61843,7 +61745,7 @@ func rewriteValueAMD64_OpAMD64VPSRLD512const(v *Value) bool { break } v.reset(OpAMD64VPSRLD512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -61855,7 +61757,7 @@ func rewriteValueAMD64_OpAMD64VPSRLDMasked128const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRLDMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61871,7 +61773,7 @@ func rewriteValueAMD64_OpAMD64VPSRLDMasked128const(v *Value) bool { break } v.reset(OpAMD64VPSRLDMasked128constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61883,7 +61785,7 @@ func rewriteValueAMD64_OpAMD64VPSRLDMasked256const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRLDMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61899,7 +61801,7 @@ func rewriteValueAMD64_OpAMD64VPSRLDMasked256const(v *Value) bool { break } v.reset(OpAMD64VPSRLDMasked256constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, 
mem) return true @@ -61911,7 +61813,7 @@ func rewriteValueAMD64_OpAMD64VPSRLDMasked512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLDMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRLDMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61927,7 +61829,7 @@ func rewriteValueAMD64_OpAMD64VPSRLDMasked512const(v *Value) bool { break } v.reset(OpAMD64VPSRLDMasked512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61938,7 +61840,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQ512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VPSRLQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -61953,7 +61855,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQ512const(v *Value) bool { break } v.reset(OpAMD64VPSRLQ512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -61965,7 +61867,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQMasked128const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRLQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := 
auxIntToUint8(v.AuxInt) l := v_0 @@ -61981,7 +61883,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQMasked128const(v *Value) bool { break } v.reset(OpAMD64VPSRLQMasked128constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -61993,7 +61895,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQMasked256const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRLQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -62009,7 +61911,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQMasked256const(v *Value) bool { break } v.reset(OpAMD64VPSRLQMasked256constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -62021,7 +61923,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQMasked512const(v *Value) bool { v_0 := v.Args[0] // match: (VPSRLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPSRLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VPSRLQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -62037,7 +61939,7 @@ func rewriteValueAMD64_OpAMD64VPSRLQMasked512const(v *Value) bool { break } v.reset(OpAMD64VPSRLQMasked512constload) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -62506,7 +62408,7 @@ 
func rewriteValueAMD64_OpAMD64VPTERNLOGD128(v *Value) bool { v_0 := v.Args[0] // match: (VPTERNLOGD128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + // result: (VPTERNLOGD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -62523,7 +62425,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGD128(v *Value) bool { break } v.reset(OpAMD64VPTERNLOGD128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, y, ptr, mem) return true @@ -62536,7 +62438,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGD256(v *Value) bool { v_0 := v.Args[0] // match: (VPTERNLOGD256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPTERNLOGD256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + // result: (VPTERNLOGD256load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -62553,7 +62455,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGD256(v *Value) bool { break } v.reset(OpAMD64VPTERNLOGD256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, y, ptr, mem) return true @@ -62566,7 +62468,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGD512(v *Value) bool { v_0 := v.Args[0] // match: (VPTERNLOGD512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPTERNLOGD512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + // result: (VPTERNLOGD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -62583,7 +62485,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGD512(v 
*Value) bool { break } v.reset(OpAMD64VPTERNLOGD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, y, ptr, mem) return true @@ -62596,7 +62498,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGQ128(v *Value) bool { v_0 := v.Args[0] // match: (VPTERNLOGQ128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPTERNLOGQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + // result: (VPTERNLOGQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -62613,7 +62515,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGQ128(v *Value) bool { break } v.reset(OpAMD64VPTERNLOGQ128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, y, ptr, mem) return true @@ -62626,7 +62528,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGQ256(v *Value) bool { v_0 := v.Args[0] // match: (VPTERNLOGQ256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPTERNLOGQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + // result: (VPTERNLOGQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -62643,7 +62545,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGQ256(v *Value) bool { break } v.reset(OpAMD64VPTERNLOGQ256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, y, ptr, mem) return true @@ -62656,7 +62558,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGQ512(v *Value) bool { v_0 := v.Args[0] // match: (VPTERNLOGQ512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: 
(VPTERNLOGQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + // result: (VPTERNLOGQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x y ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -62673,7 +62575,7 @@ func rewriteValueAMD64_OpAMD64VPTERNLOGQ512(v *Value) bool { break } v.reset(OpAMD64VPTERNLOGQ512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg4(x, y, ptr, mem) return true @@ -63306,7 +63208,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPD128(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VREDUCEPD128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63321,7 +63223,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPD128(v *Value) bool { break } v.reset(OpAMD64VREDUCEPD128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63332,7 +63234,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPD256(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VREDUCEPD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63347,7 +63249,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPD256(v *Value) bool { break } v.reset(OpAMD64VREDUCEPD256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ 
-63358,7 +63260,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPD512(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VREDUCEPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63373,7 +63275,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPD512(v *Value) bool { break } v.reset(OpAMD64VREDUCEPD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63385,7 +63287,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63401,7 +63303,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPDMasked128(v *Value) bool { break } v.reset(OpAMD64VREDUCEPDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63413,7 +63315,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63429,7 
+63331,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPDMasked256(v *Value) bool { break } v.reset(OpAMD64VREDUCEPDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63441,7 +63343,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63457,7 +63359,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPDMasked512(v *Value) bool { break } v.reset(OpAMD64VREDUCEPDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63468,7 +63370,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPS128(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPS128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VREDUCEPS128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63483,7 +63385,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPS128(v *Value) bool { break } v.reset(OpAMD64VREDUCEPS128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63494,7 +63396,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPS256(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPS256 [c] l:(VMOVDQUload256 {sym} 
[off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPS256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VREDUCEPS256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63509,7 +63411,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPS256(v *Value) bool { break } v.reset(OpAMD64VREDUCEPS256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63520,7 +63422,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPS512(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPS512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VREDUCEPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63535,7 +63437,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPS512(v *Value) bool { break } v.reset(OpAMD64VREDUCEPS512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63547,7 +63449,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPSMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63563,7 +63465,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPSMasked128(v *Value) bool { break } v.reset(OpAMD64VREDUCEPSMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = 
valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63575,7 +63477,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPSMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63591,7 +63493,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPSMasked256(v *Value) bool { break } v.reset(OpAMD64VREDUCEPSMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63603,7 +63505,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPSMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VREDUCEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VREDUCEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VREDUCEPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63619,7 +63521,7 @@ func rewriteValueAMD64_OpAMD64VREDUCEPSMasked512(v *Value) bool { break } v.reset(OpAMD64VREDUCEPSMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63630,7 +63532,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPD128(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + 
// result: (VRNDSCALEPD128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63645,7 +63547,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPD128(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPD128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63656,7 +63558,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPD256(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VRNDSCALEPD256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63671,7 +63573,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPD256(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPD256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63682,7 +63584,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPD512(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VRNDSCALEPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63697,7 +63599,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPD512(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63709,7 +63611,7 @@ func 
rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VRNDSCALEPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63725,7 +63627,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked128(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPDMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63737,7 +63639,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63753,7 +63655,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked256(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPDMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63765,7 +63667,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) 
for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63781,7 +63683,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked512(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPDMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63792,7 +63694,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPS128(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPS128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VRNDSCALEPS128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63807,7 +63709,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPS128(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPS128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63818,7 +63720,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPS256(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPS256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPS256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VRNDSCALEPS256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63833,7 +63735,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPS256(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPS256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63844,7 +63746,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPS512(v *Value) bool { v_0 := v.Args[0] // match: 
(VRNDSCALEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPS512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + // result: (VRNDSCALEPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63859,7 +63761,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPS512(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPS512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg2(ptr, mem) return true @@ -63871,7 +63773,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked128(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63887,7 +63789,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked128(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPSMasked128load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63899,7 +63801,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked256(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63915,7 +63817,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked256(v *Value) bool { 
break } v.reset(OpAMD64VRNDSCALEPSMasked256load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -63927,7 +63829,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked512(v *Value) bool { v_0 := v.Args[0] // match: (VRNDSCALEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) + // result: (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] ptr mask mem) for { c := auxIntToUint8(v.AuxInt) l := v_0 @@ -63943,7 +63845,7 @@ func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked512(v *Value) bool { break } v.reset(OpAMD64VRNDSCALEPSMasked512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(ptr, mask, mem) return true @@ -64553,7 +64455,7 @@ func rewriteValueAMD64_OpAMD64VSHUFPD512(v *Value) bool { v_0 := v.Args[0] // match: (VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VSHUFPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -64569,7 +64471,7 @@ func rewriteValueAMD64_OpAMD64VSHUFPD512(v *Value) bool { break } v.reset(OpAMD64VSHUFPD512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -64581,7 +64483,7 @@ func rewriteValueAMD64_OpAMD64VSHUFPS512(v *Value) bool { v_0 := v.Args[0] // match: (VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) // cond: canMergeLoad(v, l) && clobber(l) - // result: 
(VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + // result: (VSHUFPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem) for { c := auxIntToUint8(v.AuxInt) x := v_0 @@ -64597,7 +64499,7 @@ func rewriteValueAMD64_OpAMD64VSHUFPS512(v *Value) bool { break } v.reset(OpAMD64VSHUFPS512load) - v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(uint8(c)), off)) v.Aux = symToAux(sym) v.AddArg3(x, ptr, mem) return true @@ -68826,13 +68728,11 @@ func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool { func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask16x16to16 <t> x) - // result: (KMOVWi <t> (VPMOVVec16x16ToM <types.TypeMask> x)) + // match: (CvtMask16x16to16 x) + // result: (KMOVWi (VPMOVVec16x16ToM <types.TypeMask> x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVWi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) @@ -68842,13 +68742,11 @@ func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool { func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask16x32to32 <t> x) - // result: (KMOVDi <t> (VPMOVVec16x32ToM <types.TypeMask> x)) + // match: (CvtMask16x32to32 x) + // result: (KMOVDi (VPMOVVec16x32ToM <types.TypeMask> x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVDi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) @@ -68858,13 +68756,11 @@ func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool { func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask16x8to8 <t> x) - // result: (KMOVBi <t> (VPMOVVec16x8ToM <types.TypeMask> x)) + // match: (CvtMask16x8to8 x) + // result: (KMOVBi (VPMOVVec16x8ToM <types.TypeMask> x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVBi) - v.Type = t v0 := 
b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) @@ -68874,141 +68770,39 @@ func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool { func rewriteValueAMD64_OpCvtMask32x16to16(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask32x16to16 <t> x) - // result: (KMOVWi <t> (VPMOVVec32x16ToM <types.TypeMask> x)) + // match: (CvtMask32x16to16 x) + // result: (KMOVWi (VPMOVVec32x16ToM <types.TypeMask> x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVWi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) return true } } -func rewriteValueAMD64_OpCvtMask32x4to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask32x4to8 <t> x) - // result: (KMOVBi <t> (VPMOVVec32x4ToM <types.TypeMask> x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask32x8to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask32x8to8 <t> x) - // result: (KMOVBi <t> (VPMOVVec32x8ToM <types.TypeMask> x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask64x2to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask64x2to8 <t> x) - // result: (KMOVBi <t> (VPMOVVec64x2ToM <types.TypeMask> x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask64x4to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask64x4to8 <t> x) - // result: (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x)) - for { - t := v.Type - x := v_0 - 
v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} func rewriteValueAMD64_OpCvtMask64x8to8(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask64x8to8 <t> x) - // result: (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x)) + // match: (CvtMask64x8to8 x) + // result: (KMOVBi (VPMOVVec64x8ToM <types.TypeMask> x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVBi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) return true } } -func rewriteValueAMD64_OpCvtMask8x16to16(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask8x16to16 <t> x) - // result: (KMOVWi <t> (VPMOVVec8x16ToM <types.TypeMask> x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVWi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask8x32to32(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask8x32to32 <t> x) - // result: (KMOVDi <t> (VPMOVVec8x32ToM <types.TypeMask> x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVDi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} func rewriteValueAMD64_OpCvtMask8x64to64(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask8x64to64 <t> x) - // result: (KMOVQi <t> (VPMOVVec8x64ToM <types.TypeMask> x)) + // match: (CvtMask8x64to64 x) + // result: (KMOVQi (VPMOVVec8x64ToM <types.TypeMask> x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVQi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) @@ -71229,94 +71023,82 @@ func rewriteValueAMD64_OpIsInBounds(v *Value) bool { return true } } -func rewriteValueAMD64_OpIsNanFloat32x16(v *Value) bool { - v_1 := v.Args[1] +func 
rewriteValueAMD64_OpIsNaNFloat32x16(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types - // match: (IsNanFloat32x16 x y) - // result: (VPMOVMToVec32x16 (VCMPPS512 [3] x y)) + // match: (IsNaNFloat32x16 x) + // result: (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) for { x := v_0 - y := v_1 v.reset(OpAMD64VPMOVMToVec32x16) v0 := b.NewValue0(v.Pos, OpAMD64VCMPPS512, typ.Mask) v0.AuxInt = uint8ToAuxInt(3) - v0.AddArg2(x, y) + v0.AddArg2(x, x) v.AddArg(v0) return true } } -func rewriteValueAMD64_OpIsNanFloat32x4(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat32x4 x y) - // result: (VCMPPS128 [3] x y) + // match: (IsNaNFloat32x4 x) + // result: (VCMPPS128 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPS128) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat32x8(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat32x8 x y) - // result: (VCMPPS256 [3] x y) + // match: (IsNaNFloat32x8 x) + // result: (VCMPPS256 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPS256) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat64x2(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat64x2 x y) - // result: (VCMPPD128 [3] x y) + // match: (IsNaNFloat64x2 x) + // result: (VCMPPD128 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPD128) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat64x4(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (IsNanFloat64x4 x y) - // result: (VCMPPD256 [3] x y) + // match: (IsNaNFloat64x4 x) + // result: 
(VCMPPD256 [3] x x) for { x := v_0 - y := v_1 v.reset(OpAMD64VCMPPD256) v.AuxInt = uint8ToAuxInt(3) - v.AddArg2(x, y) + v.AddArg2(x, x) return true } } -func rewriteValueAMD64_OpIsNanFloat64x8(v *Value) bool { - v_1 := v.Args[1] +func rewriteValueAMD64_OpIsNaNFloat64x8(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types - // match: (IsNanFloat64x8 x y) - // result: (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) + // match: (IsNaNFloat64x8 x) + // result: (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) for { x := v_0 - y := v_1 v.reset(OpAMD64VPMOVMToVec64x8) v0 := b.NewValue0(v.Pos, OpAMD64VCMPPD512, typ.Mask) v0.AuxInt = uint8ToAuxInt(3) - v0.AddArg2(x, y) + v0.AddArg2(x, x) v.AddArg(v0) return true } diff --git a/src/cmd/compile/internal/ssa/sccp.go b/src/cmd/compile/internal/ssa/sccp.go index 9b958d0454..7ef8d6b7c1 100644 --- a/src/cmd/compile/internal/ssa/sccp.go +++ b/src/cmd/compile/internal/ssa/sccp.go @@ -507,6 +507,10 @@ func (t *worklist) propagate(block *Block) { branchIdx = 1 - condLattice.val.AuxInt } else { branchIdx = condLattice.val.AuxInt + if branchIdx < 0 || branchIdx >= int64(len(block.Succs)) { + // unreachable code, do nothing then + break + } } t.edges = append(t.edges, block.Succs[branchIdx]) } else { diff --git a/src/cmd/compile/internal/ssa/tern_helpers.go b/src/cmd/compile/internal/ssa/tern_helpers.go index 3ffc980c33..923a9f505e 100644 --- a/src/cmd/compile/internal/ssa/tern_helpers.go +++ b/src/cmd/compile/internal/ssa/tern_helpers.go @@ -1,4 +1,4 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. 
package ssa diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 4425c5617b..e2eebd783d 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1667,6 +1667,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Float32x4.IsNaN", opLen1(ssa.OpIsNaNFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.IsNaN", opLen1(ssa.OpIsNaNFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.IsNaN", opLen1(ssa.OpIsNaNFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.IsNaN", opLen1(ssa.OpIsNaNFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.IsNaN", opLen1(ssa.OpIsNaNFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.IsNaN", opLen1(ssa.OpIsNaNFloat64x8, types.TypeVec512), sys.AMD64) // sfp4 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go. sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 7eb5456994..4ad0c6032c 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. 
package ssagen @@ -69,19 +69,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, 
"Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64) @@ -328,12 +328,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint8x16.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.DotProductQuadruple", opLen3_31Zero3(ssa.OpDotProductQuadrupleInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.DotProductQuadruple", opLen3_31Zero3(ssa.OpDotProductQuadrupleInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.DotProductQuadruple", opLen3_31Zero3(ssa.OpDotProductQuadrupleInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.DotProductQuadrupleSaturated", opLen3_31Zero3(ssa.OpDotProductQuadrupleSaturatedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.DotProductQuadrupleSaturated", opLen3_31Zero3(ssa.OpDotProductQuadrupleSaturatedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.DotProductQuadrupleSaturated", opLen3_31Zero3(ssa.OpDotProductQuadrupleSaturatedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64) @@ -394,26 +388,26 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.Expand", opLen2(ssa.OpExpandUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Expand", opLen2(ssa.OpExpandUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Expand", opLen2(ssa.OpExpandUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.ExtendLo2ToInt64x2", opLen1(ssa.OpExtendLo2ToInt64x2Int8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x8.ExtendLo2ToInt64x2", opLen1(ssa.OpExtendLo2ToInt64x2Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x4.ExtendLo2ToInt64x2", opLen1(ssa.OpExtendLo2ToInt64x2Int32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.ExtendLo2ToUint64x2", opLen1(ssa.OpExtendLo2ToUint64x2Uint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x8.ExtendLo2ToUint64x2", opLen1(ssa.OpExtendLo2ToUint64x2Uint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x4.ExtendLo2ToUint64x2", opLen1(ssa.OpExtendLo2ToUint64x2Uint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x16.ExtendLo4ToInt32x4", opLen1(ssa.OpExtendLo4ToInt32x4Int8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x8.ExtendLo4ToInt32x4", opLen1(ssa.OpExtendLo4ToInt32x4Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x16.ExtendLo4ToInt64x4", opLen1(ssa.OpExtendLo4ToInt64x4Int8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.ExtendLo4ToInt64x4", opLen1(ssa.OpExtendLo4ToInt64x4Int16x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x16.ExtendLo4ToUint32x4", opLen1(ssa.OpExtendLo4ToUint32x4Uint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x8.ExtendLo4ToUint32x4", opLen1(ssa.OpExtendLo4ToUint32x4Uint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.ExtendLo4ToUint64x4", opLen1(ssa.OpExtendLo4ToUint64x4Uint8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x8.ExtendLo4ToUint64x4", opLen1(ssa.OpExtendLo4ToUint64x4Uint16x8, types.TypeVec256), sys.AMD64) - 
addF(simdPackage, "Int8x16.ExtendLo8ToInt16x8", opLen1(ssa.OpExtendLo8ToInt16x8Int8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x16.ExtendLo8ToInt32x8", opLen1(ssa.OpExtendLo8ToInt32x8Int8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x16.ExtendLo8ToInt64x8", opLen1(ssa.OpExtendLo8ToInt64x8Int8x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.ExtendLo8ToUint16x8", opLen1(ssa.OpExtendLo8ToUint16x8Uint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.ExtendLo8ToUint32x8", opLen1(ssa.OpExtendLo8ToUint32x8Uint8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x16.ExtendLo8ToUint64x8", opLen1(ssa.OpExtendLo8ToUint64x8Uint8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.ExtendLo2ToInt64", opLen1(ssa.OpExtendLo2ToInt64Int8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x8.ExtendLo2ToInt64", opLen1(ssa.OpExtendLo2ToInt64Int16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.ExtendLo2ToInt64", opLen1(ssa.OpExtendLo2ToInt64Int32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.ExtendLo2ToUint64", opLen1(ssa.OpExtendLo2ToUint64Uint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.ExtendLo2ToUint64", opLen1(ssa.OpExtendLo2ToUint64Uint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.ExtendLo2ToUint64", opLen1(ssa.OpExtendLo2ToUint64Uint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.ExtendLo4ToInt32", opLen1(ssa.OpExtendLo4ToInt32Int8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x8.ExtendLo4ToInt32", opLen1(ssa.OpExtendLo4ToInt32Int16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.ExtendLo4ToInt64", opLen1(ssa.OpExtendLo4ToInt64Int8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.ExtendLo4ToInt64", opLen1(ssa.OpExtendLo4ToInt64Int16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x16.ExtendLo4ToUint32", opLen1(ssa.OpExtendLo4ToUint32Uint8x16, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.ExtendLo4ToUint32", opLen1(ssa.OpExtendLo4ToUint32Uint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.ExtendLo4ToUint64", opLen1(ssa.OpExtendLo4ToUint64Uint8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.ExtendLo4ToUint64", opLen1(ssa.OpExtendLo4ToUint64Uint16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.ExtendLo8ToInt16", opLen1(ssa.OpExtendLo8ToInt16Int8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.ExtendLo8ToInt32", opLen1(ssa.OpExtendLo8ToInt32Int8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.ExtendLo8ToInt64", opLen1(ssa.OpExtendLo8ToInt64Int8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.ExtendLo8ToUint16", opLen1(ssa.OpExtendLo8ToUint16Uint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.ExtendLo8ToUint32", opLen1(ssa.OpExtendLo8ToUint32Uint8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x16.ExtendLo8ToUint64", opLen1(ssa.OpExtendLo8ToUint64Uint8x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.ExtendToInt16", opLen1(ssa.OpExtendToInt16Int8x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.ExtendToInt16", opLen1(ssa.OpExtendToInt16Int8x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.ExtendToInt32", opLen1(ssa.OpExtendToInt32Int8x16, types.TypeVec512), sys.AMD64) @@ -577,12 +571,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint32x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x4.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.IsNan", opLen2(ssa.OpIsNanFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.IsNan", opLen2(ssa.OpIsNanFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.IsNan", opLen2(ssa.OpIsNanFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x16, types.TypeVec512), sys.AMD64) @@ -926,29 +914,29 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int64x4.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x8.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x8.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x2.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x8.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int16x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x2.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x4, 
types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x32.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x16.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x2.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x4, 
types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x2.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x8.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x8, types.TypeVec256), sys.AMD64) @@ -1199,19 +1187,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, 
"Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64) diff --git a/src/cmd/compile/testdata/script/issue77033.txt b/src/cmd/compile/testdata/script/issue77033.txt new file mode 100644 index 0000000000..3b977e5440 --- /dev/null +++ b/src/cmd/compile/testdata/script/issue77033.txt @@ -0,0 +1,40 @@ +go test -bench=Foo -cpuprofile=default.pgo +go test -bench=Foo -pgo=default.pgo +! 
stdout 'FAIL' + +-- main_test.go -- +package main + +import ( + "testing" +) + +var a int + +func save(x int) { + a = x +} + +func foo() { + for i := range yield1 { + defer save(i) + } +} + +func yield1(yield func(int) bool) { + yield(1) +} + +func BenchmarkFoo(b *testing.B) { + for i := 0; i < b.N; i++ { + foo() + } + if a != 1 { + b.Fatalf("a = %d; want 1", a) + } +} + +-- go.mod -- +module demo + +go 1.24 diff --git a/src/cmd/dist/test.go b/src/cmd/dist/test.go index 6d3742525c..48c3aa5efd 100644 --- a/src/cmd/dist/test.go +++ b/src/cmd/dist/test.go @@ -748,7 +748,7 @@ func (t *tester) registerTests() { if !strings.Contains(goexperiment, "jsonv2") { t.registerTest("GOEXPERIMENT=jsonv2 go test encoding/json/...", &goTest{ variant: "jsonv2", - env: []string{"GOEXPERIMENT=jsonv2"}, + env: []string{"GOEXPERIMENT=" + goexperiments("jsonv2")}, pkg: "encoding/json/...", }) } @@ -757,7 +757,7 @@ func (t *tester) registerTests() { if !strings.Contains(goexperiment, "runtimesecret") { t.registerTest("GOEXPERIMENT=runtimesecret go test runtime/secret/...", &goTest{ variant: "runtimesecret", - env: []string{"GOEXPERIMENT=runtimesecret"}, + env: []string{"GOEXPERIMENT=" + goexperiments("runtimesecret")}, pkg: "runtime/secret/...", }) } @@ -766,7 +766,7 @@ func (t *tester) registerTests() { if goarch == "amd64" && !strings.Contains(goexperiment, "simd") { t.registerTest("GOEXPERIMENT=simd go test simd/archsimd/...", &goTest{ variant: "simd", - env: []string{"GOEXPERIMENT=simd"}, + env: []string{"GOEXPERIMENT=" + goexperiments("simd")}, pkg: "simd/archsimd/...", }) } @@ -1888,3 +1888,19 @@ func fipsVersions(short bool) []string { } return versions } + +// goexperiments returns the GOEXPERIMENT value to use +// when running a test with the given experiments enabled. +// +// It preserves any existing GOEXPERIMENTs. 
+func goexperiments(exps ...string) string { + if len(exps) == 0 { + return goexperiment + } + existing := goexperiment + if existing != "" { + existing += "," + } + return existing + strings.Join(exps, ",") + +} diff --git a/src/cmd/go.mod b/src/cmd/go.mod index c7d3cc6136..85e8c4cb5f 100644 --- a/src/cmd/go.mod +++ b/src/cmd/go.mod @@ -11,7 +11,7 @@ require ( golang.org/x/sys v0.39.0 golang.org/x/telemetry v0.0.0-20251128220624-abf20d0e57ec golang.org/x/term v0.38.0 - golang.org/x/tools v0.39.1-0.20251205000126-062ef7b6ced2 + golang.org/x/tools v0.39.1-0.20251230210517-d44be789a05c ) require ( diff --git a/src/cmd/go.sum b/src/cmd/go.sum index b02c469a41..61c88e5253 100644 --- a/src/cmd/go.sum +++ b/src/cmd/go.sum @@ -22,7 +22,7 @@ golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= -golang.org/x/tools v0.39.1-0.20251205000126-062ef7b6ced2 h1:2Qqv605Nus9iUp3ErvEU/q92Q3HAzeROztzl9pzAno8= -golang.org/x/tools v0.39.1-0.20251205000126-062ef7b6ced2/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +golang.org/x/tools v0.39.1-0.20251230210517-d44be789a05c h1:0pZej6BQOooNbOfjJEu4v5qx9hdwFX8HnvHCcNXcs2w= +golang.org/x/tools v0.39.1-0.20251230210517-d44be789a05c/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= rsc.io/markdown v0.0.0-20240306144322-0bf8f97ee8ef h1:mqLYrXCXYEZOop9/Dbo6RPX11539nwiCNBb1icVPmw8= rsc.io/markdown v0.0.0-20240306144322-0bf8f97ee8ef/go.mod h1:8xcPgWmwlZONN1D9bjxtHEjrUtSEa3fakVF8iaewYKQ= diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go index fe9b862073..8c346dafdb 100644 --- a/src/cmd/go/alldocs.go +++ b/src/cmd/go/alldocs.go @@ -1954,7 +1954,7 @@ // // -o file // Save a copy of the test binary to the named file. -// The test still runs (unless -c or -i is specified). 
+// The test still runs (unless -c is specified). // If file ends in a slash or names an existing directory, // the test is written to pkg.test in that directory. // diff --git a/src/cmd/go/internal/doc/pkgsite.go b/src/cmd/go/internal/doc/pkgsite.go index c173167b63..dc344cbbca 100644 --- a/src/cmd/go/internal/doc/pkgsite.go +++ b/src/cmd/go/internal/doc/pkgsite.go @@ -71,7 +71,7 @@ func doPkgsite(urlPath, fragment string) error { env = append(env, "GOPROXY="+gomodcache+","+goproxy) } - const version = "v0.0.0-20250714212547-01b046e81fe7" + const version = "v0.0.0-20251223195805-1a3bd3c788fe" cmd := exec.Command(goCmd(), "run", "golang.org/x/pkgsite/cmd/internal/doc@"+version, "-gorepo", buildCtx.GOROOT, "-http", addr, diff --git a/src/cmd/go/internal/modindex/scan.go b/src/cmd/go/internal/modindex/scan.go index af2c0abe04..beded695bf 100644 --- a/src/cmd/go/internal/modindex/scan.go +++ b/src/cmd/go/internal/modindex/scan.go @@ -112,10 +112,10 @@ func parseErrorToString(err error) string { return "" } var p parseError - if e, ok := err.(scanner.ErrorList); ok { - p.ErrorList = &e + if errlist, ok := err.(scanner.ErrorList); ok { + p.ErrorList = &errlist } else { - p.ErrorString = e.Error() + p.ErrorString = err.Error() } s, err := json.Marshal(p) if err != nil { diff --git a/src/cmd/go/internal/test/test.go b/src/cmd/go/internal/test/test.go index 916943904d..9309aa65ed 100644 --- a/src/cmd/go/internal/test/test.go +++ b/src/cmd/go/internal/test/test.go @@ -163,7 +163,7 @@ In addition to the build flags, the flags handled by 'go test' itself are: -o file Save a copy of the test binary to the named file. - The test still runs (unless -c or -i is specified). + The test still runs (unless -c is specified). If file ends in a slash or names an existing directory, the test is written to pkg.test in that directory. 
diff --git a/src/cmd/go/testdata/script/list_empty_importpath.txt b/src/cmd/go/testdata/script/list_empty_importpath.txt index fe4210322b..0960a7795d 100644 --- a/src/cmd/go/testdata/script/list_empty_importpath.txt +++ b/src/cmd/go/testdata/script/list_empty_importpath.txt @@ -1,15 +1,6 @@ ! go list all ! stderr 'panic' -[!GOOS:windows] [!GOOS:solaris] [!GOOS:freebsd] [!GOOS:openbsd] [!GOOS:netbsd] stderr 'invalid import path' -# #73976: Allow 'no errors' on Windows, Solaris, and BSD until issue -# is resolved to prevent flakes. 'no errors' is printed by -# empty scanner.ErrorList errors so that's probably where the -# message is coming from, though we don't know how. -[GOOS:windows] stderr 'invalid import path|no errors' -[GOOS:solaris] stderr 'invalid import path|no errors' -[GOOS:freebsd] stderr 'invalid import path|no errors' -[GOOS:openbsd] stderr 'invalid import path|no errors' -[GOOS:netbsd] stderr 'invalid import path|no errors' +stderr 'invalid import path' # go list produces a package for 'p' but not for '' go list -e all diff --git a/src/cmd/go/testdata/vcstest/git/legacytest.txt b/src/cmd/go/testdata/vcstest/git/legacytest.txt index 5846983cef..6465242d62 100644 --- a/src/cmd/go/testdata/vcstest/git/legacytest.txt +++ b/src/cmd/go/testdata/vcstest/git/legacytest.txt @@ -6,7 +6,7 @@ env GIT_COMMITTER_NAME=$GIT_AUTHOR_NAME env GIT_COMMITTER_EMAIL=$GIT_AUTHOR_EMAIL git init -git branch -M master +git checkout -b master at 2018-07-17T12:41:39-04:00 cp x_cf92c7b.go x.go diff --git a/src/cmd/internal/bootstrap_test/overlaydir_test.go b/src/cmd/internal/bootstrap_test/overlaydir_test.go index 5812c453ac..bee3214b67 100644 --- a/src/cmd/internal/bootstrap_test/overlaydir_test.go +++ b/src/cmd/internal/bootstrap_test/overlaydir_test.go @@ -43,6 +43,9 @@ func overlayDir(dstRoot, srcRoot string) error { dstPath := filepath.Join(dstRoot, suffix) info, err := entry.Info() + if err != nil { + return err + } perm := info.Mode() & os.ModePerm if 
info.Mode()&os.ModeSymlink != 0 { info, err = os.Stat(srcPath) diff --git a/src/cmd/link/link_test.go b/src/cmd/link/link_test.go index bc7504e5b1..036eda13bc 100644 --- a/src/cmd/link/link_test.go +++ b/src/cmd/link/link_test.go @@ -869,6 +869,9 @@ func TestFuncAlignOption(t *testing.T) { "_main.bar": false, "_main.baz": false} syms, err := f.Symbols() + if err != nil { + t.Errorf("failed to get symbols with err %v", err) + } for _, s := range syms { fn := s.Name if _, ok := fname[fn]; !ok { diff --git a/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/doc.go b/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/doc.go index 45aed7909c..f1202c7a11 100644 --- a/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/doc.go +++ b/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/doc.go @@ -80,6 +80,8 @@ or b.ResetTimer within the same function will also be removed. Caveats: The b.Loop() method is designed to prevent the compiler from optimizing away the benchmark loop, which can occasionally result in slower execution due to increased allocations in some specific cases. +Since its fix may change the performance of nanosecond-scale benchmarks, +bloop is disabled by default in the `go fix` analyzer suite; see golang/go#74967. # Analyzer any diff --git a/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/maps.go b/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/maps.go index f97541d4b3..795f5b6c6b 100644 --- a/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/maps.go +++ b/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/maps.go @@ -231,9 +231,28 @@ func mapsloop(pass *analysis.Pass) (any, error) { // Have: for k, v := range x { lhs = rhs } assign := rng.Body.List[0].(*ast.AssignStmt) + + // usesKV reports whether e references vars k or v. 
+ usesKV := func(e ast.Expr) bool { + k := info.Defs[rng.Key.(*ast.Ident)] + v := info.Defs[rng.Value.(*ast.Ident)] + for n := range ast.Preorder(e) { + if id, ok := n.(*ast.Ident); ok { + obj := info.Uses[id] + if obj != nil && // don't rely on k, v being non-nil + (obj == k || obj == v) { + return true + } + } + } + return false + } + if index, ok := assign.Lhs[0].(*ast.IndexExpr); ok && + len(assign.Lhs) == 1 && astutil.EqualSyntax(rng.Key, index.Index) && - astutil.EqualSyntax(rng.Value, assign.Rhs[0]) { + astutil.EqualSyntax(rng.Value, assign.Rhs[0]) && + !usesKV(index.X) { // reject (e.g.) f(k, v)[k] = v if tmap, ok := typeparams.CoreType(info.TypeOf(index.X)).(*types.Map); ok && types.Identical(info.TypeOf(index), info.TypeOf(rng.Value)) && // m[k], v types.Identical(tmap.Key(), info.TypeOf(rng.Key)) { diff --git a/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/modernize.go b/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/modernize.go index 013ce79d6c..f09a2d26ca 100644 --- a/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/modernize.go +++ b/src/cmd/vendor/golang.org/x/tools/go/analysis/passes/modernize/modernize.go @@ -34,7 +34,7 @@ var doc string var Suite = []*analysis.Analyzer{ AnyAnalyzer, // AppendClippedAnalyzer, // not nil-preserving! 
- BLoopAnalyzer, + // BLoopAnalyzer, // may skew benchmark results, see golang/go#74967 FmtAppendfAnalyzer, ForVarAnalyzer, MapsLoopAnalyzer, diff --git a/src/cmd/vendor/modules.txt b/src/cmd/vendor/modules.txt index 7c122cd9d1..9c179c4bcd 100644 --- a/src/cmd/vendor/modules.txt +++ b/src/cmd/vendor/modules.txt @@ -73,7 +73,7 @@ golang.org/x/text/internal/tag golang.org/x/text/language golang.org/x/text/transform golang.org/x/text/unicode/norm -# golang.org/x/tools v0.39.1-0.20251205000126-062ef7b6ced2 +# golang.org/x/tools v0.39.1-0.20251230210517-d44be789a05c ## explicit; go 1.24.0 golang.org/x/tools/cmd/bisect golang.org/x/tools/cover diff --git a/src/crypto/cipher/gcm_fips140v2.0_test.go b/src/crypto/cipher/gcm_fips140v1.26_test.go index d3a8ea5c63..9f17a497ca 100644 --- a/src/crypto/cipher/gcm_fips140v2.0_test.go +++ b/src/crypto/cipher/gcm_fips140v1.26_test.go @@ -18,10 +18,10 @@ import ( "testing" ) -func TestGCMNoncesFIPSV2(t *testing.T) { +func TestGCMNoncesFIPSV126(t *testing.T) { cryptotest.MustSupportFIPS140(t) if !fips140.Enabled { - cmd := testenv.Command(t, testenv.Executable(t), "-test.run=^TestGCMNoncesFIPSV2$", "-test.v") + cmd := testenv.Command(t, testenv.Executable(t), "-test.run=^TestGCMNoncesFIPSV126$", "-test.v") cmd.Env = append(cmd.Environ(), "GODEBUG=fips140=on") out, err := cmd.CombinedOutput() t.Logf("running with GODEBUG=fips140=on:\n%s", out) diff --git a/src/crypto/hpke/aead_fipsv1.0.go b/src/crypto/hpke/aead_fips140v1.0.go index 986126cbf9..986126cbf9 100644 --- a/src/crypto/hpke/aead_fipsv1.0.go +++ b/src/crypto/hpke/aead_fips140v1.0.go diff --git a/src/crypto/hpke/aead_fipsv2.0.go b/src/crypto/hpke/aead_fips140v1.26.go index 710eb1c08f..710eb1c08f 100644 --- a/src/crypto/hpke/aead_fipsv2.0.go +++ b/src/crypto/hpke/aead_fips140v1.26.go diff --git a/src/crypto/internal/fips140only/fips140only_test.go b/src/crypto/internal/fips140only/fips140only_test.go new file mode 100644 index 0000000000..96df536d56 --- /dev/null +++ 
b/src/crypto/internal/fips140only/fips140only_test.go @@ -0,0 +1,408 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package fips140only_test + +import ( + "crypto" + "crypto/aes" + "crypto/cipher" + "crypto/des" + "crypto/dsa" + "crypto/ecdh" + "crypto/ecdsa" + "crypto/ed25519" + "crypto/elliptic" + "crypto/hkdf" + "crypto/hmac" + "crypto/hpke" + "crypto/internal/cryptotest" + "crypto/internal/fips140" + "crypto/internal/fips140only" + "crypto/md5" + "crypto/mlkem" + "crypto/mlkem/mlkemtest" + "crypto/pbkdf2" + "crypto/rand" + "crypto/rc4" + "crypto/rsa" + "crypto/sha1" + "crypto/sha256" + _ "crypto/sha3" + _ "crypto/sha512" + "crypto/x509" + "encoding/pem" + "fmt" + "internal/godebug" + "internal/testenv" + "io" + "math/big" + "os" + "strings" + "testing" + + "golang.org/x/crypto/chacha20poly1305" +) + +func TestFIPS140Only(t *testing.T) { + cryptotest.MustSupportFIPS140(t) + if !fips140only.Enforced() { + cmd := testenv.Command(t, testenv.Executable(t), "-test.run=^TestFIPS140Only$", "-test.v") + cmd.Env = append(cmd.Environ(), "GODEBUG=fips140=only") + out, err := cmd.CombinedOutput() + t.Logf("running with GODEBUG=fips140=only:\n%s", out) + if err != nil { + t.Errorf("fips140=only subprocess failed: %v", err) + } + return + } + t.Run("cryptocustomrand=0", func(t *testing.T) { + t.Setenv("GODEBUG", os.Getenv("GODEBUG")+",cryptocustomrand=0") + testFIPS140Only(t) + }) + t.Run("cryptocustomrand=1", func(t *testing.T) { + t.Setenv("GODEBUG", os.Getenv("GODEBUG")+",cryptocustomrand=1") + testFIPS140Only(t) + }) +} + +func testFIPS140Only(t *testing.T) { + if !fips140only.Enforced() { + t.Fatal("FIPS 140-only mode not enforced") + } + t.Logf("GODEBUG=fips140=only enabled") + fips140.ResetServiceIndicator() + + aesBlock, err := aes.NewCipher(make([]byte, 16)) + if err != nil { + t.Fatal(err) + } + notAESBlock := blockWrap{aesBlock} + iv := 
make([]byte, aes.BlockSize) + + cipher.NewCBCEncrypter(aesBlock, iv) + expectPanic(t, func() { cipher.NewCBCEncrypter(notAESBlock, iv) }) + cipher.NewCBCDecrypter(aesBlock, iv) + expectPanic(t, func() { cipher.NewCBCDecrypter(notAESBlock, iv) }) + + expectPanic(t, func() { cipher.NewCFBEncrypter(aesBlock, iv) }) + expectPanic(t, func() { cipher.NewCFBDecrypter(aesBlock, iv) }) + + cipher.NewCTR(aesBlock, iv) + expectPanic(t, func() { cipher.NewCTR(notAESBlock, iv) }) + + expectPanic(t, func() { cipher.NewOFB(aesBlock, iv) }) + + expectErr(t, errRet2(cipher.NewGCM(aesBlock))) + expectErr(t, errRet2(cipher.NewGCMWithNonceSize(aesBlock, 12))) + expectErr(t, errRet2(cipher.NewGCMWithTagSize(aesBlock, 12))) + expectNoErr(t, errRet2(cipher.NewGCMWithRandomNonce(aesBlock))) + + expectErr(t, errRet2(des.NewCipher(make([]byte, 8)))) + expectErr(t, errRet2(des.NewTripleDESCipher(make([]byte, 24)))) + + expectErr(t, errRet2(rc4.NewCipher(make([]byte, 16)))) + + expectErr(t, errRet2(chacha20poly1305.New(make([]byte, chacha20poly1305.KeySize)))) + expectErr(t, errRet2(chacha20poly1305.NewX(make([]byte, chacha20poly1305.KeySize)))) + + expectPanic(t, func() { md5.New().Sum(nil) }) + expectErr(t, errRet2(md5.New().Write(make([]byte, 16)))) + expectPanic(t, func() { md5.Sum([]byte("foo")) }) + + expectPanic(t, func() { sha1.New().Sum(nil) }) + expectErr(t, errRet2(sha1.New().Write(make([]byte, 16)))) + expectPanic(t, func() { sha1.Sum([]byte("foo")) }) + + withApprovedHash(func(h crypto.Hash) { h.New().Sum(nil) }) + withNonApprovedHash(func(h crypto.Hash) { expectPanic(t, func() { h.New().Sum(nil) }) }) + + expectErr(t, errRet2(pbkdf2.Key(sha256.New, "password", make([]byte, 16), 1, 10))) + expectErr(t, errRet2(pbkdf2.Key(sha256.New, "password", make([]byte, 10), 1, 14))) + withNonApprovedHash(func(h crypto.Hash) { + expectErr(t, errRet2(pbkdf2.Key(h.New, "password", make([]byte, 16), 1, 14))) + }) + withApprovedHash(func(h crypto.Hash) { + expectNoErr(t, errRet2(pbkdf2.Key(h.New, 
"password", make([]byte, 16), 1, 14))) + }) + + expectPanic(t, func() { hmac.New(sha256.New, make([]byte, 10)) }) + withNonApprovedHash(func(h crypto.Hash) { + expectPanic(t, func() { hmac.New(h.New, make([]byte, 16)) }) + }) + withApprovedHash(func(h crypto.Hash) { hmac.New(h.New, make([]byte, 16)) }) + + expectErr(t, errRet2(hkdf.Key(sha256.New, make([]byte, 10), nil, "", 16))) + withNonApprovedHash(func(h crypto.Hash) { + expectErr(t, errRet2(hkdf.Key(h.New, make([]byte, 16), nil, "", 16))) + }) + withApprovedHash(func(h crypto.Hash) { + expectNoErr(t, errRet2(hkdf.Key(h.New, make([]byte, 16), nil, "", 16))) + }) + + expectErr(t, errRet2(hkdf.Extract(sha256.New, make([]byte, 10), nil))) + withNonApprovedHash(func(h crypto.Hash) { + expectErr(t, errRet2(hkdf.Extract(h.New, make([]byte, 16), nil))) + }) + withApprovedHash(func(h crypto.Hash) { + expectNoErr(t, errRet2(hkdf.Extract(h.New, make([]byte, 16), nil))) + }) + + expectErr(t, errRet2(hkdf.Expand(sha256.New, make([]byte, 10), "", 16))) + withNonApprovedHash(func(h crypto.Hash) { + expectErr(t, errRet2(hkdf.Expand(h.New, make([]byte, 16), "", 16))) + }) + withApprovedHash(func(h crypto.Hash) { + expectNoErr(t, errRet2(hkdf.Expand(h.New, make([]byte, 16), "", 16))) + }) + + expectErr(t, errRet2(rand.Prime(rand.Reader, 10))) + + expectErr(t, dsa.GenerateParameters(&dsa.Parameters{}, rand.Reader, dsa.L1024N160)) + expectErr(t, dsa.GenerateKey(&dsa.PrivateKey{}, rand.Reader)) + expectErr(t, errRet3(dsa.Sign(rand.Reader, &dsa.PrivateKey{}, make([]byte, 16)))) + expectPanic(t, func() { + dsa.Verify(&dsa.PublicKey{}, make([]byte, 16), big.NewInt(1), big.NewInt(1)) + }) + + expectErr(t, errRet2(ecdh.X25519().GenerateKey(rand.Reader))) + expectErr(t, errRet2(ecdh.X25519().NewPrivateKey(make([]byte, 32)))) + expectErr(t, errRet2(ecdh.X25519().NewPublicKey(make([]byte, 32)))) + for _, curve := range []ecdh.Curve{ecdh.P256(), ecdh.P384(), ecdh.P521()} { + expectErrIfCustomRand(t, 
errRet2(curve.GenerateKey(readerWrap{rand.Reader}))) + k, err := curve.GenerateKey(rand.Reader) + if err != nil { + t.Fatal(err) + } + expectNoErr(t, errRet2(curve.NewPrivateKey(k.Bytes()))) + expectNoErr(t, errRet2(curve.NewPublicKey(k.PublicKey().Bytes()))) + } + + for _, curve := range []elliptic.Curve{elliptic.P256(), elliptic.P384(), elliptic.P521()} { + expectErrIfCustomRand(t, errRet2(ecdsa.GenerateKey(curve, readerWrap{rand.Reader}))) + k, err := ecdsa.GenerateKey(curve, rand.Reader) + if err != nil { + t.Fatal(err) + } + + expectErrIfCustomRand(t, errRet2(k.Sign(readerWrap{rand.Reader}, make([]byte, 32), nil))) + expectErrIfCustomRand(t, errRet2(ecdsa.SignASN1(readerWrap{rand.Reader}, k, make([]byte, 32)))) + expectErrIfCustomRand(t, errRet3(ecdsa.Sign(readerWrap{rand.Reader}, k, make([]byte, 32)))) + expectNoErr(t, errRet2(k.Sign(rand.Reader, make([]byte, 32), nil))) + expectNoErr(t, errRet2(ecdsa.SignASN1(rand.Reader, k, make([]byte, 32)))) + expectNoErr(t, errRet3(ecdsa.Sign(rand.Reader, k, make([]byte, 32)))) + + withNonApprovedHash(func(h crypto.Hash) { + expectErr(t, errRet2(k.Sign(nil, make([]byte, h.Size()), h))) + }) + withApprovedHash(func(h crypto.Hash) { + expectNoErr(t, errRet2(k.Sign(nil, make([]byte, h.Size()), h))) + }) + } + customCurve := &elliptic.CurveParams{Name: "custom", P: big.NewInt(1)} + expectErr(t, errRet2(ecdsa.GenerateKey(customCurve, rand.Reader))) + + _, ed25519Key, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + t.Fatal(err) + } + expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 32), crypto.Hash(0)))) + expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 64), crypto.SHA512))) + // ed25519ctx is not allowed (but ed25519ph with context is). 
+ expectErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 32), &ed25519.Options{ + Context: "test", + }))) + expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 64), &ed25519.Options{ + Hash: crypto.SHA512, Context: "test", + }))) + expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 64), &ed25519.Options{ + Hash: crypto.SHA512, + }))) + + expectErr(t, errRet2(rsa.GenerateMultiPrimeKey(rand.Reader, 3, 2048))) + expectErr(t, errRet2(rsa.GenerateKey(rand.Reader, 1024))) + expectErr(t, errRet2(rsa.GenerateKey(rand.Reader, 2049))) + expectErrIfCustomRand(t, errRet2(rsa.GenerateKey(readerWrap{rand.Reader}, 2048))) + rsaKey, err := rsa.GenerateKey(rand.Reader, 2048) + expectNoErr(t, err) + + smallKey := parseKey(testingKey(`-----BEGIN RSA TESTING KEY----- +MIICXQIBAAKBgQDMrln6XoAa3Rjts+kRi5obbP86qSf/562RcuDO+yMXeTLHfi4M +8ubyhoFY+UKBCGBLmmTO7ikbvQgdipkT3xVkU8nM3XTW4sxrnw0X5QXsl4PGlMo0 +5UufxYyQxe7bbjuwFz2XnN6Jz4orpOfO0s36/KVHj9lZRl+REpr/Jy+nJQIDAQAB +AoGAJ9WEwGO01cWSzOwXH2mGX/EKCQ4TsUuS7XwogU/B6BcXyVhmuPFq/ecsdDbq +ePc62mvdU6JpELNsyWcIXKQtYsRgJHxNS+KJkCQIq6YeiAWRG0XL6q+qVj+HtT8a +1Qrmul9ZBd23Y9wLF8pg/xWDQYvb8DPAb/xJ0e/KEBZcWU8CQQDXFCFCGpCfwyxY +Cq8G/3B94D9UYwk5mK6jRIH5m8LbaX9bKKetf8+If8TWVgeuiRjjN4WEQ78lPoSg +3Fsz2qs3AkEA85/JCudNUf2FnY+T6h1c/2SWekZiZ1NS4lCh/C7iYuAN3oa8zGkf +gjjR5e0+Z8rUAcZkTukxyLLaNqy6rs9GgwJAVR6pXvEGhcQHe7yWso1LpvWl+q7L +StkrXIBTdEb54j4pYhl/6wFnUB1I+I7JsYCeseYaWFM7hfDtKoCrM6V6FwJBANxh +KmfmnJcSkw/YlaEuNrYAs+6gRNvbEBsRfba2Yqu2qlUl5Ruz7IDMDXPEjLMvU2DX +ql2HrTU0NRlIXwdLESkCQQDGJ54H6WK1eE1YvtxCaLm28zmogcFlvc21pym+PpM1 +bXVL8iKLrG91IYQByUHZIn3WVAd2bfi4MfKagRt0ggd4 +-----END RSA TESTING KEY-----`)) + + expectNoErr(t, errRet2(rsaKey.Sign(rand.Reader, make([]byte, 32), crypto.SHA256))) + expectErr(t, errRet2(smallKey.Sign(rand.Reader, make([]byte, 32), crypto.SHA256))) + expectErr(t, errRet2(rsaKey.Sign(rand.Reader, make([]byte, 20), crypto.SHA1))) + // rand is always ignored for PKCS1v15 signing + expectNoErr(t, 
errRet2(rsaKey.Sign(readerWrap{rand.Reader}, make([]byte, 32), crypto.SHA256))) + + sigPKCS1v15, err := rsa.SignPKCS1v15(rand.Reader, rsaKey, crypto.SHA256, make([]byte, 32)) + expectNoErr(t, err) + expectErr(t, errRet2(rsa.SignPKCS1v15(rand.Reader, smallKey, crypto.SHA256, make([]byte, 32)))) + expectErr(t, errRet2(rsa.SignPKCS1v15(rand.Reader, rsaKey, crypto.SHA1, make([]byte, 20)))) + // rand is always ignored for PKCS1v15 signing + expectNoErr(t, errRet2(rsa.SignPKCS1v15(readerWrap{rand.Reader}, rsaKey, crypto.SHA256, make([]byte, 32)))) + + expectNoErr(t, rsa.VerifyPKCS1v15(&rsaKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPKCS1v15)) + expectErr(t, rsa.VerifyPKCS1v15(&smallKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPKCS1v15)) + expectErr(t, rsa.VerifyPKCS1v15(&rsaKey.PublicKey, crypto.SHA1, make([]byte, 20), sigPKCS1v15)) + + sigPSS, err := rsa.SignPSS(rand.Reader, rsaKey, crypto.SHA256, make([]byte, 32), nil) + expectNoErr(t, err) + expectErr(t, errRet2(rsa.SignPSS(rand.Reader, smallKey, crypto.SHA256, make([]byte, 32), nil))) + expectErr(t, errRet2(rsa.SignPSS(rand.Reader, rsaKey, crypto.SHA1, make([]byte, 20), nil))) + expectErr(t, errRet2(rsa.SignPSS(readerWrap{rand.Reader}, rsaKey, crypto.SHA256, make([]byte, 32), nil))) + + expectNoErr(t, rsa.VerifyPSS(&rsaKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPSS, nil)) + expectErr(t, rsa.VerifyPSS(&smallKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPSS, nil)) + expectErr(t, rsa.VerifyPSS(&rsaKey.PublicKey, crypto.SHA1, make([]byte, 20), sigPSS, nil)) + + k, err := mlkem.GenerateKey768() + expectNoErr(t, err) + expectErr(t, errRet3(mlkemtest.Encapsulate768(k.EncapsulationKey(), make([]byte, 32)))) + k1024, err := mlkem.GenerateKey1024() + expectNoErr(t, err) + expectErr(t, errRet3(mlkemtest.Encapsulate1024(k1024.EncapsulationKey(), make([]byte, 32)))) + + for _, kem := range []hpke.KEM{ + hpke.DHKEM(ecdh.P256()), + hpke.DHKEM(ecdh.P384()), + hpke.DHKEM(ecdh.P521()), + hpke.MLKEM768(), + 
hpke.MLKEM1024(), + hpke.MLKEM768P256(), + hpke.MLKEM1024P384(), + hpke.MLKEM768X25519(), // allowed as hybrid + } { + t.Run(fmt.Sprintf("HKPE KEM %04x", kem.ID()), func(t *testing.T) { + k, err := kem.GenerateKey() + expectNoErr(t, err) + expectNoErr(t, errRet2(kem.DeriveKeyPair(make([]byte, 64)))) + kb, err := k.Bytes() + expectNoErr(t, err) + expectNoErr(t, errRet2(kem.NewPrivateKey(kb))) + expectNoErr(t, errRet2(kem.NewPublicKey(k.PublicKey().Bytes()))) + if fips140.Version() == "v1.0.0" { + t.Skip("FIPS 140-3 Module v1.0.0 does not provide HPKE GCM modes") + } + c, err := hpke.Seal(k.PublicKey(), hpke.HKDFSHA256(), hpke.AES128GCM(), nil, nil) + expectNoErr(t, err) + _, err = hpke.Open(k, hpke.HKDFSHA256(), hpke.AES128GCM(), nil, c) + expectNoErr(t, err) + }) + } + expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).GenerateKey())) + expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).DeriveKeyPair(make([]byte, 64)))) + expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).NewPrivateKey(make([]byte, 32)))) + expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).NewPublicKey(make([]byte, 32)))) + hpkeK, err := hpke.MLKEM768().GenerateKey() + expectNoErr(t, err) + expectErr(t, errRet2(hpke.Seal(hpkeK.PublicKey(), hpke.HKDFSHA256(), hpke.ChaCha20Poly1305(), nil, nil))) + expectErr(t, errRet2(hpke.Open(hpkeK, hpke.HKDFSHA256(), hpke.ChaCha20Poly1305(), nil, make([]byte, 2000)))) + + // fips140=only mode should prevent any operation that would make the FIPS + // 140-3 module set its service indicator to false. 
+ if !fips140.ServiceIndicator() { + t.Errorf("service indicator not set") + } +} + +type blockWrap struct { + cipher.Block +} + +type readerWrap struct { + io.Reader +} + +func withApprovedHash(f func(crypto.Hash)) { + f(crypto.SHA224) + f(crypto.SHA256) + f(crypto.SHA384) + f(crypto.SHA512) + f(crypto.SHA3_224) + f(crypto.SHA3_256) + f(crypto.SHA3_384) + f(crypto.SHA3_512) + f(crypto.SHA512_224) + f(crypto.SHA512_256) +} + +func withNonApprovedHash(f func(crypto.Hash)) { + f(crypto.MD5) + f(crypto.SHA1) +} + +func expectPanic(t *testing.T, f func()) { + t.Helper() + defer func() { + t.Helper() + if err := recover(); err == nil { + t.Errorf("expected panic") + } else { + if s, ok := err.(string); !ok || !strings.Contains(s, "FIPS 140-only") { + t.Errorf("unexpected panic: %v", err) + } + } + }() + f() +} + +var cryptocustomrand = godebug.New("cryptocustomrand") + +func expectErr(t *testing.T, err error) { + t.Helper() + if err == nil { + t.Errorf("expected error") + } else if !strings.Contains(err.Error(), "FIPS 140-only") { + t.Errorf("unexpected error: %v", err) + } +} + +func expectNoErr(t *testing.T, err error) { + t.Helper() + if err != nil { + t.Errorf("unexpected error: %v", err) + } +} + +func expectErrIfCustomRand(t *testing.T, err error) { + t.Helper() + if cryptocustomrand.Value() == "1" { + expectErr(t, err) + } else { + expectNoErr(t, err) + } +} + +func errRet2[T any](_ T, err error) error { + return err +} + +func errRet3[T any](_, _ T, err error) error { + return err +} + +func testingKey(s string) string { return strings.ReplaceAll(s, "TESTING KEY", "PRIVATE KEY") } + +func parseKey(s string) *rsa.PrivateKey { + p, _ := pem.Decode([]byte(s)) + k, err := x509.ParsePKCS1PrivateKey(p.Bytes) + if err != nil { + panic(err) + } + return k +} diff --git a/src/crypto/internal/fips140test/acvp_capabilities_fips140v2.0.json b/src/crypto/internal/fips140test/acvp_capabilities_fips140v1.26.json index 33c8aa235b..33c8aa235b 100644 --- 
a/src/crypto/internal/fips140test/acvp_capabilities_fips140v2.0.json +++ b/src/crypto/internal/fips140test/acvp_capabilities_fips140v1.26.json diff --git a/src/crypto/internal/fips140test/acvp_fips140v2.0_test.go b/src/crypto/internal/fips140test/acvp_fips140v1.26_test.go index e9ef91537a..10a44f1492 100644 --- a/src/crypto/internal/fips140test/acvp_fips140v2.0_test.go +++ b/src/crypto/internal/fips140test/acvp_fips140v1.26_test.go @@ -12,10 +12,10 @@ import ( "fmt" ) -//go:embed acvp_capabilities_fips140v2.0.json +//go:embed acvp_capabilities_fips140v1.26.json var capabilitiesJson []byte -var testConfigFile = "acvp_test_fips140v2.0.config.json" +var testConfigFile = "acvp_test_fips140v1.26.config.json" func init() { commands["ML-DSA-44/keyGen"] = cmdMlDsaKeyGenAft(mldsa.NewPrivateKey44) diff --git a/src/crypto/internal/fips140test/acvp_test_fips140v2.0.config.json b/src/crypto/internal/fips140test/acvp_test_fips140v1.26.config.json index 51c76d9288..51c76d9288 100644 --- a/src/crypto/internal/fips140test/acvp_test_fips140v2.0.config.json +++ b/src/crypto/internal/fips140test/acvp_test_fips140v1.26.config.json diff --git a/src/crypto/internal/fips140test/cast_fips140v1.0_test.go b/src/crypto/internal/fips140test/cast_fips140v1.0_test.go index 4780966208..b9ddfe4d8b 100644 --- a/src/crypto/internal/fips140test/cast_fips140v1.0_test.go +++ b/src/crypto/internal/fips140test/cast_fips140v1.0_test.go @@ -6,4 +6,4 @@ package fipstest -func fips140v2Conditionals() {} +func fips140v126Conditionals() {} diff --git a/src/crypto/internal/fips140test/cast_fips140v2.0_test.go b/src/crypto/internal/fips140test/cast_fips140v1.26_test.go index 06e0513a7f..ef79068c38 100644 --- a/src/crypto/internal/fips140test/cast_fips140v2.0_test.go +++ b/src/crypto/internal/fips140test/cast_fips140v1.26_test.go @@ -8,7 +8,7 @@ package fipstest import "crypto/internal/fips140/mldsa" -func fips140v2Conditionals() { +func fips140v126Conditionals() { // ML-DSA sign and verify PCT kMLDSA := 
mldsa.GenerateKey44() // ML-DSA-44 diff --git a/src/crypto/internal/fips140test/cast_test.go b/src/crypto/internal/fips140test/cast_test.go index 5a80006622..817dcb9a35 100644 --- a/src/crypto/internal/fips140test/cast_test.go +++ b/src/crypto/internal/fips140test/cast_test.go @@ -115,7 +115,7 @@ func TestAllCASTs(t *testing.T) { // TestConditionals causes the conditional CASTs and PCTs to be invoked. func TestConditionals(t *testing.T) { - fips140v2Conditionals() + fips140v126Conditionals() // ML-KEM PCT kMLKEM, err := mlkem.GenerateKey768() if err != nil { diff --git a/src/crypto/internal/rand/rand_fipsv1.0.go b/src/crypto/internal/rand/rand_fips140v1.0.go index 29eba7e0bc..29eba7e0bc 100644 --- a/src/crypto/internal/rand/rand_fipsv1.0.go +++ b/src/crypto/internal/rand/rand_fips140v1.0.go diff --git a/src/crypto/internal/rand/rand_fipsv2.0.go b/src/crypto/internal/rand/rand_fips140v1.26.go index 0dc18e7883..0dc18e7883 100644 --- a/src/crypto/internal/rand/rand_fipsv2.0.go +++ b/src/crypto/internal/rand/rand_fips140v1.26.go diff --git a/src/crypto/tls/conn.go b/src/crypto/tls/conn.go index c04c7a506e..a840125a45 100644 --- a/src/crypto/tls/conn.go +++ b/src/crypto/tls/conn.go @@ -224,6 +224,9 @@ func (hc *halfConn) changeCipherSpec() error { return nil } +// setTrafficSecret sets the traffic secret for the given encryption level. setTrafficSecret +// should not be called directly, but rather through the Conn setWriteTrafficSecret and +// setReadTrafficSecret wrapper methods. 
func (hc *halfConn) setTrafficSecret(suite *cipherSuiteTLS13, level QUICEncryptionLevel, secret []byte) { hc.trafficSecret = secret hc.level = level @@ -1339,9 +1342,6 @@ func (c *Conn) handleKeyUpdate(keyUpdate *keyUpdateMsg) error { return c.in.setErrorLocked(c.sendAlert(alertInternalError)) } - newSecret := cipherSuite.nextTrafficSecret(c.in.trafficSecret) - c.in.setTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret) - if keyUpdate.updateRequested { c.out.Lock() defer c.out.Unlock() @@ -1359,7 +1359,12 @@ func (c *Conn) handleKeyUpdate(keyUpdate *keyUpdateMsg) error { } newSecret := cipherSuite.nextTrafficSecret(c.out.trafficSecret) - c.out.setTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret) + c.setWriteTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret) + } + + newSecret := cipherSuite.nextTrafficSecret(c.in.trafficSecret) + if err := c.setReadTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret); err != nil { + return err } return nil @@ -1576,7 +1581,9 @@ func (c *Conn) handshakeContext(ctx context.Context) (ret error) { // Provide the 1-RTT read secret now that the handshake is complete. // The QUIC layer MUST NOT decrypt 1-RTT packets prior to completing // the handshake (RFC 9001, Section 5.7). - c.quicSetReadSecret(QUICEncryptionLevelApplication, c.cipherSuite, c.in.trafficSecret) + if err := c.quicSetReadSecret(QUICEncryptionLevelApplication, c.cipherSuite, c.in.trafficSecret); err != nil { + return err + } } else { c.out.Lock() a, ok := errors.AsType[alert](c.out.err) @@ -1672,3 +1679,25 @@ func (c *Conn) VerifyHostname(host string) error { } return c.peerCertificates[0].VerifyHostname(host) } + +// setReadTrafficSecret sets the read traffic secret for the given encryption level. If +// being called at the same time as setWriteTrafficSecret, the caller must ensure the call +// to setWriteTrafficSecret happens first so any alerts are sent at the write level. 
+func (c *Conn) setReadTrafficSecret(suite *cipherSuiteTLS13, level QUICEncryptionLevel, secret []byte) error { + // Ensure that there are no buffered handshake messages before changing the + // read keys, since that can cause messages to be parsed that were encrypted + // using old keys which are no longer appropriate. + if c.hand.Len() != 0 { + c.sendAlert(alertUnexpectedMessage) + return errors.New("tls: handshake buffer not empty before setting read traffic secret") + } + c.in.setTrafficSecret(suite, level, secret) + return nil +} + +// setWriteTrafficSecret sets the write traffic secret for the given encryption level. If +// being called at the same time as setReadTrafficSecret, the caller must ensure the call +// to setWriteTrafficSecret happens first so any alerts are sent at the write level. +func (c *Conn) setWriteTrafficSecret(suite *cipherSuiteTLS13, level QUICEncryptionLevel, secret []byte) { + c.out.setTrafficSecret(suite, level, secret) +} diff --git a/src/crypto/tls/handshake_client_tls13.go b/src/crypto/tls/handshake_client_tls13.go index e696bd3a13..77a24b4a78 100644 --- a/src/crypto/tls/handshake_client_tls13.go +++ b/src/crypto/tls/handshake_client_tls13.go @@ -490,16 +490,17 @@ func (hs *clientHandshakeStateTLS13) establishHandshakeKeys() error { handshakeSecret := earlySecret.HandshakeSecret(sharedKey) clientSecret := handshakeSecret.ClientHandshakeTrafficSecret(hs.transcript) - c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret) + c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret) serverSecret := handshakeSecret.ServerHandshakeTrafficSecret(hs.transcript) - c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret) + if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret); err != nil { + return err + } if c.quic != nil { - if c.hand.Len() != 0 { - c.sendAlert(alertUnexpectedMessage) - } c.quicSetWriteSecret(QUICEncryptionLevelHandshake, 
hs.suite.id, clientSecret) - c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, serverSecret) + if err := c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, serverSecret); err != nil { + return err + } } err = c.config.writeKeyLog(keyLogLabelClientHandshake, hs.hello.random, clientSecret) @@ -710,7 +711,9 @@ func (hs *clientHandshakeStateTLS13) readServerFinished() error { hs.trafficSecret = hs.masterSecret.ClientApplicationTrafficSecret(hs.transcript) serverSecret := hs.masterSecret.ServerApplicationTrafficSecret(hs.transcript) - c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret) + if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret); err != nil { + return err + } err = c.config.writeKeyLog(keyLogLabelClientTraffic, hs.hello.random, hs.trafficSecret) if err != nil { @@ -813,16 +816,13 @@ func (hs *clientHandshakeStateTLS13) sendClientFinished() error { return err } - c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret) + c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret) if !c.config.SessionTicketsDisabled && c.config.ClientSessionCache != nil { c.resumptionSecret = hs.masterSecret.ResumptionMasterSecret(hs.transcript) } if c.quic != nil { - if c.hand.Len() != 0 { - c.sendAlert(alertUnexpectedMessage) - } c.quicSetWriteSecret(QUICEncryptionLevelApplication, hs.suite.id, hs.trafficSecret) } diff --git a/src/crypto/tls/handshake_server_tls13.go b/src/crypto/tls/handshake_server_tls13.go index 3bed1359a3..b066924e29 100644 --- a/src/crypto/tls/handshake_server_tls13.go +++ b/src/crypto/tls/handshake_server_tls13.go @@ -410,7 +410,9 @@ func (hs *serverHandshakeStateTLS13) checkForResumption() error { return err } earlyTrafficSecret := hs.earlySecret.ClientEarlyTrafficSecret(transcript) - c.quicSetReadSecret(QUICEncryptionLevelEarly, hs.suite.id, earlyTrafficSecret) + if err := 
c.quicSetReadSecret(QUICEncryptionLevelEarly, hs.suite.id, earlyTrafficSecret); err != nil { + return err + } } c.didResume = true @@ -514,6 +516,14 @@ func (hs *serverHandshakeStateTLS13) sendDummyChangeCipherSpec() error { func (hs *serverHandshakeStateTLS13) doHelloRetryRequest(selectedGroup CurveID) (*keyShare, error) { c := hs.c + // Make sure the client didn't send extra handshake messages alongside + // their initial client_hello. If they sent two client_hello messages, + // we will consume the second before they respond to the server_hello. + if c.hand.Len() != 0 { + c.sendAlert(alertUnexpectedMessage) + return nil, errors.New("tls: handshake buffer not empty before HelloRetryRequest") + } + // The first ClientHello gets double-hashed into the transcript upon a // HelloRetryRequest. See RFC 8446, Section 4.4.1. if err := transcriptMsg(hs.clientHello, hs.transcript); err != nil { @@ -733,17 +743,18 @@ func (hs *serverHandshakeStateTLS13) sendServerParameters() error { } hs.handshakeSecret = earlySecret.HandshakeSecret(hs.sharedKey) - clientSecret := hs.handshakeSecret.ClientHandshakeTrafficSecret(hs.transcript) - c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret) serverSecret := hs.handshakeSecret.ServerHandshakeTrafficSecret(hs.transcript) - c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret) + c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret) + clientSecret := hs.handshakeSecret.ClientHandshakeTrafficSecret(hs.transcript) + if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret); err != nil { + return err + } if c.quic != nil { - if c.hand.Len() != 0 { - c.sendAlert(alertUnexpectedMessage) - } c.quicSetWriteSecret(QUICEncryptionLevelHandshake, hs.suite.id, serverSecret) - c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, clientSecret) + if err := c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, clientSecret); err != nil { 
+ return err + } } err := c.config.writeKeyLog(keyLogLabelClientHandshake, hs.clientHello.random, clientSecret) @@ -887,13 +898,9 @@ func (hs *serverHandshakeStateTLS13) sendServerFinished() error { hs.trafficSecret = hs.masterSecret.ClientApplicationTrafficSecret(hs.transcript) serverSecret := hs.masterSecret.ServerApplicationTrafficSecret(hs.transcript) - c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret) + c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret) if c.quic != nil { - if c.hand.Len() != 0 { - // TODO: Handle this in setTrafficSecret? - c.sendAlert(alertUnexpectedMessage) - } c.quicSetWriteSecret(QUICEncryptionLevelApplication, hs.suite.id, serverSecret) } @@ -1123,7 +1130,9 @@ func (hs *serverHandshakeStateTLS13) readClientFinished() error { return errors.New("tls: invalid client finished hash") } - c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret) + if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret); err != nil { + return err + } return nil } diff --git a/src/crypto/tls/handshake_test.go b/src/crypto/tls/handshake_test.go index 6e15459a9a..9cea8182d0 100644 --- a/src/crypto/tls/handshake_test.go +++ b/src/crypto/tls/handshake_test.go @@ -7,6 +7,7 @@ package tls import ( "bufio" "bytes" + "context" "crypto/ed25519" "crypto/x509" "encoding/hex" @@ -638,3 +639,142 @@ var clientEd25519KeyPEM = testingKey(` -----BEGIN TESTING KEY----- MC4CAQAwBQYDK2VwBCIEINifzf07d9qx3d44e0FSbV4mC/xQxT644RRbpgNpin7I -----END TESTING KEY-----`) + +func TestServerHelloTrailingMessage(t *testing.T) { + // In TLS 1.3 the change cipher spec message is optional. If a CCS message + // is not sent, after reading the ServerHello, the read traffic secret is + // set, and all following messages must be encrypted. 
If the server sends + // additional unencrypted messages in a record with the ServerHello, the + // client must either fail or ignore the additional messages. + + c, s := localPipe(t) + go func() { + ctx := context.Background() + srv := Server(s, testConfig) + clientHello, _, err := srv.readClientHello(ctx) + if err != nil { + testFatal(t, err) + } + + hs := serverHandshakeStateTLS13{ + c: srv, + ctx: ctx, + clientHello: clientHello, + } + if err := hs.processClientHello(); err != nil { + testFatal(t, err) + } + if err := transcriptMsg(hs.clientHello, hs.transcript); err != nil { + testFatal(t, err) + } + + record, err := concatHandshakeMessages(hs.hello, &encryptedExtensionsMsg{alpnProtocol: "h2"}) + if err != nil { + testFatal(t, err) + } + + if _, err := s.Write(record); err != nil { + testFatal(t, err) + } + srv.Close() + }() + + cli := Client(c, testConfig) + expectedErr := "tls: handshake buffer not empty before setting read traffic secret" + if err := cli.Handshake(); err == nil { + t.Fatal("expected error from incomplete handshake, got nil") + } else if err.Error() != expectedErr { + t.Fatalf("expected error %q, got %q", expectedErr, err.Error()) + } +} + +func TestClientHelloTrailingMessage(t *testing.T) { + // Same as TestServerHelloTrailingMessage but for the client side. 
+ + c, s := localPipe(t) + go func() { + cli := Client(c, testConfig) + + hello, _, _, err := cli.makeClientHello() + if err != nil { + testFatal(t, err) + } + + record, err := concatHandshakeMessages(hello, &certificateMsgTLS13{}) + if err != nil { + testFatal(t, err) + } + + if _, err := c.Write(record); err != nil { + testFatal(t, err) + } + cli.Close() + }() + + srv := Server(s, testConfig) + expectedErr := "tls: handshake buffer not empty before setting read traffic secret" + if err := srv.Handshake(); err == nil { + t.Fatal("expected error from incomplete handshake, got nil") + } else if err.Error() != expectedErr { + t.Fatalf("expected error %q, got %q", expectedErr, err.Error()) + } +} + +func TestDoubleClientHelloHRR(t *testing.T) { + // If a client sends two ClientHello messages in a single record, and the + // server sends a HRR after reading the first ClientHello, the server must + // either fail or ignore the trailing ClientHello. + + c, s := localPipe(t) + + go func() { + cli := Client(c, testConfig) + + hello, _, _, err := cli.makeClientHello() + if err != nil { + testFatal(t, err) + } + hello.keyShares = nil + + record, err := concatHandshakeMessages(hello, hello) + if err != nil { + testFatal(t, err) + } + + if _, err := c.Write(record); err != nil { + testFatal(t, err) + } + cli.Close() + }() + + srv := Server(s, testConfig) + expectedErr := "tls: handshake buffer not empty before HelloRetryRequest" + if err := srv.Handshake(); err == nil { + t.Fatal("expected error from incomplete handshake, got nil") + } else if err.Error() != expectedErr { + t.Fatalf("expected error %q, got %q", expectedErr, err.Error()) + } +} + +// concatHandshakeMessages marshals and concatenates the given handshake +// messages into a single record. 
+func concatHandshakeMessages(msgs ...handshakeMessage) ([]byte, error) { + var marshalled []byte + for _, msg := range msgs { + data, err := msg.marshal() + if err != nil { + return nil, err + } + marshalled = append(marshalled, data...) + } + m := len(marshalled) + outBuf := make([]byte, recordHeaderLen) + outBuf[0] = byte(recordTypeHandshake) + vers := VersionTLS12 + outBuf[1] = byte(vers >> 8) + outBuf[2] = byte(vers) + outBuf[3] = byte(m >> 8) + outBuf[4] = byte(m) + outBuf = append(outBuf, marshalled...) + return outBuf, nil +} diff --git a/src/crypto/tls/quic.go b/src/crypto/tls/quic.go index b3f95dbb18..76b7eb2cbd 100644 --- a/src/crypto/tls/quic.go +++ b/src/crypto/tls/quic.go @@ -402,13 +402,22 @@ func (c *Conn) quicReadHandshakeBytes(n int) error { return nil } -func (c *Conn) quicSetReadSecret(level QUICEncryptionLevel, suite uint16, secret []byte) { +func (c *Conn) quicSetReadSecret(level QUICEncryptionLevel, suite uint16, secret []byte) error { + // Ensure that there are no buffered handshake messages before changing the + // read keys, since that can cause messages to be parsed that were encrypted + // using old keys which are no longer appropriate. + // TODO(roland): we should merge this check with the similar one in setReadTrafficSecret. 
+ if c.hand.Len() != 0 { + c.sendAlert(alertUnexpectedMessage) + return errors.New("tls: handshake buffer not empty before setting read traffic secret") + } c.quic.events = append(c.quic.events, QUICEvent{ Kind: QUICSetReadSecret, Level: level, Suite: suite, Data: secret, }) + return nil } func (c *Conn) quicSetWriteSecret(level QUICEncryptionLevel, suite uint16, secret []byte) { diff --git a/src/debug/pe/file.go b/src/debug/pe/file.go index ed63a11cb6..91b7d1dca1 100644 --- a/src/debug/pe/file.go +++ b/src/debug/pe/file.go @@ -379,7 +379,11 @@ func (f *File) ImportedSymbols() ([]string, error) { } // seek to the virtual address specified in the import data directory - d = d[idd.VirtualAddress-ds.VirtualAddress:] + seek := idd.VirtualAddress - ds.VirtualAddress + if seek >= uint32(len(d)) { + return nil, errors.New("optional header data directory virtual size doesn't fit within data seek") + } + d = d[seek:] // start decoding the import directory var ida []ImportDirectory @@ -408,9 +412,16 @@ func (f *File) ImportedSymbols() ([]string, error) { dt.dll, _ = getString(names, int(dt.Name-ds.VirtualAddress)) d, _ = ds.Data() // seek to OriginalFirstThunk - d = d[dt.OriginalFirstThunk-ds.VirtualAddress:] + seek := dt.OriginalFirstThunk - ds.VirtualAddress + if seek >= uint32(len(d)) { + return nil, errors.New("import directory original first thunk doesn't fit within data seek") + } + d = d[seek:] for len(d) > 0 { if pe64 { // 64bit + if len(d) < 8 { + return nil, errors.New("thunk parsing needs at least 8-bytes") + } va := binary.LittleEndian.Uint64(d[0:8]) d = d[8:] if va == 0 { @@ -423,6 +434,9 @@ func (f *File) ImportedSymbols() ([]string, error) { all = append(all, fn+":"+dt.dll) } } else { // 32bit + if len(d) <= 4 { + return nil, errors.New("thunk parsing needs at least 5-bytes") + } va := binary.LittleEndian.Uint32(d[0:4]) d = d[4:] if va == 0 { diff --git a/src/encoding/gob/doc.go b/src/encoding/gob/doc.go index c746806887..390f25088e 100644 --- 
a/src/encoding/gob/doc.go +++ b/src/encoding/gob/doc.go @@ -153,16 +153,16 @@ are transmitted, even if all the elements are zero. Structs are sent as a sequence of (field number, field value) pairs. The field value is sent using the standard gob encoding for its type, recursively. If a -field has the zero value for its type (except for arrays; see above), it is omitted -from the transmission. The field number is defined by the type of the encoded -struct: the first field of the encoded type is field 0, the second is field 1, -etc. When encoding a value, the field numbers are delta encoded for efficiency -and the fields are always sent in order of increasing field number; the deltas are -therefore unsigned. The initialization for the delta encoding sets the field -number to -1, so an unsigned integer field 0 with value 7 is transmitted as unsigned -delta = 1, unsigned value = 7 or (01 07). Finally, after all the fields have been -sent a terminating mark denotes the end of the struct. That mark is a delta=0 -value, which has representation (00). +field has the zero value for its type (except for arrays; see above) or it's a +pointer to a zero value, it is omitted from the transmission. The field number +is defined by the type of the encoded struct: the first field of the encoded type +is field 0, the second is field 1, etc. When encoding a value, the field numbers +are delta encoded for efficiency and the fields are always sent in order of +increasing field number; the deltas are therefore unsigned. The initialization +for the delta encoding sets the field number to -1, so an unsigned integer field 0 +with value 7 is transmitted as unsigned delta = 1, unsigned value = 7 or (01 07). +Finally, after all the fields have been sent a terminating mark denotes the end +of the struct. That mark is a delta=0 value, which has representation (00). 
Interface types are not checked for compatibility; all interface types are treated, for transmission, as members of a single "interface" type, analogous to diff --git a/src/errors/join.go b/src/errors/join.go index 08a79867c6..730bf7043c 100644 --- a/src/errors/join.go +++ b/src/errors/join.go @@ -27,16 +27,6 @@ func Join(errs ...error) error { if n == 0 { return nil } - if n == 1 { - for _, err := range errs { - if _, ok := err.(interface { - Unwrap() []error - }); ok { - return err - } - } - } - e := &joinError{ errs: make([]error, 0, n), } diff --git a/src/errors/join_test.go b/src/errors/join_test.go index 439b372ca0..8ee4d7f77b 100644 --- a/src/errors/join_test.go +++ b/src/errors/join_test.go @@ -25,6 +25,7 @@ func TestJoinReturnsNil(t *testing.T) { func TestJoin(t *testing.T) { err1 := errors.New("err1") err2 := errors.New("err2") + merr := multiErr{errors.New("err3")} for _, test := range []struct { errs []error want []error @@ -37,6 +38,9 @@ func TestJoin(t *testing.T) { }, { errs: []error{err1, nil, err2}, want: []error{err1, err2}, + }, { + errs: []error{merr}, + want: []error{merr}, }} { got := errors.Join(test.errs...).(interface{ Unwrap() []error }).Unwrap() if !reflect.DeepEqual(got, test.want) { @@ -70,37 +74,3 @@ func TestJoinErrorMethod(t *testing.T) { } } } - -func BenchmarkJoin(b *testing.B) { - for _, bb := range []struct { - name string - errs []error - }{ - { - name: "no error", - }, - { - name: "single non-nil error", - errs: []error{errors.New("err")}, - }, - { - name: "multiple errors", - errs: []error{errors.New("err"), errors.New("newerr"), errors.New("newerr2")}, - }, - { - name: "unwrappable single error", - errs: []error{errors.Join(errors.New("err"))}, - }, - { - name: "nil first error", - errs: []error{nil, errors.New("newerr")}, - }, - } { - b.Run(bb.name, func(b *testing.B) { - b.ReportAllocs() - for i := 0; i < b.N; i++ { - _ = errors.Join(bb.errs...) 
- } - }) - } -} diff --git a/src/go/doc/comment_test.go b/src/go/doc/comment_test.go index 0e7de3eb78..2569e73c7c 100644 --- a/src/go/doc/comment_test.go +++ b/src/go/doc/comment_test.go @@ -24,12 +24,12 @@ func TestComment(t *testing.T) { pkg := New(pkgs["pkgdoc"], "testdata/pkgdoc", 0) var ( - input = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n" - wantHTML = `<p><a href="#T">T</a> and <a href="#U">U</a> are types, and <a href="#T.M">T.M</a> is a method, but [V] is a broken link. <a href="/math/rand#Int">rand.Int</a> and <a href="/crypto/rand#Reader">crand.Reader</a> are things. <a href="#G.M1">G.M1</a> and <a href="#G.M2">G.M2</a> are generic methods. <a href="#I.F">I.F</a> is an interface method and [I.V] is a broken link.` + "\n" - wantOldHTML = "<p>[T] and [U] are <i>types</i>, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n" - wantMarkdown = "[T](#T) and [U](#U) are types, and [T.M](#T.M) is a method, but \\[V] is a broken link. [rand.Int](/math/rand#Int) and [crand.Reader](/crypto/rand#Reader) are things. [G.M1](#G.M1) and [G.M2](#G.M2) are generic methods. [I.F](#I.F) is an interface method and \\[I.V] is a broken link.\n" - wantText = "T and U are types, and T.M is a method, but [V] is a broken link. rand.Int and\ncrand.Reader are things. G.M1 and G.M2 are generic methods. I.F is an interface\nmethod and [I.V] is a broken link.\n" - wantOldText = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link.\n[rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods.\n[I.F] is an interface method and [I.V] is a broken link.\n" + input = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. 
[rand.Int] and [crand.Reader] are things. [G.X] is a field, [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n" + wantHTML = `<p><a href="#T">T</a> and <a href="#U">U</a> are types, and <a href="#T.M">T.M</a> is a method, but [V] is a broken link. <a href="/math/rand#Int">rand.Int</a> and <a href="/crypto/rand#Reader">crand.Reader</a> are things. <a href="#G.X">G.X</a> is a field, <a href="#G.M1">G.M1</a> and <a href="#G.M2">G.M2</a> are generic methods. <a href="#I.F">I.F</a> is an interface method and [I.V] is a broken link.` + "\n" + wantOldHTML = "<p>[T] and [U] are <i>types</i>, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.X] is a field, [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n" + wantMarkdown = "[T](#T) and [U](#U) are types, and [T.M](#T.M) is a method, but \\[V] is a broken link. [rand.Int](/math/rand#Int) and [crand.Reader](/crypto/rand#Reader) are things. [G.X](#G.X) is a field, [G.M1](#G.M1) and [G.M2](#G.M2) are generic methods. [I.F](#I.F) is an interface method and \\[I.V] is a broken link.\n" + wantText = "T and U are types, and T.M is a method, but [V] is a broken link. rand.Int and\ncrand.Reader are things. G.X is a field, G.M1 and G.M2 are generic methods.\nI.F is an interface method and [I.V] is a broken link.\n" + wantOldText = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link.\n[rand.Int] and [crand.Reader] are things. [G.X] is a field, [G.M1] and [G.M2]\nare generic methods. [I.F] is an interface method and [I.V] is a broken link.\n" wantSynopsis = "T and U are types, and T.M is a method, but [V] is a broken link." wantOldSynopsis = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link." 
) diff --git a/src/go/doc/doc.go b/src/go/doc/doc.go index 0c23f1a46c..8c786896fd 100644 --- a/src/go/doc/doc.go +++ b/src/go/doc/doc.go @@ -168,6 +168,7 @@ func (p *Package) collectTypes(types []*Type) { p.collectFuncs(t.Funcs) p.collectFuncs(t.Methods) p.collectInterfaceMethods(t) + p.collectStructFields(t) } } @@ -212,6 +213,24 @@ func (p *Package) collectInterfaceMethods(t *Type) { } } +func (p *Package) collectStructFields(t *Type) { + for _, s := range t.Decl.Specs { + spec, ok := s.(*ast.TypeSpec) + if !ok { + continue + } + list, isStruct := fields(spec.Type) + if !isStruct { + continue + } + for _, field := range list { + for _, name := range field.Names { + p.syms[t.Name+"."+name.Name] = true + } + } + } +} + // NewFromFiles computes documentation for a package. // // The package is specified by a list of *ast.Files and corresponding diff --git a/src/go/doc/example.go b/src/go/doc/example.go index ba1f863df0..8c01bf0a8d 100644 --- a/src/go/doc/example.go +++ b/src/go/doc/example.go @@ -74,6 +74,9 @@ func Examples(testFiles ...*ast.File) []*Example { if params := f.Type.Params; len(params.List) != 0 { continue // function has params; not a valid example } + if results := f.Type.Results; results != nil && len(results.List) != 0 { + continue // function has results; not a valid example + } if f.Body == nil { // ast.File.Body nil dereference (see issue 28044) continue } diff --git a/src/go/doc/example_test.go b/src/go/doc/example_test.go index 2fd54f8abb..db2b2d34cd 100644 --- a/src/go/doc/example_test.go +++ b/src/go/doc/example_test.go @@ -228,6 +228,8 @@ func ExampleFunc1_foo() {} func ExampleFunc1_foo_suffix() {} func ExampleFunc1_foo_Suffix() {} // matches Func1, instead of Func1_foo func Examplefunc1() {} // invalid - cannot match unexported +func ExampleFunc1_params(a int) {} // invalid - has parameter +func ExampleFunc1_results() int {} // invalid - has results func ExampleType1_Func1() {} func ExampleType1_Func1_() {} // invalid - suffix must start 
with a lower-case letter diff --git a/src/go/doc/testdata/pkgdoc/doc.go b/src/go/doc/testdata/pkgdoc/doc.go index d542dc2cdd..24e127c7fb 100644 --- a/src/go/doc/testdata/pkgdoc/doc.go +++ b/src/go/doc/testdata/pkgdoc/doc.go @@ -18,7 +18,7 @@ func (T) M() {} var _ = rand.Int var _ = crand.Reader -type G[T any] struct{ x T } +type G[T any] struct{ X T } func (g G[T]) M1() {} func (g *G[T]) M2() {} diff --git a/src/internal/coverage/decodemeta/decodefile.go b/src/internal/coverage/decodemeta/decodefile.go index 6f4dd1a3ec..474844bf97 100644 --- a/src/internal/coverage/decodemeta/decodefile.go +++ b/src/internal/coverage/decodemeta/decodefile.go @@ -75,7 +75,7 @@ func (r *CoverageMetaFileReader) readFileHeader() error { // Vet the version. If this is a meta-data file from the future, // we won't be able to read it. if r.hdr.Version > coverage.MetaFileVersion { - return fmt.Errorf("meta-data file withn unknown version %d (expected %d)", r.hdr.Version, coverage.MetaFileVersion) + return fmt.Errorf("meta-data file with an unknown version %d (expected %d)", r.hdr.Version, coverage.MetaFileVersion) } // Read package offsets for good measure diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index 4610ce807e..711fb045c3 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go @@ -219,7 +219,7 @@ func doinit() { if eax7 >= 1 { eax71, _, _, _ := cpuid(7, 1) if X86.HasAVX { - X86.HasAVXVNNI = isSet(4, eax71) + X86.HasAVXVNNI = isSet(eax71, cpuid_AVXVNNI) } } diff --git a/src/net/rpc/server.go b/src/net/rpc/server.go index 4233a426fe..961145c6f2 100644 --- a/src/net/rpc/server.go +++ b/src/net/rpc/server.go @@ -202,7 +202,7 @@ func NewServer() *Server { // DefaultServer is the default instance of [*Server]. var DefaultServer = NewServer() -// Is this type exported or a builtin? 
+// isExportedOrBuiltinType reports whether t is an exported or builtin type func isExportedOrBuiltinType(t reflect.Type) bool { for t.Kind() == reflect.Pointer { t = t.Elem() diff --git a/src/os/exec/exec.go b/src/os/exec/exec.go index e84ebfc453..aa7a6be7f0 100644 --- a/src/os/exec/exec.go +++ b/src/os/exec/exec.go @@ -102,6 +102,7 @@ import ( "runtime" "strconv" "strings" + "sync/atomic" "syscall" "time" ) @@ -354,6 +355,11 @@ type Cmd struct { // the work of resolving the extension, so Start doesn't need to do it again. // This is only used on Windows. cachedLookExtensions struct{ in, out string } + + // startCalled records that Start was attempted, regardless of outcome. + // (Until go.dev/issue/77075 is resolved, we use atomic.SwapInt32, + // not atomic.Bool.Swap, to avoid triggering the copylocks vet check.) + startCalled int32 } // A ctxResult reports the result of watching the Context associated with a @@ -635,7 +641,8 @@ func (c *Cmd) Run() error { func (c *Cmd) Start() error { // Check for doubled Start calls before we defer failure cleanup. If the prior // call to Start succeeded, we don't want to spuriously close its pipes. - if c.Process != nil { + // It is an error to call Start twice even if the first call did not create a process. + if atomic.SwapInt32(&c.startCalled, 1) != 0 { return errors.New("exec: already started") } @@ -647,6 +654,7 @@ func (c *Cmd) Start() error { if !started { closeDescriptors(c.parentIOPipes) c.parentIOPipes = nil + c.goroutine = nil // aid GC, finalization of pipe fds } }() diff --git a/src/os/exec/exec_test.go b/src/os/exec/exec_test.go index 1decebdc22..bf2f3da535 100644 --- a/src/os/exec/exec_test.go +++ b/src/os/exec/exec_test.go @@ -1839,3 +1839,29 @@ func TestAbsPathExec(t *testing.T) { } }) } + +// Calling Start twice is an error, regardless of outcome. 
+func TestStart_twice(t *testing.T) { + testenv.MustHaveExec(t) + + cmd := exec.Command("/bin/nonesuch") + for i, want := range []string{ + cond(runtime.GOOS == "windows", + `exec: "/bin/nonesuch": executable file not found in %PATH%`, + "fork/exec /bin/nonesuch: no such file or directory"), + "exec: already started", + } { + err := cmd.Start() + if got := fmt.Sprint(err); got != want { + t.Errorf("Start call #%d return err %q, want %q", i+1, got, want) + } + } +} + +func cond[T any](cond bool, t, f T) T { + if cond { + return t + } else { + return f + } +} diff --git a/src/reflect/value.go b/src/reflect/value.go index 7f0ec2a397..8c8acbaa9a 100644 --- a/src/reflect/value.go +++ b/src/reflect/value.go @@ -362,6 +362,7 @@ func (v Value) CanSet() bool { // type of the function's corresponding input parameter. // If v is a variadic function, Call creates the variadic slice parameter // itself, copying in the corresponding values. +// It panics if the Value was obtained by accessing unexported struct fields. func (v Value) Call(in []Value) []Value { v.mustBe(Func) v.mustBeExported() @@ -375,6 +376,7 @@ func (v Value) Call(in []Value) []Value { // It returns the output results as Values. // As in Go, each input argument must be assignable to the // type of the function's corresponding input parameter. +// It panics if the Value was obtained by accessing unexported struct fields. 
func (v Value) CallSlice(in []Value) []Value { v.mustBe(Func) v.mustBeExported() diff --git a/src/regexp/find_test.go b/src/regexp/find_test.go index 49e9619cef..5b446c29cb 100644 --- a/src/regexp/find_test.go +++ b/src/regexp/find_test.go @@ -159,23 +159,23 @@ func TestFind(t *testing.T) { for _, test := range findTests { re := MustCompile(test.pat) if re.String() != test.pat { - t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) + t.Errorf("re.String() = %q, want %q", re.String(), test.pat) } result := re.Find([]byte(test.text)) switch { case len(test.matches) == 0 && len(result) == 0: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case test.matches != nil && result != nil: - expect := test.text[test.matches[0][0]:test.matches[0][1]] + want := test.text[test.matches[0][0]:test.matches[0][1]] if len(result) != cap(result) { - t.Errorf("expected capacity %d got %d: %s", len(result), cap(result), test) + t.Errorf("got capacity %d, want %d: %s", cap(result), len(result), test) } - if expect != string(result) { - t.Errorf("expected %q got %q: %s", expect, result, test) + if want != string(result) { + t.Errorf("got %q, want %q: %s", result, want, test) } } } @@ -188,16 +188,16 @@ func TestFindString(t *testing.T) { case len(test.matches) == 0 && len(result) == 0: // ok case test.matches == nil && result != "": - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == "": // Tricky because an empty result has two meanings: no match or empty match. 
if test.matches[0][0] != test.matches[0][1] { - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) } case test.matches != nil && result != "": - expect := test.text[test.matches[0][0]:test.matches[0][1]] - if expect != result { - t.Errorf("expected %q got %q: %s", expect, result, test) + want := test.text[test.matches[0][0]:test.matches[0][1]] + if want != result { + t.Errorf("got %q, want %q: %s", result, want, test) } } } @@ -208,13 +208,13 @@ func testFindIndex(test *FindTest, result []int, t *testing.T) { case len(test.matches) == 0 && len(result) == 0: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %v, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case test.matches != nil && result != nil: - expect := test.matches[0] - if expect[0] != result[0] || expect[1] != result[1] { - t.Errorf("expected %v got %v: %s", expect, result, test) + want := test.matches[0] + if want[0] != result[0] || want[1] != result[1] { + t.Errorf("got %v, want %v: %s", result, want, test) } } } @@ -246,22 +246,22 @@ func TestFindAll(t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == nil: - t.Fatalf("expected match; got none: %s", test) + t.Fatalf("got no match, want one: %s", test) case test.matches != nil && result != nil: if len(test.matches) != len(result) { - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test) continue } for k, e := range test.matches { got := result[k] if len(got) != cap(got) { - t.Errorf("match %d: expected capacity %d got 
%d: %s", k, len(got), cap(got), test) + t.Errorf("match %d: got capacity %d, want %d: %s", k, cap(got), len(got), test) } - expect := test.text[e[0]:e[1]] - if expect != string(got) { - t.Errorf("match %d: expected %q got %q: %s", k, expect, got, test) + want := test.text[e[0]:e[1]] + if want != string(got) { + t.Errorf("match %d: got %q, want %q: %s", k, got, want, test) } } } @@ -275,18 +275,18 @@ func TestFindAllString(t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case test.matches != nil && result != nil: if len(test.matches) != len(result) { - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test) continue } for k, e := range test.matches { - expect := test.text[e[0]:e[1]] - if expect != result[k] { - t.Errorf("expected %q got %q: %s", expect, result, test) + want := test.text[e[0]:e[1]] + if want != result[k] { + t.Errorf("got %q, want %q: %s", result[k], want, test) } } } @@ -298,17 +298,17 @@ func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %v, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case test.matches != nil && result != nil: if len(test.matches) != len(result) { - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test) return } for k, e 
:= range test.matches { if e[0] != result[k][0] || e[1] != result[k][1] { - t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) + t.Errorf("match %d: got %v, want %v: %s", k, result[k], e, test) } } } @@ -330,24 +330,24 @@ func TestFindAllStringIndex(t *testing.T) { func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { if len(submatches) != len(result)*2 { - t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + t.Errorf("match %d: got %d submatches, want %d: %s", n, len(result), len(submatches)/2, test) return } for k := 0; k < len(submatches); k += 2 { if submatches[k] == -1 { if result[k/2] != nil { - t.Errorf("match %d: expected nil got %q: %s", n, result, test) + t.Errorf("match %d: got %q, want nil: %s", n, result, test) } continue } got := result[k/2] if len(got) != cap(got) { - t.Errorf("match %d: expected capacity %d got %d: %s", n, len(got), cap(got), test) + t.Errorf("match %d: got capacity %d, want %d: %s", n, cap(got), len(got), test) return } - expect := test.text[submatches[k]:submatches[k+1]] - if expect != string(got) { - t.Errorf("match %d: expected %q got %q: %s", n, expect, got, test) + want := test.text[submatches[k]:submatches[k+1]] + if want != string(got) { + t.Errorf("match %d: got %q, want %q: %s", n, got, want, test) return } } @@ -360,9 +360,9 @@ func TestFindSubmatch(t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case test.matches != nil && result != nil: testSubmatchBytes(&test, 0, test.matches[0], result, t) } @@ -371,19 +371,19 @@ func TestFindSubmatch(t *testing.T) { func testSubmatchString(test *FindTest, n int, submatches 
[]int, result []string, t *testing.T) { if len(submatches) != len(result)*2 { - t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + t.Errorf("match %d: got %d submatches, want %d: %s", n, len(result), len(submatches)/2, test) return } for k := 0; k < len(submatches); k += 2 { if submatches[k] == -1 { if result[k/2] != "" { - t.Errorf("match %d: expected nil got %q: %s", n, result, test) + t.Errorf("match %d: got %q, want empty string: %s", n, result, test) } continue } - expect := test.text[submatches[k]:submatches[k+1]] - if expect != result[k/2] { - t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + want := test.text[submatches[k]:submatches[k+1]] + if want != result[k/2] { + t.Errorf("match %d: got %q, want %q: %s", n, result[k/2], want, test) return } } @@ -396,23 +396,23 @@ func TestFindStringSubmatch(t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case test.matches != nil && result != nil: testSubmatchString(&test, 0, test.matches[0], result, t) } } } -func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { - if len(expect) != len(result) { - t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) +func testSubmatchIndices(test *FindTest, n int, want, result []int, t *testing.T) { + if len(want) != len(result) { + t.Errorf("match %d: got %d matches, want %d: %s", n, len(result)/2, len(want)/2, test) return } - for k, e := range expect { + for k, e := range want { if e != result[k] { - t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) + t.Errorf("match %d: submatch error: got %v, want %v: 
%s", n, result, want, test) } } } @@ -422,9 +422,9 @@ func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %v, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case test.matches != nil && result != nil: testSubmatchIndices(test, 0, test.matches[0], result, t) } @@ -457,11 +457,11 @@ func TestFindAllSubmatch(t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case len(test.matches) != len(result): - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test) case test.matches != nil && result != nil: for k, match := range test.matches { testSubmatchBytes(&test, k, match, result[k], t) @@ -477,11 +477,11 @@ func TestFindAllStringSubmatch(t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %q, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case len(test.matches) != len(result): - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test) case test.matches != nil && result != nil: for k, match := range test.matches { 
testSubmatchString(&test, k, match, result[k], t) @@ -495,11 +495,11 @@ func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { case test.matches == nil && result == nil: // ok case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) + t.Errorf("got match %v, want none: %s", result, test) case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) + t.Errorf("got no match, want one: %s", test) case len(test.matches) != len(result): - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test) case test.matches != nil && result != nil: for k, match := range test.matches { testSubmatchIndices(test, k, match, result[k], t) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index fd79356aba..c08bc7574b 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1213,7 +1213,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { if goexperiment.RuntimeSecret && gp.secret > 0 { // Mark any object allocated while in secret mode as secret. // This ensures we zero it immediately when freeing it. - addSecret(x) + addSecret(x, size) } // Notify sanitizers, if enabled. 
diff --git a/src/runtime/malloc_generated.go b/src/runtime/malloc_generated.go index cf329d2696..2be6a5b6f5 100644 --- a/src/runtime/malloc_generated.go +++ b/src/runtime/malloc_generated.go @@ -156,7 +156,7 @@ func mallocgcSmallScanNoHeaderSC1(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -321,7 +321,7 @@ func mallocgcSmallScanNoHeaderSC2(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -486,7 +486,7 @@ func mallocgcSmallScanNoHeaderSC3(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -651,7 +651,7 @@ func mallocgcSmallScanNoHeaderSC4(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -816,7 +816,7 @@ func mallocgcSmallScanNoHeaderSC5(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -981,7 +981,7 @@ func mallocgcSmallScanNoHeaderSC6(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -1146,7 +1146,7 @@ func mallocgcSmallScanNoHeaderSC7(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -1311,7 +1311,7 @@ func mallocgcSmallScanNoHeaderSC8(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -1476,7 +1476,7 @@ func 
mallocgcSmallScanNoHeaderSC9(size uintptr, typ *_type, needzero bool) unsaf gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -1641,7 +1641,7 @@ func mallocgcSmallScanNoHeaderSC10(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -1806,7 +1806,7 @@ func mallocgcSmallScanNoHeaderSC11(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -1971,7 +1971,7 @@ func mallocgcSmallScanNoHeaderSC12(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -2136,7 +2136,7 @@ func mallocgcSmallScanNoHeaderSC13(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -2301,7 +2301,7 @@ func mallocgcSmallScanNoHeaderSC14(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -2466,7 +2466,7 @@ func mallocgcSmallScanNoHeaderSC15(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -2631,7 +2631,7 @@ func mallocgcSmallScanNoHeaderSC16(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -2796,7 +2796,7 @@ func mallocgcSmallScanNoHeaderSC17(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -2961,7 
+2961,7 @@ func mallocgcSmallScanNoHeaderSC18(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -3126,7 +3126,7 @@ func mallocgcSmallScanNoHeaderSC19(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -3291,7 +3291,7 @@ func mallocgcSmallScanNoHeaderSC20(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -3456,7 +3456,7 @@ func mallocgcSmallScanNoHeaderSC21(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -3621,7 +3621,7 @@ func mallocgcSmallScanNoHeaderSC22(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -3786,7 +3786,7 @@ func mallocgcSmallScanNoHeaderSC23(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -3951,7 +3951,7 @@ func mallocgcSmallScanNoHeaderSC24(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -4116,7 +4116,7 @@ func mallocgcSmallScanNoHeaderSC25(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -4281,7 +4281,7 @@ func mallocgcSmallScanNoHeaderSC26(size uintptr, typ *_type, needzero bool) unsa gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if 
valgrindenabled { @@ -6686,7 +6686,7 @@ func mallocgcSmallNoScanSC2(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -6757,7 +6757,7 @@ func mallocgcSmallNoScanSC2(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -6822,7 +6822,7 @@ func mallocgcSmallNoScanSC3(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -6893,7 +6893,7 @@ func mallocgcSmallNoScanSC3(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -6958,7 +6958,7 @@ func mallocgcSmallNoScanSC4(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7029,7 +7029,7 @@ func mallocgcSmallNoScanSC4(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7094,7 +7094,7 @@ func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7165,7 +7165,7 @@ func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7230,7 +7230,7 @@ func mallocgcSmallNoScanSC6(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + 
addSecret(x, size) } if valgrindenabled { @@ -7301,7 +7301,7 @@ func mallocgcSmallNoScanSC6(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7366,7 +7366,7 @@ func mallocgcSmallNoScanSC7(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7437,7 +7437,7 @@ func mallocgcSmallNoScanSC7(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7502,7 +7502,7 @@ func mallocgcSmallNoScanSC8(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7573,7 +7573,7 @@ func mallocgcSmallNoScanSC8(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7638,7 +7638,7 @@ func mallocgcSmallNoScanSC9(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7709,7 +7709,7 @@ func mallocgcSmallNoScanSC9(size uintptr, typ *_type, needzero bool) unsafe.Poin gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7774,7 +7774,7 @@ func mallocgcSmallNoScanSC10(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7845,7 +7845,7 @@ func mallocgcSmallNoScanSC10(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { 
- addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7910,7 +7910,7 @@ func mallocgcSmallNoScanSC11(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -7981,7 +7981,7 @@ func mallocgcSmallNoScanSC11(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8046,7 +8046,7 @@ func mallocgcSmallNoScanSC12(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8117,7 +8117,7 @@ func mallocgcSmallNoScanSC12(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8182,7 +8182,7 @@ func mallocgcSmallNoScanSC13(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8253,7 +8253,7 @@ func mallocgcSmallNoScanSC13(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8318,7 +8318,7 @@ func mallocgcSmallNoScanSC14(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8389,7 +8389,7 @@ func mallocgcSmallNoScanSC14(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8454,7 +8454,7 @@ func mallocgcSmallNoScanSC15(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && 
gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8525,7 +8525,7 @@ func mallocgcSmallNoScanSC15(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8590,7 +8590,7 @@ func mallocgcSmallNoScanSC16(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8661,7 +8661,7 @@ func mallocgcSmallNoScanSC16(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8726,7 +8726,7 @@ func mallocgcSmallNoScanSC17(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8797,7 +8797,7 @@ func mallocgcSmallNoScanSC17(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8862,7 +8862,7 @@ func mallocgcSmallNoScanSC18(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8933,7 +8933,7 @@ func mallocgcSmallNoScanSC18(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -8998,7 +8998,7 @@ func mallocgcSmallNoScanSC19(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9069,7 +9069,7 @@ func mallocgcSmallNoScanSC19(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if 
goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9134,7 +9134,7 @@ func mallocgcSmallNoScanSC20(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9205,7 +9205,7 @@ func mallocgcSmallNoScanSC20(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9270,7 +9270,7 @@ func mallocgcSmallNoScanSC21(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9341,7 +9341,7 @@ func mallocgcSmallNoScanSC21(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9406,7 +9406,7 @@ func mallocgcSmallNoScanSC22(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9477,7 +9477,7 @@ func mallocgcSmallNoScanSC22(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9542,7 +9542,7 @@ func mallocgcSmallNoScanSC23(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9613,7 +9613,7 @@ func mallocgcSmallNoScanSC23(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9678,7 +9678,7 @@ func mallocgcSmallNoScanSC24(size uintptr, typ *_type, needzero bool) unsafe.Poi 
gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9749,7 +9749,7 @@ func mallocgcSmallNoScanSC24(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9814,7 +9814,7 @@ func mallocgcSmallNoScanSC25(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9885,7 +9885,7 @@ func mallocgcSmallNoScanSC25(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -9950,7 +9950,7 @@ func mallocgcSmallNoScanSC26(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { @@ -10021,7 +10021,7 @@ func mallocgcSmallNoScanSC26(size uintptr, typ *_type, needzero bool) unsafe.Poi gp := getg() if goexperiment.RuntimeSecret && gp.secret > 0 { - addSecret(x) + addSecret(x, size) } if valgrindenabled { diff --git a/src/runtime/malloc_stubs.go b/src/runtime/malloc_stubs.go index 8c424935bf..b395172e4b 100644 --- a/src/runtime/malloc_stubs.go +++ b/src/runtime/malloc_stubs.go @@ -101,7 +101,7 @@ func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer { if goexperiment.RuntimeSecret && gp.secret > 0 { // Mark any object allocated while in secret mode as secret. // This ensures we zero it immediately when freeing it. 
- addSecret(x) + addSecret(x, size) } } diff --git a/src/runtime/mcleanup_test.go b/src/runtime/mcleanup_test.go index 5afe85e103..dfc688a0f2 100644 --- a/src/runtime/mcleanup_test.go +++ b/src/runtime/mcleanup_test.go @@ -331,9 +331,14 @@ func TestCleanupLost(t *testing.T) { } wg.Wait() runtime.GC() - runtime.BlockUntilEmptyCleanupQueue(int64(10 * time.Second)) + timeout := 10 * time.Second + empty := runtime.BlockUntilEmptyCleanupQueue(int64(timeout)) + if !empty { + t.Errorf("failed to drain cleanup queue within %s", timeout) + } + if got := int(got.Load()); got != want { - t.Errorf("expected %d cleanups to be executed, got %d", got, want) + t.Errorf("%d cleanups executed, expected %d", got, want) } } diff --git a/src/runtime/metrics_cgo_test.go b/src/runtime/metrics_cgo_test.go index 6cc9d23195..ef1e3dd71d 100644 --- a/src/runtime/metrics_cgo_test.go +++ b/src/runtime/metrics_cgo_test.go @@ -12,7 +12,7 @@ import ( "testing" ) -func TestNotInGoMetricCallback(t *testing.T) { +func TestNotInGoMetric(t *testing.T) { switch runtime.GOOS { case "windows", "plan9": t.Skip("unsupported on Windows and Plan9") @@ -22,11 +22,22 @@ func TestNotInGoMetricCallback(t *testing.T) { } } - // This test is run in a subprocess to prevent other tests from polluting the metrics - // and because we need to make some cgo callbacks. - output := runTestProg(t, "testprogcgo", "NotInGoMetricCallback") - want := "OK\n" - if output != want { - t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want) + run := func(t *testing.T, name string) { + // This test is run in a subprocess to prevent other tests from polluting the metrics + // and because we need to make some cgo callbacks. 
+ output := runTestProg(t, "testprogcgo", name) + want := "OK\n" + if output != want { + t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want) + } } + t.Run("CgoCall", func(t *testing.T) { + run(t, "NotInGoMetricCgoCall") + }) + t.Run("CgoCallback", func(t *testing.T) { + run(t, "NotInGoMetricCgoCallback") + }) + t.Run("CgoCallAndCallback", func(t *testing.T) { + run(t, "NotInGoMetricCgoCallAndCallback") + }) } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 61dc5457fc..68dfca4668 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -2745,6 +2745,14 @@ type specialPinCounter struct { counter uintptr } +// specialSecret tracks whether we need to zero an object immediately +// upon freeing. +type specialSecret struct { + _ sys.NotInHeap + special special + size uintptr +} + // specialsIter helps iterate over specials lists. type specialsIter struct { pprev **special @@ -2775,6 +2783,12 @@ func (i *specialsIter) unlinkAndNext() *special { // freeSpecial performs any cleanup on special s and deallocates it. // s must already be unlinked from the specials list. +// TODO(mknyszek): p and size together DO NOT represent a valid allocation. +// size is the size of the allocation block in the span (mspan.elemsize), and p is +// whatever pointer the special was attached to, which need not point to the +// beginning of the block, though it may. +// Consider passing the arguments differently to avoid giving the impression +// that p and size together represent an address range. 
func freeSpecial(s *special, p unsafe.Pointer, size uintptr) { switch s.kind { case _KindSpecialFinalizer: @@ -2828,7 +2842,19 @@ func freeSpecial(s *special, p unsafe.Pointer, size uintptr) { mheap_.specialBubbleAlloc.free(unsafe.Pointer(st)) unlock(&mheap_.speciallock) case _KindSpecialSecret: - memclrNoHeapPointers(p, size) + ss := (*specialSecret)(unsafe.Pointer(s)) + // p is the actual byte location that the special was + // attached to, but the size argument is the span + // element size. If we were to zero out using the size + // argument, we'd trounce over adjacent memory in cases + // where the allocation contains a header. Hence, we use + // the user-visible size which we stash in the special itself. + // + // p always points to the beginning of the user-visible + // allocation since the only way to attach a secret special + // is via the allocation path. This isn't universal for + // tiny allocs, but we avoid them in mallocgc anyway. + memclrNoHeapPointers(p, ss.size) lock(&mheap_.speciallock) mheap_.specialSecretAlloc.free(unsafe.Pointer(s)) unlock(&mheap_.speciallock) diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 5ea96f03f5..005c875cbf 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -2455,8 +2455,16 @@ func needm(signal bool) { // mp.curg is now a real goroutine. casgstatus(mp.curg, _Gdeadextra, _Gsyscall) sched.ngsys.Add(-1) - // N.B. We do not update nGsyscallNoP, because isExtraInC threads are not - // counted as real goroutines while they're in C. + + // This is technically inaccurate, but we set isExtraInC to false above, + // and so we need to update addGSyscallNoP to keep the two pieces of state + // consistent (it's only updated when isExtraInC is false). More specifically, + // When we get to cgocallbackg and exitsyscall, we'll be looking for a P, and + // since isExtraInC is false, we will decrement this metric. + // + // The inaccuracy is thankfully transient: only until this thread can get a P. 
+ // We're going into Go anyway, so it's okay to pretend we're a real goroutine now. + addGSyscallNoP(mp) if !signal { if trace.ok() { @@ -5027,7 +5035,7 @@ func exitsyscallTryGetP(oldp *p) *p { if oldp != nil { if thread, ok := setBlockOnExitSyscall(oldp); ok { thread.takeP() - addGSyscallNoP(thread.mp) // takeP does the opposite, but this is a net zero change. + decGSyscallNoP(getg().m) // We got a P for ourselves. thread.resume() return oldp } diff --git a/src/runtime/rt0_freebsd_arm64.s b/src/runtime/rt0_freebsd_arm64.s index a7a952664e..93562c5dd0 100644 --- a/src/runtime/rt0_freebsd_arm64.s +++ b/src/runtime/rt0_freebsd_arm64.s @@ -4,9 +4,12 @@ #include "textflag.h" -// On FreeBSD argc/argv are passed in R0, not RSP +// FreeBSD passes a pointer to the argument block in R0, not RSP, +// so _rt0_arm64 cannot be used. TEXT _rt0_arm64_freebsd(SB),NOSPLIT,$0 - JMP _rt0_arm64(SB) + ADD $8, R0, R1 // argv (use R0 while it's still the pointer) + MOVD 0(R0), R0 // argc + JMP runtime·rt0_go(SB) // When building with -buildmode=c-shared, this symbol is called when the shared // library is loaded. diff --git a/src/runtime/secret.go b/src/runtime/secret.go index 4c199d31d0..8aad63b54f 100644 --- a/src/runtime/secret.go +++ b/src/runtime/secret.go @@ -55,15 +55,9 @@ func secret_eraseSecrets() { // Don't put any code here: the stack frame's contents are gone! } -// specialSecret tracks whether we need to zero an object immediately -// upon freeing. -type specialSecret struct { - special special -} - // addSecret records the fact that we need to zero p immediately // when it is freed. -func addSecret(p unsafe.Pointer) { +func addSecret(p unsafe.Pointer, size uintptr) { // TODO(dmo): figure out the cost of these. 
These are mostly // intended to catch allocations that happen via the runtime // that the user has no control over and not big buffers that user @@ -72,6 +66,7 @@ func addSecret(p unsafe.Pointer) { lock(&mheap_.speciallock) s := (*specialSecret)(mheap_.specialSecretAlloc.alloc()) s.special.kind = _KindSpecialSecret + s.size = size unlock(&mheap_.speciallock) addspecial(p, &s.special, false) } diff --git a/src/runtime/secret/alloc_test.go b/src/runtime/secret/alloc_test.go new file mode 100644 index 0000000000..8f82dad4b5 --- /dev/null +++ b/src/runtime/secret/alloc_test.go @@ -0,0 +1,39 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.runtimesecret && (arm64 || amd64) && linux + +package secret_test + +import ( + "runtime" + "runtime/secret" + "testing" +) + +func TestInterleavedAllocFrees(t *testing.T) { + // Interleave heap objects that are kept alive beyond secret.Do + // with heap objects that do not live past secret.Do. + // The intent is for the clearing of one object (with the wrong size) + // to clobber the type header of the next slot. If the GC sees a nil type header + // when it expects to find one, it can throw. + type T struct { + p *int + x [1024]byte + } + for range 10 { + var s []*T + secret.Do(func() { + for i := range 100 { + t := &T{} + if i%2 == 0 { + s = append(s, t) + } + } + }) + runtime.GC() + runtime.GC() + runtime.KeepAlive(s) + } +} diff --git a/src/runtime/secret/doc.go b/src/runtime/secret/doc.go new file mode 100644 index 0000000000..c0dd4f95a6 --- /dev/null +++ b/src/runtime/secret/doc.go @@ -0,0 +1,15 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build goexperiment.runtimesecret + +// Package secret contains helper functions for zeroing out memory +// that is otherwise invisible to a user program in the service of +// forward secrecy. See https://en.wikipedia.org/wiki/Forward_secrecy for +// more information. +// +// This package (runtime/secret) is experimental, +// and not subject to the Go 1 compatibility promise. +// It only exists when building with the GOEXPERIMENT=runtimesecret environment variable set. +package secret diff --git a/src/runtime/secret/secret.go b/src/runtime/secret/secret.go index 9eae22605f..00a03b2d50 100644 --- a/src/runtime/secret/secret.go +++ b/src/runtime/secret/secret.go @@ -18,12 +18,23 @@ import ( // entire call tree initiated by f.) // - Any registers used by f are erased before Do returns. // - Any stack used by f is erased before Do returns. -// - Any heap allocation done by f is erased as soon as the garbage -// collector realizes that it is no longer reachable. +// - Heap allocations done by f are erased as soon as the garbage +// collector realizes that all allocated values are no longer reachable. // - Do works even if f panics or calls runtime.Goexit. As part of // that, any panic raised by f will appear as if it originates from // Do itself. // +// Users should be cautious of allocating inside Do. +// Erasing heap memory after Do returns may increase garbage collector sweep times and +// requires additional memory to keep track of allocations until they are to be erased. +// These costs can compound when an allocation is done in the service of growing a value, +// like appending to a slice or inserting into a map. In these cases, the entire new allocation is erased rather +// than just the secret parts of it. +// +// To reduce lifetimes of allocations and avoid unexpected performance issues, +// if a function invoked by Do needs to yield a result that shouldn't be erased, +// it should do so by copying the result into an allocation created by the caller. 
+// // Limitations: // - Currently only supported on linux/amd64 and linux/arm64. On unsupported // platforms, Do will invoke f directly. diff --git a/src/runtime/secret/secret_test.go b/src/runtime/secret/secret_test.go index 98d67cf8a4..e2f78c53a0 100644 --- a/src/runtime/secret/secret_test.go +++ b/src/runtime/secret/secret_test.go @@ -19,6 +19,7 @@ import ( "testing" "time" "unsafe" + "weak" ) type secretType int64 @@ -63,28 +64,33 @@ func heapSTiny() *secretType { // are freed. // See runtime/mheap.go:freeSpecial. func TestHeap(t *testing.T) { - var u uintptr + var addr uintptr + var p weak.Pointer[S] Do(func() { - u = uintptr(unsafe.Pointer(heapS())) + sp := heapS() + addr = uintptr(unsafe.Pointer(sp)) + p = weak.Make(sp) }) - - runtime.GC() + waitCollected(t, p) // Check that object got zeroed. - checkRangeForSecret(t, u, u+unsafe.Sizeof(S{})) + checkRangeForSecret(t, addr, addr+unsafe.Sizeof(S{})) // Also check our stack, just because we can. checkStackForSecret(t) } func TestHeapTiny(t *testing.T) { - var u uintptr + var addr uintptr + var p weak.Pointer[secretType] Do(func() { - u = uintptr(unsafe.Pointer(heapSTiny())) + sp := heapSTiny() + addr = uintptr(unsafe.Pointer(sp)) + p = weak.Make(sp) }) - runtime.GC() + waitCollected(t, p) // Check that object got zeroed. - checkRangeForSecret(t, u, u+unsafe.Sizeof(secretType(0))) + checkRangeForSecret(t, addr, addr+unsafe.Sizeof(secretType(0))) // Also check our stack, just because we can. 
checkStackForSecret(t) } @@ -240,6 +246,20 @@ func checkRangeForSecret(t *testing.T, lo, hi uintptr) { } } +func waitCollected[P any](t *testing.T, ptr weak.Pointer[P]) { + t.Helper() + i := 0 + for ptr.Value() != nil { + runtime.GC() + i++ + // 20 seems like a decent number of times to try + if i > 20 { + t.Errorf("value was never collected") + } + } + t.Logf("number of cycles until collection: %d", i) +} + func TestRegisters(t *testing.T) { Do(func() { s := makeS() diff --git a/src/runtime/secret_nosecret.go b/src/runtime/secret_nosecret.go index bf50fb5a54..0692d6bf70 100644 --- a/src/runtime/secret_nosecret.go +++ b/src/runtime/secret_nosecret.go @@ -22,9 +22,7 @@ func secret_dec() {} //go:linkname secret_eraseSecrets runtime/secret.eraseSecrets func secret_eraseSecrets() {} -func addSecret(p unsafe.Pointer) {} - -type specialSecret struct{} +func addSecret(p unsafe.Pointer, size uintptr) {} //go:linkname secret_getStack runtime/secret.getStack func secret_getStack() (uintptr, uintptr) { return 0, 0 } diff --git a/src/runtime/testdata/testprogcgo/notingo.go b/src/runtime/testdata/testprogcgo/notingo.go index 5af4c00e1f..a385ae24d6 100644 --- a/src/runtime/testdata/testprogcgo/notingo.go +++ b/src/runtime/testdata/testprogcgo/notingo.go @@ -12,6 +12,7 @@ package main #include <pthread.h> extern void Ready(); +extern void BlockForeverInGo(); static _Atomic int spinning; static _Atomic int released; @@ -40,6 +41,21 @@ static void Release() { atomic_store(&spinning, 0); atomic_store(&released, 1); } + +static void* enterGoThenWait(void* arg __attribute__ ((unused))) { + BlockForeverInGo(); + return NULL; +} + +static void WaitInGoInNewCThread() { + pthread_t tid; + pthread_create(&tid, NULL, enterGoThenWait, NULL); +} + +static void SpinForever() { + atomic_fetch_add(&spinning, 1); + while(1) {}; +} */ import "C" @@ -47,15 +63,62 @@ import ( "os" "runtime" "runtime/metrics" + "sync/atomic" ) func init() { - register("NotInGoMetricCallback", NotInGoMetricCallback) + 
register("NotInGoMetricCgoCall", NotInGoMetricCgoCall) + register("NotInGoMetricCgoCallback", NotInGoMetricCgoCallback) + register("NotInGoMetricCgoCallAndCallback", NotInGoMetricCgoCallAndCallback) } -func NotInGoMetricCallback() { +// NotInGoMetric just double-checks that N goroutines in cgo count as the metric reading N. +func NotInGoMetricCgoCall() { const N = 10 + + // Spin up the same number of goroutines that will all wait in a cgo call. + for range N { + go func() { + C.SpinForever() + }() + } + + // Make sure we're all blocked and spinning. + for C.Spinning() < N { + } + + // Read not-in-go before taking the Ps back. s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}} + failed := false + metrics.Read(s) + if n := s[0].Value.Uint64(); n != N { + println("pre-STW: expected", N, "not-in-go goroutines, found", n) + } + + // Do something that stops the world to take all the Ps back. + // + // This will force a re-accounting of some of the goroutines and + // re-checking not-in-go will help catch bugs. + runtime.ReadMemStats(&m) + + // Read not-in-go. + metrics.Read(s) + if n := s[0].Value.Uint64(); n != N { + println("post-STW: expected", N, "not-in-go goroutines, found", n) + } + + // Fail if we get a bad reading. + if failed { + os.Exit(2) + } + println("OK") +} + +// NotInGoMetricCgoCallback tests that threads that called into Go, then returned +// to C with *no* Go on the stack, are *not* counted as not-in-go in the +// runtime/metrics package. +func NotInGoMetricCgoCallback() { + const N = 10 // Create N new C threads that have called into Go at least once. for range N { @@ -90,6 +153,7 @@ func NotInGoMetricCallback() { } // Read not-in-go. 
+ s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}} metrics.Read(s) if n := s[0].Value.Uint64(); n != 0 { println("expected 0 not-in-go goroutines, found", n) @@ -105,3 +169,69 @@ var readyCh = make(chan bool) func Ready() { readyCh <- true } + +// NotInGoMetricCgoCallAndCallback tests that threads that called into Go are not +// keeping the count of not-in-go threads negative. Specifically, needm sets +// isExtraInC to false, breaking some of the invariants behind the not-in-go +// runtime/metrics metric, causing the underlying count to break if we don't +// account for this. In go.dev/cl/726964 this amounts to nGsyscallNoP being negative. +// Unfortunately the runtime/metrics package masks a negative nGsyscallNoP because +// it can transiently go negative due to a race. Therefore, this test checks +// the condition by making sure not-in-go is positive when we expect it to be. +// That is, threads in a cgo callback are *not* cancelling out threads in a +// regular cgo call. +func NotInGoMetricCgoCallAndCallback() { + const N = 10 + + // Spin up some threads that will do a cgo callback and just wait in Go. + // These threads are the ones we're worried about having the incorrect + // accounting that skews the count later. + for range N { + C.WaitInGoInNewCThread() + } + + // Spin up the same number of goroutines that will all wait in a cgo call. + for range N { + go func() { + C.SpinForever() + }() + } + + // Make sure we're all blocked and spinning. + for C.Spinning() < N || blockedForever.Load() < N { + } + + // Read not-in-go before taking the Ps back. + s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}} + failed := false + metrics.Read(s) + if n := s[0].Value.Uint64(); n != N { + println("pre-STW: expected", N, "not-in-go goroutines, found", n) + } + + // Do something that stops the world to take all the Ps back. 
+ // + // This will force a re-accounting of some of the goroutines and + // re-checking not-in-go will help catch bugs. + runtime.ReadMemStats(&m) + + // Read not-in-go. + metrics.Read(s) + if n := s[0].Value.Uint64(); n != N { + println("post-STW: expected", N, "not-in-go goroutines, found", n) + } + + // Fail if we get a bad reading. + if failed { + os.Exit(2) + } + println("OK") +} + +var blockedForever atomic.Uint32 + +//export BlockForeverInGo +func BlockForeverInGo() { + blockedForever.Add(1) + select {} +} diff --git a/src/runtime/trace.go b/src/runtime/trace.go index a7e8937a05..5f568d205e 100644 --- a/src/runtime/trace.go +++ b/src/runtime/trace.go @@ -12,7 +12,7 @@ // // ## Design // -// The basic idea behind the the execution tracer is to have per-M buffers that +// The basic idea behind the execution tracer is to have per-M buffers that // trace data may be written into. Each M maintains a write flag indicating whether // its trace buffer is currently in use. // @@ -173,7 +173,7 @@ // doesn't do this directly for performance reasons. The runtime implementation instead caches // a G on the M created for the C thread. On Linux this M is then cached in the thread's TLS, // and on other systems, the M is put on a global list on exit from Go. We need to do some -// extra work to make sure that this is modeled correctly in the the tracer. For example, +// extra work to make sure that this is modeled correctly in the tracer. For example, // a C thread exiting Go may leave a P hanging off of its M (whether that M is kept in TLS // or placed back on a list). 
In order to correctly model goroutine creation and destruction, // we must behave as if the P was at some point stolen by the runtime, if the C thread diff --git a/src/runtime/tracebuf.go b/src/runtime/tracebuf.go index 5adaede424..1caf69f8b8 100644 --- a/src/runtime/tracebuf.go +++ b/src/runtime/tracebuf.go @@ -29,7 +29,7 @@ type traceWriter struct { *traceBuf } -// writer returns an a traceWriter that writes into the current M's stream. +// writer returns a traceWriter that writes into the current M's stream. // // Once this is called, the caller must guard against stack growth until // end is called on it. Therefore, it's highly recommended to use this diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go index ca4f73c738..dd3a75eb44 100644 --- a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go @@ -30,6 +30,13 @@ func (x simdType) ElemBits() int { return x.Size / x.Lanes } +func (x simdType) Article() string { + if strings.HasPrefix(x.Name, "Int") { + return "an" + } + return "a" // Float, Uint +} + // LanesContainer returns the smallest int/uint bit size that is // large enough to hold one bit for each lane. E.g., Mask32x4 // is 4 lanes, and a uint8 is the smallest uint that has 4 bits. 
@@ -86,6 +93,33 @@ func (x simdType) MaskedStoreDoc() string { } } +func (x simdType) ToBitsDoc() string { + if x.Size == 512 || x.ElemBits() == 16 { + return fmt.Sprintf("// Asm: KMOV%s, CPU Features: AVX512", x.IntelSizeSuffix()) + } + // 128/256 bit vectors with 8, 32, 64 bit elements + var asm string + var feat string + switch x.ElemBits() { + case 8: + asm = "VPMOVMSKB" + if x.Size == 256 { + feat = "AVX2" + } else { + feat = "AVX" + } + case 32: + asm = "VMOVMSKPS" + feat = "AVX" + case 64: + asm = "VMOVMSKPD" + feat = "AVX" + default: + panic("unexpected ElemBits") + } + return fmt.Sprintf("// Asm: %s, CPU Features: %s", asm, feat) +} + func compareSimdTypes(x, y simdType) int { // "vreg" then "mask" if c := -compareNatural(x.Type, y.Type); c != 0 { @@ -135,7 +169,11 @@ type v{{.}} struct { {{end}} {{define "typeTmpl"}} -// {{.Name}} is a {{.Size}}-bit SIMD vector of {{.Lanes}} {{.Base}} +{{- if eq .Type "mask"}} +// {{.Name}} is a mask for a SIMD vector of {{.Lanes}} {{.ElemBits}}-bit elements. +{{- else}} +// {{.Name}} is a {{.Size}}-bit SIMD vector of {{.Lanes}} {{.Base}}s. +{{- end}} type {{.Name}} struct { {{.Fields}} } @@ -171,15 +209,15 @@ func (X86Features) {{.Feature}}() bool { ` const simdLoadStoreTemplate = ` -// Len returns the number of elements in a {{.Name}} +// Len returns the number of elements in {{.Article}} {{.Name}}. func (x {{.Name}}) Len() int { return {{.Lanes}} } -// Load{{.Name}} loads a {{.Name}} from an array +// Load{{.Name}} loads {{.Article}} {{.Name}} from an array. // //go:noescape func Load{{.Name}}(y *[{{.Lanes}}]{{.Base}}) {{.Name}} -// Store stores a {{.Name}} to an array +// Store stores {{.Article}} {{.Name}} to an array. // //go:noescape func (x {{.Name}}) Store(y *[{{.Lanes}}]{{.Base}}) @@ -199,21 +237,21 @@ func {{.Name}}FromBits(y uint{{.LanesContainer}}) {{.Name}} // Only the lower {{.Lanes}} bits of y are used. 
{{- end}} // -// Asm: KMOV{{.IntelSizeSuffix}}, CPU Features: AVX512 +{{.ToBitsDoc}} func (x {{.Name}}) ToBits() uint{{.LanesContainer}} ` const simdMaskedLoadStoreTemplate = ` -// LoadMasked{{.Name}} loads a {{.Name}} from an array, -// at those elements enabled by mask +// LoadMasked{{.Name}} loads {{.Article}} {{.Name}} from an array, +// at those elements enabled by mask. // {{.MaskedLoadDoc}} // //go:noescape func LoadMasked{{.Name}}(y *[{{.Lanes}}]{{.Base}}, mask Mask{{.ElemBits}}x{{.Lanes}}) {{.Name}} -// StoreMasked stores a {{.Name}} to an array, -// at those elements enabled by mask +// StoreMasked stores {{.Article}} {{.Name}} to an array, +// at those elements enabled by mask. // {{.MaskedStoreDoc}} // @@ -395,15 +433,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y" {{end}} {{define "vectorConversion"}} -// {{.Tdst.Name}} converts from {{.Tsrc.Name}} to {{.Tdst.Name}} -func (from {{.Tsrc.Name}}) As{{.Tdst.Name}}() (to {{.Tdst.Name}}) +// As{{.Tdst.Name}} returns {{.Tdst.Article}} {{.Tdst.Name}} with the same bit representation as x. +func (x {{.Tsrc.Name}}) As{{.Tdst.Name}}() {{.Tdst.Name}} {{end}} {{define "mask"}} -// To{{.VectorCounterpart}} converts from {{.Name}} to {{.VectorCounterpart}} +// To{{.VectorCounterpart}} converts from {{.Name}} to {{.VectorCounterpart}}. func (from {{.Name}}) To{{.VectorCounterpart}}() (to {{.VectorCounterpart}}) -// asMask converts from {{.VectorCounterpart}} to {{.Name}} +// asMask converts from {{.VectorCounterpart}} to {{.Name}}. 
func (from {{.VectorCounterpart}}) asMask() (to {{.Name}}) func (x {{.Name}}) And(y {{.Name}}) {{.Name}} diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdrules.go b/src/simd/archsimd/_gen/simdgen/gen_simdrules.go index 90c3fb620e..7a8823483a 100644 --- a/src/simd/archsimd/_gen/simdgen/gen_simdrules.go +++ b/src/simd/archsimd/_gen/simdgen/gen_simdrules.go @@ -275,7 +275,7 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer { origArgs = after } immArg = "[c] " - immArgCombineOff = " [makeValAndOff(int32(int8(c)),off)] " + immArgCombineOff = " [makeValAndOff(int32(uint8(c)),off)] " } memOpData.ArgsLoadAddr = immArg + origArgs + fmt.Sprintf("l:(VMOVDQUload%d {sym} [off] ptr mem)", *lastVreg.Bits) // Remove the last vreg from the arg and change it to "ptr". diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdssa.go b/src/simd/archsimd/_gen/simdgen/gen_simdssa.go index c9d8693aa1..876ffabe3d 100644 --- a/src/simd/archsimd/_gen/simdgen/gen_simdssa.go +++ b/src/simd/archsimd/_gen/simdgen/gen_simdssa.go @@ -13,9 +13,7 @@ import ( ) var ( - ssaTemplates = template.Must(template.New("simdSSA").Parse(` -{{define "header"}}// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. - + ssaTemplates = template.Must(template.New("simdSSA").Parse(`{{define "header"}}` + generatedHeader + ` package amd64 import ( diff --git a/src/simd/archsimd/_gen/simdgen/godefs.go b/src/simd/archsimd/_gen/simdgen/godefs.go index 2c10377420..e956c1cd1d 100644 --- a/src/simd/archsimd/_gen/simdgen/godefs.go +++ b/src/simd/archsimd/_gen/simdgen/godefs.go @@ -135,6 +135,19 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { o.In = append(o.rawOperation.In, o.rawOperation.InVariant...) + // For down conversions, the high elements are zeroed if the result has more elements. + // TODO: we should encode this logic in the YAML file, instead of hardcoding it here. 
+ if len(o.In) > 0 && len(o.Out) > 0 { + inLanes := o.In[0].Lanes + outLanes := o.Out[0].Lanes + if inLanes != nil && outLanes != nil && *inLanes < *outLanes { + if (strings.Contains(o.Go, "Saturate") || strings.Contains(o.Go, "Truncate")) && + !strings.Contains(o.Go, "Concat") { + o.Documentation += "\n// Results are packed to low elements in the returned vector, its upper elements are zeroed." + } + } + } + return nil } @@ -362,7 +375,7 @@ func compareNatural(s1, s2 string) int { return strings.Compare(s1, s2) } -const generatedHeader = `// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +const generatedHeader = `// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. ` func writeGoDefs(path string, cl unify.Closure) error { diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml index 35e8104218..ac5bd825db 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml @@ -17,21 +17,83 @@ // NAME subtracts corresponding elements of two vectors with saturation. - go: AddPairs commutative: false + out: + - elemBits: 16|32 documentation: !string |- // NAME horizontally adds adjacent pairs of elements. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. +- go: AddPairs + commutative: false + out: + - elemBits: 64 + documentation: !string |- + // NAME horizontally adds adjacent pairs of elements. + // For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1]. 
- go: SubPairs commutative: false + out: + - elemBits: 16|32 documentation: !string |- // NAME horizontally subtracts adjacent pairs of elements. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. +- go: SubPairs + commutative: false + out: + - elemBits: 64 + documentation: !string |- + // NAME horizontally subtracts adjacent pairs of elements. + // For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1]. - go: AddPairsSaturated commutative: false documentation: !string |- // NAME horizontally adds adjacent pairs of elements with saturation. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. - go: SubPairsSaturated commutative: false documentation: !string |- // NAME horizontally subtracts adjacent pairs of elements with saturation. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. +- go: AddPairsGrouped + commutative: false + out: + - elemBits: 16|32 + documentation: !string |- + // NAME horizontally adds adjacent pairs of elements. + // With each 128-bit as a group: + // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. +- go: AddPairsGrouped + commutative: false + out: + - elemBits: 64 + documentation: !string |- + // NAME horizontally adds adjacent pairs of elements. + // With each 128-bit as a group: + // for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1]. 
+- go: SubPairsGrouped + commutative: false + out: + - elemBits: 16|32 + documentation: !string |- + // NAME horizontally subtracts adjacent pairs of elements. + // With each 128-bit as a group: + // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. +- go: SubPairsGrouped + commutative: false + out: + - elemBits: 64 + documentation: !string |- + // NAME horizontally subtracts adjacent pairs of elements. + // With each 128-bit as a group: + // for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1]. +- go: AddPairsSaturatedGrouped + commutative: false + documentation: !string |- + // NAME horizontally adds adjacent pairs of elements with saturation. + // With each 128-bit as a group: + // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. +- go: SubPairsSaturatedGrouped + commutative: false + documentation: !string |- + // NAME horizontally subtracts adjacent pairs of elements with saturation. + // With each 128-bit as a group: + // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. 
diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml index 4423d8c7c6..17cee597d9 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml @@ -53,25 +53,71 @@ - *uint - go: AddPairs asm: "VPHADD[DW]" - in: *2any - out: *1any + in: &2any128 + - &any128 + go: $t + bits: 128 + - *any128 + out: &1any128 + - *any128 - go: SubPairs asm: "VPHSUB[DW]" - in: *2any - out: *1any + in: *2any128 + out: *1any128 - go: AddPairs asm: "VHADDP[SD]" # floats - in: *2any - out: *1any + in: *2any128 + out: *1any128 - go: SubPairs asm: "VHSUBP[SD]" # floats - in: *2any - out: *1any + in: *2any128 + out: *1any128 - go: AddPairsSaturated asm: "VPHADDS[DW]" - in: *2int - out: *1int + in: &2int128 + - &int128 + go: $t + base: int + bits: 128 + - *int128 + out: &1int128 + - *int128 - go: SubPairsSaturated asm: "VPHSUBS[DW]" - in: *2int - out: *1int + in: *2int128 + out: *1int128 +- go: AddPairsGrouped + asm: "VPHADD[DW]" + in: &2any256 + - &any256 + go: $t + bits: 256 + - *any256 + out: &1any256 + - *any256 +- go: SubPairsGrouped + asm: "VPHSUB[DW]" + in: *2any256 + out: *1any256 +- go: AddPairsGrouped + asm: "VHADDP[SD]" # floats + in: *2any256 + out: *1any256 +- go: SubPairsGrouped + asm: "VHSUBP[SD]" # floats + in: *2any256 + out: *1any256 +- go: AddPairsSaturatedGrouped + asm: "VPHADDS[DW]" + in: &2int256 + - &int256 + go: $t + base: int + bits: 256 + - *int256 + out: &1int256 + - *int256 +- go: SubPairsSaturatedGrouped + asm: "VPHSUBS[DW]" + in: *2int256 + out: *1int256 diff --git a/src/simd/archsimd/_gen/simdgen/ops/Compares/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Compares/categories.yaml index 4b639d7a34..97ee587503 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Compares/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Compares/categories.yaml @@ -10,34 +10,29 @@ constImm: 0 commutative: true documentation: !string |- - // NAME returns x 
equals y, elementwise. + // NAME returns a mask whose elements indicate whether x == y. - go: Less constImm: 1 commutative: false documentation: !string |- - // NAME returns x less-than y, elementwise. + // NAME returns a mask whose elements indicate whether x < y. - go: LessEqual constImm: 2 commutative: false documentation: !string |- - // NAME returns x less-than-or-equals y, elementwise. -- go: IsNan # For float only. - constImm: 3 - commutative: true - documentation: !string |- - // NAME checks if elements are NaN. Use as x.IsNan(x). + // NAME returns a mask whose elements indicate whether x <= y. - go: NotEqual constImm: 4 commutative: true documentation: !string |- - // NAME returns x not-equals y, elementwise. + // NAME returns a mask whose elements indicate whether x != y. - go: GreaterEqual constImm: 13 commutative: false documentation: !string |- - // NAME returns x greater-than-or-equals y, elementwise. + // NAME returns a mask whose elements indicate whether x >= y. - go: Greater constImm: 14 commutative: false documentation: !string |- - // NAME returns x greater-than y, elementwise. + // NAME returns a mask whose elements indicate whether x > y. 
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Compares/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Compares/go.yaml index 3f6c8a45b6..6dbfb57343 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Compares/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Compares/go.yaml @@ -121,7 +121,7 @@ - class: mask # Floats -- go: Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan +- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual) regexpTag: "compares" asm: "VCMPP[SD]" in: @@ -135,7 +135,7 @@ - go: $t overwriteBase: int overwriteClass: mask -- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan) +- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual) regexpTag: "compares" asm: "VCMPP[SD]" in: diff --git a/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml index dd33284063..698e6d9956 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml @@ -44,124 +44,174 @@ // NAME converts element values to float64. # Int <-> Int conversions -- go: "(Extend|Saturate|Truncate)?ToInt8" +- go: "TruncateToInt8" commutative: false regexpTag: "convert" documentation: !string |- - // NAME converts element values to int8. -- go: "(Extend|Saturate|Truncate)?ToInt16(Concat)?" + // NAME truncates element values to int8. +- go: "SaturateToInt8" commutative: false regexpTag: "convert" documentation: !string |- - // NAME converts element values to int16. -- go: "(Extend|Saturate|Truncate)?ToInt32" + // NAME converts element values to int8 with signed saturation. +- go: "ExtendToInt16(Concat)?" commutative: false regexpTag: "convert" documentation: !string |- - // NAME converts element values to int32. -- go: "(Extend|Saturate|Truncate)?ToInt64" + // NAME sign-extends element values to int16. +- go: "TruncateToInt16(Concat)?" 
commutative: false regexpTag: "convert" documentation: !string |- - // NAME converts element values to int64. -- go: "(Extend|Saturate|Truncate)?ToUint8" + // NAME truncates element values to int16. +- go: "SaturateToInt16(Concat(Grouped)?)?" commutative: false regexpTag: "convert" documentation: !string |- - // NAME converts element values to uint8. -- go: "(Extend|Saturate|Truncate)?ToUint16(Concat)?" + // NAME converts element values to int16 with signed saturation. +- go: "ExtendToInt32" commutative: false regexpTag: "convert" documentation: !string |- - // NAME converts element values to uint16. -- go: "(Extend|Saturate|Truncate)?ToUint32" + // NAME sign-extends element values to int32. +- go: "TruncateToInt32" + commutative: false regexpTag: "convert" + documentation: !string |- + // NAME truncates element values to int32. +- go: "SaturateToInt32" commutative: false + regexpTag: "convert" documentation: !string |- - // NAME converts element values to uint32. -- go: "(Extend|Saturate|Truncate)?ToUint64" + // NAME converts element values to int32 with signed saturation. +- go: "ExtendToInt64" + commutative: false regexpTag: "convert" + documentation: !string |- + // NAME sign-extends element values to int64. +- go: "TruncateToUint8" commutative: false + regexpTag: "convert" documentation: !string |- - // NAME converts element values to uint64. + // NAME truncates element values to uint8. +- go: "SaturateToUint8" + commutative: false + regexpTag: "convert" + documentation: !string |- + // NAME converts element values to uint8 with unsigned saturation. +- go: "ExtendToUint16(Concat)?" + commutative: false + regexpTag: "convert" + documentation: !string |- + // NAME zero-extends element values to uint16. +- go: "TruncateToUint16(Concat)?" + commutative: false + regexpTag: "convert" + documentation: !string |- + // NAME truncates element values to uint16. +- go: "SaturateToUint16(Concat(Grouped)?)?" 
+ commutative: false + regexpTag: "convert" + documentation: !string |- + // NAME converts element values to uint16 with unsigned saturation. +- go: "ExtendToUint32" + regexpTag: "convert" + commutative: false + documentation: !string |- + // NAME zero-extends element values to uint32. +- go: "TruncateToUint32" + regexpTag: "convert" + commutative: false + documentation: !string |- + // NAME truncates element values to uint32. +- go: "SaturateToUint32" + regexpTag: "convert" + commutative: false + documentation: !string |- + // NAME converts element values to uint32 with unsigned saturation. +- go: "ExtendToUint64" + regexpTag: "convert" + commutative: false + documentation: !string |- + // NAME zero-extends element values to uint64. # low-part only Int <-> Int conversions -- go: ExtendLo8ToUint16x8 +- go: ExtendLo8ToUint16 commutative: false documentation: !string |- - // NAME converts 8 lowest vector element values to uint16. -- go: ExtendLo8ToInt16x8 + // NAME zero-extends 8 lowest vector element values to uint16. +- go: ExtendLo8ToInt16 commutative: false documentation: !string |- - // NAME converts 8 lowest vector element values to int16. -- go: ExtendLo4ToUint32x4 + // NAME sign-extends 8 lowest vector element values to int16. +- go: ExtendLo4ToUint32 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to uint32. -- go: ExtendLo4ToInt32x4 + // NAME zero-extends 4 lowest vector element values to uint32. +- go: ExtendLo4ToInt32 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to int32. -- go: ExtendLo2ToUint64x2 + // NAME sign-extends 4 lowest vector element values to int32. +- go: ExtendLo2ToUint64 commutative: false documentation: !string |- - // NAME converts 2 lowest vector element values to uint64. -- go: ExtendLo2ToInt64x2 + // NAME zero-extends 2 lowest vector element values to uint64. 
+- go: ExtendLo2ToInt64 commutative: false documentation: !string |- - // NAME converts 2 lowest vector element values to int64. -- go: ExtendLo2ToUint64x2 + // NAME sign-extends 2 lowest vector element values to int64. +- go: ExtendLo2ToUint64 commutative: false documentation: !string |- - // NAME converts 2 lowest vector element values to uint64. -- go: ExtendLo4ToUint64x4 + // NAME zero-extends 2 lowest vector element values to uint64. +- go: ExtendLo4ToUint64 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to uint64. -- go: ExtendLo2ToInt64x2 + // NAME zero-extends 4 lowest vector element values to uint64. +- go: ExtendLo2ToInt64 commutative: false documentation: !string |- - // NAME converts 2 lowest vector element values to int64. -- go: ExtendLo4ToInt64x4 + // NAME sign-extends 2 lowest vector element values to int64. +- go: ExtendLo4ToInt64 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to int64. -- go: ExtendLo4ToUint32x4 + // NAME sign-extends 4 lowest vector element values to int64. +- go: ExtendLo4ToUint32 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to uint32. -- go: ExtendLo8ToUint32x8 + // NAME zero-extends 4 lowest vector element values to uint32. +- go: ExtendLo8ToUint32 commutative: false documentation: !string |- - // NAME converts 8 lowest vector element values to uint32. -- go: ExtendLo4ToInt32x4 + // NAME zero-extends 8 lowest vector element values to uint32. +- go: ExtendLo4ToInt32 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to int32. -- go: ExtendLo8ToInt32x8 + // NAME sign-extends 4 lowest vector element values to int32. +- go: ExtendLo8ToInt32 commutative: false documentation: !string |- - // NAME converts 8 lowest vector element values to int32. -- go: ExtendLo2ToUint64x2 + // NAME sign-extends 8 lowest vector element values to int32. 
+- go: ExtendLo2ToUint64 commutative: false documentation: !string |- - // NAME converts 2 lowest vector element values to uint64. -- go: ExtendLo4ToUint64x4 + // NAME zero-extends 2 lowest vector element values to uint64. +- go: ExtendLo4ToUint64 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to uint64. -- go: ExtendLo8ToUint64x8 + // NAME zero-extends 4 lowest vector element values to uint64. +- go: ExtendLo8ToUint64 commutative: false documentation: !string |- - // NAME converts 8 lowest vector element values to uint64. -- go: ExtendLo2ToInt64x2 + // NAME zero-extends 8 lowest vector element values to uint64. +- go: ExtendLo2ToInt64 commutative: false documentation: !string |- - // NAME converts 2 lowest vector element values to int64. -- go: ExtendLo4ToInt64x4 + // NAME sign-extends 2 lowest vector element values to int64. +- go: ExtendLo4ToInt64 commutative: false documentation: !string |- - // NAME converts 4 lowest vector element values to int64. -- go: ExtendLo8ToInt64x8 + // NAME sign-extends 4 lowest vector element values to int64. +- go: ExtendLo8ToInt64 commutative: false documentation: !string |- - // NAME converts 8 lowest vector element values to int64.
\ No newline at end of file + // NAME sign-extends 8 lowest vector element values to int64. diff --git a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml index af058124fb..2f19d12616 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml @@ -138,9 +138,6 @@ # Widening integer conversions. # uint8 -> uint16 - go: ExtendToUint16 - addDoc: &zeroExtendDoc - !string |- - // The result vector's elements are zero-extended. regexpTag: "convert" asm: "VPMOVZXBW" in: @@ -156,7 +153,6 @@ - go: ExtendToUint16 regexpTag: "convert" asm: "VPMOVZXBW" - addDoc: *zeroExtendDoc in: - &u8x32 base: uint @@ -171,9 +167,6 @@ - go: ExtendToInt16 regexpTag: "convert" asm: "VPMOVSXBW" - addDoc: &signExtendDoc - !string |- - // The result vector's elements are sign-extended. in: - &i8x16 base: int @@ -187,7 +180,6 @@ - go: ExtendToInt16 regexpTag: "convert" asm: "VPMOVSXBW" - addDoc: *signExtendDoc in: - &i8x32 base: int @@ -202,7 +194,6 @@ - go: ExtendToUint32 regexpTag: "convert" asm: "VPMOVZXWD" - addDoc: *zeroExtendDoc in: - &u16x8 base: uint @@ -216,7 +207,6 @@ - go: ExtendToUint32 regexpTag: "convert" asm: "VPMOVZXWD" - addDoc: *zeroExtendDoc in: - *u16x16 out: @@ -228,7 +218,6 @@ - go: ExtendToInt32 regexpTag: "convert" asm: "VPMOVSXWD" - addDoc: *signExtendDoc in: - &i16x8 base: int @@ -242,7 +231,6 @@ - go: ExtendToInt32 regexpTag: "convert" asm: "VPMOVSXWD" - addDoc: *signExtendDoc in: - *i16x16 out: @@ -254,7 +242,6 @@ - go: ExtendToUint64 regexpTag: "convert" asm: "VPMOVZXDQ" - addDoc: *zeroExtendDoc in: - &u32x4 base: uint @@ -268,7 +255,6 @@ - go: ExtendToUint64 regexpTag: "convert" asm: "VPMOVZXDQ" - addDoc: *zeroExtendDoc in: - *u32x8 out: @@ -280,7 +266,6 @@ - go: ExtendToInt64 regexpTag: "convert" asm: "VPMOVSXDQ" - addDoc: *signExtendDoc in: - &i32x4 base: int @@ -294,7 +279,6 @@ - go: ExtendToInt64 regexpTag: "convert" asm: "VPMOVSXDQ" - addDoc: 
*signExtendDoc in: - *i32x8 out: @@ -306,7 +290,6 @@ - go: ExtendToUint64 regexpTag: "convert" asm: "VPMOVZXWQ" - addDoc: *zeroExtendDoc in: - *u16x8 out: @@ -315,7 +298,6 @@ - go: ExtendToInt64 regexpTag: "convert" asm: "VPMOVSXWQ" - addDoc: *signExtendDoc in: - *i16x8 out: @@ -324,7 +306,6 @@ - go: ExtendToUint32 regexpTag: "convert" asm: "VPMOVZXBD" - addDoc: *zeroExtendDoc in: - *u8x16 out: @@ -333,7 +314,6 @@ - go: ExtendToInt32 regexpTag: "convert" asm: "VPMOVSXBD" - addDoc: *signExtendDoc in: - *i8x16 out: @@ -342,10 +322,6 @@ - go: TruncateToInt8 regexpTag: "convert" asm: "VPMOV[WDQ]B" - addDoc: &truncDocZeroUpper - !string |- - // Conversion is done with truncation on the vector elements. - // Results are packed to low elements in the returned vector, its upper elements are zero-cleared. in: - base: int out: @@ -354,7 +330,6 @@ - go: TruncateToUint8 regexpTag: "convert" asm: "VPMOV[WDQ]B" - addDoc: *truncDocZeroUpper in: - base: uint out: @@ -363,9 +338,6 @@ - go: TruncateToInt8 regexpTag: "convert" asm: "VPMOV[WDQ]B" - addDoc: &truncDoc - !string |- - // Conversion is done with truncation on the vector elements. in: - base: int out: @@ -374,7 +346,6 @@ - go: TruncateToUint8 regexpTag: "convert" asm: "VPMOV[WDQ]B" - addDoc: *truncDoc in: - base: uint out: @@ -383,7 +354,6 @@ - go: TruncateToInt16 regexpTag: "convert" asm: "VPMOV[DQ]W" - addDoc: *truncDoc in: - base: int out: @@ -391,7 +361,6 @@ - go: TruncateToUint16 regexpTag: "convert" asm: "VPMOV[DQ]W" - addDoc: *truncDoc in: - base: uint out: @@ -399,7 +368,6 @@ - go: TruncateToInt32 regexpTag: "convert" asm: "VPMOVQD" - addDoc: *truncDoc in: - base: int out: @@ -407,7 +375,6 @@ - go: TruncateToUint32 regexpTag: "convert" asm: "VPMOVQD" - addDoc: *truncDoc in: - base: uint out: @@ -416,10 +383,6 @@ - go: SaturateToInt8 regexpTag: "convert" asm: "VPMOVS[WDQ]B" - addDoc: &satDocZeroUpper - !string |- - // Conversion is done with saturation on the vector elements. 
- // Results are packed to low elements in the returned vector, its upper elements are zero-cleared. in: - base: int out: @@ -427,19 +390,15 @@ bits: 128 - go: SaturateToUint8 regexpTag: "convert" - asm: "VPMOVS[WDQ]B" - addDoc: *satDocZeroUpper + asm: "VPMOVUS[WDQ]B" in: - - base: int + - base: uint out: - - base: int + - base: uint bits: 128 - go: SaturateToInt8 regexpTag: "convert" asm: "VPMOVS[WDQ]B" - addDoc: &satDoc - !string |- - // Conversion is done with saturation on the vector elements. in: - base: int out: @@ -448,7 +407,6 @@ - go: SaturateToUint8 regexpTag: "convert" asm: "VPMOVUS[WDQ]B" - addDoc: *satDoc in: - base: uint out: @@ -457,7 +415,6 @@ - go: SaturateToInt16 regexpTag: "convert" asm: "VPMOVS[DQ]W" - addDoc: *satDoc in: - base: int out: @@ -465,7 +422,6 @@ - go: SaturateToUint16 regexpTag: "convert" asm: "VPMOVUS[DQ]W" - addDoc: *satDoc in: - base: uint out: @@ -473,7 +429,6 @@ - go: SaturateToInt32 regexpTag: "convert" asm: "VPMOVSQD" - addDoc: *satDoc in: - base: int out: @@ -481,7 +436,6 @@ - go: SaturateToUint32 regexpTag: "convert" asm: "VPMOVUSQD" - addDoc: *satDoc in: - base: uint out: @@ -492,67 +446,86 @@ asm: "VPACKSSDW" addDoc: &satDocConcat !string |- + // The converted elements from x will be packed to the lower part of the result vector, + // the converted elements from y will be packed to the upper part of the result vector. + in: + - base: int + - base: int + out: + - base: int + bits: 128 +- go: SaturateToInt16ConcatGrouped + regexpTag: "convert" + asm: "VPACKSSDW" + addDoc: &satDocConcatGrouped + !string |- // With each 128-bit as a group: - // The converted group from the first input vector will be packed to the lower part of the result vector, - // the converted group from the second input vector will be packed to the upper part of the result vector. - // Conversion is done with saturation on the vector elements. 
+ // The converted elements from x will be packed to the lower part of the group in the result vector, + // the converted elements from y will be packed to the upper part of the group in the result vector. in: - base: int - base: int out: - base: int + bits: 256|512 - go: SaturateToUint16Concat regexpTag: "convert" asm: "VPACKUSDW" addDoc: *satDocConcat in: + - base: int + - base: int + out: - base: uint - - base: uint + bits: 128 +- go: SaturateToUint16ConcatGrouped + regexpTag: "convert" + asm: "VPACKUSDW" + addDoc: *satDocConcatGrouped + in: + - base: int + - base: int out: - base: uint + bits: 256|512 # low-part only conversions. # uint8->uint16 -- go: ExtendLo8ToUint16x8 +- go: ExtendLo8ToUint16 regexpTag: "convert" asm: "VPMOVZXBW" - addDoc: *zeroExtendDoc in: - *u8x16 out: - *u16x8 # int8->int16 -- go: ExtendLo8ToInt16x8 +- go: ExtendLo8ToInt16 regexpTag: "convert" asm: "VPMOVSXBW" - addDoc: *signExtendDoc in: - *i8x16 out: - *i16x8 # uint16->uint32 -- go: ExtendLo4ToUint32x4 +- go: ExtendLo4ToUint32 regexpTag: "convert" asm: "VPMOVZXWD" - addDoc: *zeroExtendDoc in: - *u16x8 out: - *u32x4 # int16->int32 -- go: ExtendLo4ToInt32x4 +- go: ExtendLo4ToInt32 regexpTag: "convert" asm: "VPMOVSXWD" - addDoc: *signExtendDoc in: - *i16x8 out: - *i32x4 # uint32 -> uint64 -- go: ExtendLo2ToUint64x2 +- go: ExtendLo2ToUint64 regexpTag: "convert" asm: "VPMOVZXDQ" - addDoc: *zeroExtendDoc in: - *u32x4 out: @@ -561,10 +534,9 @@ elemBits: 64 bits: 128 # int32 -> int64 -- go: ExtendLo2ToInt64x2 +- go: ExtendLo2ToInt64 regexpTag: "convert" asm: "VPMOVSXDQ" - addDoc: *signExtendDoc in: - *i32x4 out: @@ -573,120 +545,106 @@ elemBits: 64 bits: 128 # uint16 -> uint64 -- go: ExtendLo2ToUint64x2 +- go: ExtendLo2ToUint64 regexpTag: "convert" asm: "VPMOVZXWQ" - addDoc: *zeroExtendDoc in: - *u16x8 out: - *u64x2 -- go: ExtendLo4ToUint64x4 +- go: ExtendLo4ToUint64 regexpTag: "convert" asm: "VPMOVZXWQ" - addDoc: *zeroExtendDoc in: - *u16x8 out: - *u64x4 # int16 -> int64 -- go: 
ExtendLo2ToInt64x2 +- go: ExtendLo2ToInt64 regexpTag: "convert" asm: "VPMOVSXWQ" - addDoc: *signExtendDoc in: - *i16x8 out: - *i64x2 -- go: ExtendLo4ToInt64x4 +- go: ExtendLo4ToInt64 regexpTag: "convert" asm: "VPMOVSXWQ" - addDoc: *signExtendDoc in: - *i16x8 out: - *i64x4 # uint8 -> uint32 -- go: ExtendLo4ToUint32x4 +- go: ExtendLo4ToUint32 regexpTag: "convert" asm: "VPMOVZXBD" - addDoc: *zeroExtendDoc in: - *u8x16 out: - *u32x4 -- go: ExtendLo8ToUint32x8 +- go: ExtendLo8ToUint32 regexpTag: "convert" asm: "VPMOVZXBD" - addDoc: *zeroExtendDoc in: - *u8x16 out: - *u32x8 # int8 -> int32 -- go: ExtendLo4ToInt32x4 +- go: ExtendLo4ToInt32 regexpTag: "convert" asm: "VPMOVSXBD" - addDoc: *signExtendDoc in: - *i8x16 out: - *i32x4 -- go: ExtendLo8ToInt32x8 +- go: ExtendLo8ToInt32 regexpTag: "convert" asm: "VPMOVSXBD" - addDoc: *signExtendDoc in: - *i8x16 out: - *i32x8 # uint8 -> uint64 -- go: ExtendLo2ToUint64x2 +- go: ExtendLo2ToUint64 regexpTag: "convert" asm: "VPMOVZXBQ" - addDoc: *zeroExtendDoc in: - *u8x16 out: - *u64x2 -- go: ExtendLo4ToUint64x4 +- go: ExtendLo4ToUint64 regexpTag: "convert" asm: "VPMOVZXBQ" - addDoc: *zeroExtendDoc in: - *u8x16 out: - *u64x4 -- go: ExtendLo8ToUint64x8 +- go: ExtendLo8ToUint64 regexpTag: "convert" asm: "VPMOVZXBQ" - addDoc: *zeroExtendDoc in: - *u8x16 out: - *u64x8 # int8 -> int64 -- go: ExtendLo2ToInt64x2 +- go: ExtendLo2ToInt64 regexpTag: "convert" asm: "VPMOVSXBQ" - addDoc: *signExtendDoc in: - *i8x16 out: - *i64x2 -- go: ExtendLo4ToInt64x4 +- go: ExtendLo4ToInt64 regexpTag: "convert" asm: "VPMOVSXBQ" - addDoc: *signExtendDoc in: - *i8x16 out: - *i64x4 -- go: ExtendLo8ToInt64x8 +- go: ExtendLo8ToInt64 regexpTag: "convert" asm: "VPMOVSXBQ" - addDoc: *signExtendDoc in: - *i8x16 out: - - *i64x8
\ No newline at end of file + - *i64x8 diff --git a/src/simd/archsimd/_gen/simdgen/ops/FPonlyArith/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/FPonlyArith/categories.yaml index f2d8af6886..90f5208ff7 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/FPonlyArith/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/FPonlyArith/categories.yaml @@ -18,12 +18,13 @@ - go: Scale commutative: false documentation: !string |- - // NAME multiplies elements by a power of 2. + // NAME multiplies each element of x by 2 raised to the power of the + // floor of the corresponding element in y. - go: RoundToEven commutative: false constImm: 0 documentation: !string |- - // NAME rounds elements to the nearest integer. + // NAME rounds elements to the nearest integer, rounding ties to even. - go: RoundToEvenScaled commutative: false constImm: 0 diff --git a/src/simd/archsimd/_gen/simdgen/ops/IntOnlyArith/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/IntOnlyArith/categories.yaml index bf33642a11..ae6554d731 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/IntOnlyArith/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/IntOnlyArith/categories.yaml @@ -12,8 +12,8 @@ # Applies sign of second operand to first: sign(val, sign_src) commutative: false documentation: !string |- - // NAME returns the product of the first operand with -1, 0, or 1, - // whichever constant is nearest to the value of the second operand. + // NAME returns the product of x with -1, 0, or 1, + // whichever constant is nearest to the value of y. 
# Sign does not have masked version - go: OnesCount commutative: false diff --git a/src/simd/archsimd/_gen/simdgen/ops/MLOps/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/MLOps/categories.yaml index 2b1da7adaf..54a8ece574 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/MLOps/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/MLOps/categories.yaml @@ -10,21 +10,10 @@ documentation: !string |- // NAME multiplies the elements and add the pairs together with saturation, // yielding a vector of half as many elements with twice the input element size. -# QuadDotProduct, i.e. VPDPBUSD(S) are operations with src/dst on the same register, we are not supporting this as of now. # - go: DotProductBroadcast # commutative: true # # documentation: !string |- # // NAME multiplies all elements and broadcasts the sum. -- go: DotProductQuadruple - commutative: false - documentation: !string |- - // NAME performs dot products on groups of 4 elements of x and y. - // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction. -- go: DotProductQuadrupleSaturated - commutative: false - documentation: !string |- - // NAME multiplies performs dot products on groups of 4 elements of x and y. - // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction. 
- go: AddDotProductPairs commutative: false noTypes: "true" diff --git a/src/simd/archsimd/_gen/simdgen/ops/MLOps/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/MLOps/go.yaml index 4a1195b52d..18ce8a53b2 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/MLOps/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/MLOps/go.yaml @@ -33,33 +33,6 @@ # const: 127 # out: # - *dpb_src -- go: DotProductQuadruple - asm: "VPDPBUSD" - operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0 - in: - - &qdpa_acc - go: $t_acc - base: int - elemBits: 32 - - &qdpa_src1 - go: $t_src1 - base: uint - overwriteElementBits: 8 - - &qdpa_src2 - go: $t_src2 - base: int - overwriteElementBits: 8 - out: - - *qdpa_acc -- go: DotProductQuadrupleSaturated - asm: "VPDPBUSDS" - operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0 - in: - - *qdpa_acc - - *qdpa_src1 - - *qdpa_src2 - out: - - *qdpa_acc - go: AddDotProductPairs asm: "VPDPWSSD" in: diff --git a/src/simd/archsimd/_gen/simdgen/ops/MinMax/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/MinMax/categories.yaml index a7e30f4693..1d79d85a46 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/MinMax/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/MinMax/categories.yaml @@ -2,8 +2,8 @@ - go: Max commutative: true documentation: !string |- - // NAME computes the maximum of corresponding elements. + // NAME computes the maximum of each pair of corresponding elements in x and y. - go: Min commutative: true documentation: !string |- - // NAME computes the minimum of corresponding elements. + // NAME computes the minimum of each pair of corresponding elements in x and y. 
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml index 3c86974e8a..38bc9374cc 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml @@ -31,17 +31,23 @@ commutative: false documentation: !string |- // NAME performs a full permutation of vector x using indices: - // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} + // + // result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} + // - go: Permute commutative: false documentation: !string |- // NAME performs a full permutation of vector x using indices: - // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} + // + // result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} + // - go: ConcatPermute # ConcatPermute is only available on or after AVX512 commutative: false documentation: !string |- // NAME performs a full permutation of vector x, y using indices: - // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} + // + // result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} + // // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. - go: Compress @@ -236,12 +242,12 @@ - go: ConcatShiftBytesRight commutative: false documentation: !string |- - // NAME concatenates x and y and shift it right by constant bytes. + // NAME concatenates x and y and shift it right by shift bytes. // The result vector will be the lower half of the concatenated vector. - go: ConcatShiftBytesRightGrouped commutative: false documentation: !string |- - // NAME concatenates x and y and shift it right by constant bytes. + // NAME concatenates x and y and shift it right by shift bytes. // The result vector will be the lower half of the concatenated vector. // This operation is performed grouped by each 16 byte. 
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml index 726a983ac4..e1fd184ed7 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml @@ -227,7 +227,7 @@ - go: Permute asm: "VPERMQ|VPERMPD" addDoc: !string |- - // The low 2 bits (values 0-3) of each element of indices is used + // The low 2 bits (values 0-3) of each element of indices is used. operandOrder: "21Type1" in: - &anyindices @@ -244,7 +244,7 @@ - go: Permute asm: "VPERM[WDQ]|VPERMP[SD]" addDoc: !string |- - // The low 3 bits (values 0-7) of each element of indices is used + // The low 3 bits (values 0-7) of each element of indices is used. operandOrder: "21Type1" in: - *anyindices @@ -257,7 +257,7 @@ - go: Permute asm: "VPERM[BWD]|VPERMPS" addDoc: !string |- - // The low 4 bits (values 0-15) of each element of indices is used + // The low 4 bits (values 0-15) of each element of indices is used. operandOrder: "21Type1" in: - *anyindices @@ -270,7 +270,7 @@ - go: Permute asm: "VPERM[BW]" addDoc: !string |- - // The low 5 bits (values 0-31) of each element of indices is used + // The low 5 bits (values 0-31) of each element of indices is used. operandOrder: "21Type1" in: - *anyindices @@ -283,7 +283,7 @@ - go: Permute asm: "VPERMB" addDoc: !string |- - // The low 6 bits (values 0-63) of each element of indices is used + // The low 6 bits (values 0-63) of each element of indices is used. 
operandOrder: "21Type1" in: - *anyindices @@ -489,7 +489,9 @@ - go: PermuteOrZeroGrouped asm: VPSHUFB addDoc: !string |- - // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} + // + // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} + // // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, // unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. @@ -506,7 +508,9 @@ - go: permuteScalars asm: VPSHUFD addDoc: !string |- - // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // + // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. in: - *128any @@ -520,7 +524,9 @@ - go: permuteScalarsGrouped asm: VPSHUFD addDoc: !string |- - // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // + // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. in: @@ -535,7 +541,9 @@ - go: permuteScalarsLo asm: VPSHUFLW addDoc: !string |- - // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} + // + // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} + // // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 
in: - &128lanes8 @@ -573,7 +581,9 @@ - go: permuteScalarsHi asm: VPSHUFHW addDoc: !string |- - // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} + // + // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} + // // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. in: - *128lanes8 @@ -1001,6 +1011,7 @@ - *uint128 - class: immediate immOffset: 0 + name: shift out: - *uint128 @@ -1014,5 +1025,6 @@ - *uint256512 - class: immediate immOffset: 0 + name: shift out: - *uint256512 diff --git a/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml index 92491b51d4..bb020ed48f 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml @@ -7,7 +7,7 @@ commutative: true documentation: !string |- // NAME multiplies even-indexed elements, widening the result. - // Result[i] = v1.Even[i] * v2.Even[i]. + // Result[i] = v1[2*i] * v2[2*i]. - go: MulHigh commutative: true documentation: !string |- diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml index 0d0b006cfb..0d205aab79 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml @@ -4,21 +4,21 @@ specialLower: sftimm commutative: false documentation: !string |- - // NAME shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. + // NAME shifts each element to the left by y bits. - go: ShiftAllRight signed: false nameAndSizeCheck: true specialLower: sftimm commutative: false documentation: !string |- - // NAME shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. 
+ // NAME performs an unsigned right shift on each element by y bits. - go: ShiftAllRight signed: true specialLower: sftimm nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. + // NAME performs a signed right shift on each element by y bits. - go: shiftAllLeftConst # no APIs, only ssa ops. noTypes: "true" noGenericOps: "true" @@ -44,24 +44,24 @@ nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. + // NAME shifts each element in x to the left by the number of bits specified in y's corresponding elements. - go: ShiftRight signed: false nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. + // NAME performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. - go: ShiftRight signed: true nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. + // NAME performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. - go: RotateAllLeft nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME rotates each element to the left by the number of bits specified by the immediate. + // NAME rotates each element to the left by the number of bits specified by shift. 
- go: RotateLeft nameAndSizeCheck: true commutative: false @@ -71,7 +71,7 @@ nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME rotates each element to the right by the number of bits specified by the immediate. + // NAME rotates each element to the right by the number of bits specified by shift. - go: RotateRight nameAndSizeCheck: true commutative: false @@ -81,23 +81,23 @@ nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME shifts each element of x to the left by the number of bits specified by the - // immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. + // NAME shifts each element of x to the left by the number of bits specified by + // shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. - go: ShiftAllRightConcat nameAndSizeCheck: true commutative: false documentation: !string |- - // NAME shifts each element of x to the right by the number of bits specified by the - // immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. + // NAME shifts each element of x to the right by the number of bits specified by + // shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. - go: ShiftLeftConcat nameAndSizeCheck: true commutative: false documentation: !string |- // NAME shifts each element of x to the left by the number of bits specified by the - // corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. + // corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. 
- go: ShiftRightConcat nameAndSizeCheck: true commutative: false documentation: !string |- // NAME shifts each element of x to the right by the number of bits specified by the - // corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. + // corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go index 473e4f14c0..8db185e1e0 100644 --- a/src/simd/archsimd/_gen/tmplgen/main.go +++ b/src/simd/archsimd/_gen/tmplgen/main.go @@ -40,17 +40,23 @@ func (sat shapeAndTemplate) target(outType string, width int) shapeAndTemplate { newSat := sat newShape := *sat.s newShape.output = func(t string, w, c int) (ot string, ow int, oc int) { - return outType, width, c + oc = c + if width*c > 512 { + oc = 512 / width + } else if width*c < 128 { + oc = 128 / width + } + return outType, width, oc } newSat.s = &newShape return newSat } -func (sat shapeAndTemplate) shrinkTo(outType string, by int) shapeAndTemplate { +func (sat shapeAndTemplate) targetFixed(outType string, width, count int) shapeAndTemplate { newSat := sat newShape := *sat.s newShape.output = func(t string, w, c int) (ot string, ow int, oc int) { - return outType, w / by, c * by + return outType, width, count } newSat.s = &newShape return newSat @@ -98,6 +104,17 @@ var uintShapes = &shapes{ uints: []int{8, 16, 32, 64}, } +var floatShapes = &shapes{ + vecs: []int{128, 256, 512}, + floats: []int{32, 64}, +} + +var integerShapes = &shapes{ + vecs: []int{128, 256, 512}, + ints: []int{8, 16, 32, 64}, + uints: []int{8, 16, 32, 64}, +} + var avx512Shapes = &shapes{ vecs: []int{512}, ints: []int{8, 16, 32, 64}, @@ -278,7 +295,7 @@ func testPrologue(t, s string, out io.Writer) { fmt.Fprintf(out, `// Code generated by '%s'; DO NOT EDIT. 
-//go:build goexperiment.simd +//go:build goexperiment.simd && amd64 // This file contains functions testing %s. // Each function in this file is specialized for a @@ -311,12 +328,12 @@ func shapedTemplateOf(s *shapes, name, temp string) shapeAndTemplate { } var sliceTemplate = templateOf("slice", ` -// Load{{.VType}}Slice loads {{.AOrAn}} {{.VType}} from a slice of at least {{.Count}} {{.Etype}}s +// Load{{.VType}}Slice loads {{.AOrAn}} {{.VType}} from a slice of at least {{.Count}} {{.Etype}}s. func Load{{.VType}}Slice(s []{{.Etype}}) {{.VType}} { return Load{{.VType}}((*[{{.Count}}]{{.Etype}})(s)) } -// StoreSlice stores x into a slice of at least {{.Count}} {{.Etype}}s +// StoreSlice stores x into a slice of at least {{.Count}} {{.Etype}}s. func (x {{.VType}}) StoreSlice(s []{{.Etype}}) { x.Store((*[{{.Count}}]{{.Etype}})(s)) } @@ -356,15 +373,49 @@ func test{{.VType}}UnaryFlaky(t *testing.T, f func(x archsimd.{{.VType}}) archsi `) var convertTemplate = templateOf("convert_helpers", ` -// test{{.VType}}ConvertTo{{.OEType}} tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// test{{.VType}}ConvertTo{{.OEType}} tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func test{{.VType}}ConvertTo{{.OEType}}(t *testing.T, f func(x archsimd.{{.VType}}) archsimd.{{.OVType}}, want func(x []{{.Etype}}) []{{.OEtype}}) { n := {{.Count}} t.Helper() forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool { t.Helper() a := archsimd.Load{{.VType}}Slice(x) - g := make([]{{.OEtype}}, n) + g := make([]{{.OEtype}}, {{.OCount}}) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x)}) + }) +} +`) + +var ( + // templates and shapes for conversion. + // TODO: this includes shapes where in and out have the same element type, + // which are not needed. + unaryToInt8 = convertTemplate.target("int", 8) + unaryToUint8 = convertTemplate.target("uint", 8) + unaryToInt16 = convertTemplate.target("int", 16) + unaryToUint16 = convertTemplate.target("uint", 16) + unaryToInt32 = convertTemplate.target("int", 32) + unaryToUint32 = convertTemplate.target("uint", 32) + unaryToInt64 = convertTemplate.target("int", 64) + unaryToUint64 = convertTemplate.target("uint", 64) + unaryToFloat32 = convertTemplate.target("float", 32) + unaryToFloat64 = convertTemplate.target("float", 64) +) + +var convertLoTemplate = shapedTemplateOf(integerShapes, "convert_lo_helpers", ` +// test{{.VType}}ConvertLoTo{{.OVType}} tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low {{.OCount}} elements. 
+func test{{.VType}}ConvertLoTo{{.OVType}}(t *testing.T, f func(x archsimd.{{.VType}}) archsimd.{{.OVType}}, want func(x []{{.Etype}}) []{{.OEtype}}) { + n := {{.Count}} + t.Helper() + forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool { + t.Helper() + a := archsimd.Load{{.VType}}Slice(x) + g := make([]{{.OEtype}}, {{.OCount}}) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x)}) @@ -372,9 +423,23 @@ func test{{.VType}}ConvertTo{{.OEType}}(t *testing.T, f func(x archsimd.{{.VType } `) -var unaryToInt32 = convertTemplate.target("int", 32) -var unaryToUint32 = convertTemplate.target("uint", 32) -var unaryToUint16 = convertTemplate.target("uint", 16) +var ( + // templates and shapes for conversion of low elements. + // The output is fixed to 128- or 256-bits (no 512-bit, as the + // regular convertTemplate covers that). + // TODO: this includes shapes where in and out have the same element + // type or length, which are not needed. 
+ unaryToInt64x2 = convertLoTemplate.targetFixed("int", 64, 2) + unaryToInt64x4 = convertLoTemplate.targetFixed("int", 64, 4) + unaryToUint64x2 = convertLoTemplate.targetFixed("uint", 64, 2) + unaryToUint64x4 = convertLoTemplate.targetFixed("uint", 64, 4) + unaryToInt32x4 = convertLoTemplate.targetFixed("int", 32, 4) + unaryToInt32x8 = convertLoTemplate.targetFixed("int", 32, 8) + unaryToUint32x4 = convertLoTemplate.targetFixed("uint", 32, 4) + unaryToUint32x8 = convertLoTemplate.targetFixed("uint", 32, 8) + unaryToInt16x8 = convertLoTemplate.targetFixed("int", 16, 8) + unaryToUint16x8 = convertLoTemplate.targetFixed("uint", 16, 8) +) var binaryTemplate = templateOf("binary_helpers", ` // test{{.VType}}Binary tests the simd binary method f against the expected behavior generated by want @@ -447,6 +512,22 @@ func test{{.VType}}Compare(t *testing.T, f func(_, _ archsimd.{{.VType}}) archsi } `) +var compareUnaryTemplate = shapedTemplateOf(floatShapes, "compare_unary_helpers", ` +// test{{.VType}}UnaryCompare tests the simd unary comparison method f against the expected behavior generated by want +func test{{.VType}}UnaryCompare(t *testing.T, f func(x archsimd.{{.VType}}) archsimd.Mask{{.WxC}}, want func(x []{{.Etype}}) []int64) { + n := {{.Count}} + t.Helper() + forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool { + t.Helper() + a := archsimd.Load{{.VType}}Slice(x) + g := make([]int{{.EWidth}}, n) + f(a).ToInt{{.WxC}}().StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() {t.Helper(); t.Logf("x=%v", x)}) + }) +} +`) + // TODO this has not been tested yet. 
var compareMaskedTemplate = templateOf("comparemasked_helpers", ` // test{{.VType}}CompareMasked tests the simd masked comparison method f against the expected behavior generated by want @@ -580,32 +661,32 @@ func (t templateData) CPUfeature() string { } var avx2SignedComparisonsTemplate = shapedTemplateOf(avx2SignedComparisons, "avx2 signed comparisons", ` -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature {{.CPUfeature}} +// Emulated, CPU Feature: {{.CPUfeature}} func (x {{.VType}}) Less(y {{.VType}}) Mask{{.WxC}} { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature {{.CPUfeature}} +// Emulated, CPU Feature: {{.CPUfeature}} func (x {{.VType}}) GreaterEqual(y {{.VType}}) Mask{{.WxC}} { ones := x.Equal(x).ToInt{{.WxC}}() return y.Greater(x).ToInt{{.WxC}}().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature {{.CPUfeature}} +// Emulated, CPU Feature: {{.CPUfeature}} func (x {{.VType}}) LessEqual(y {{.VType}}) Mask{{.WxC}} { ones := x.Equal(x).ToInt{{.WxC}}() return x.Greater(y).ToInt{{.WxC}}().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. 
// -// Emulated, CPU Feature {{.CPUfeature}} +// Emulated, CPU Feature: {{.CPUfeature}} func (x {{.VType}}) NotEqual(y {{.VType}}) Mask{{.WxC}} { ones := x.Equal(x).ToInt{{.WxC}}() return x.Equal(y).ToInt{{.WxC}}().Xor(ones).asMask() @@ -613,18 +694,18 @@ func (x {{.VType}}) NotEqual(y {{.VType}}) Mask{{.WxC}} { `) var bitWiseIntTemplate = shapedTemplateOf(intShapes, "bitwise int complement", ` -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature {{.CPUfeature}} +// Emulated, CPU Feature: {{.CPUfeature}} func (x {{.VType}}) Not() {{.VType}} { return x.Xor(x.Equal(x).ToInt{{.WxC}}()) } `) var bitWiseUintTemplate = shapedTemplateOf(uintShapes, "bitwise uint complement", ` -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature {{.CPUfeature}} +// Emulated, CPU Feature: {{.CPUfeature}} func (x {{.VType}}) Not() {{.VType}} { return x.Xor(x.Equal(x).ToInt{{.WxC}}().As{{.VType}}()) } @@ -643,9 +724,9 @@ func (t templateData) CPUfeatureAVX2if8() string { } var avx2UnsignedComparisonsTemplate = shapedTemplateOf(avx2UnsignedComparisons, "avx2 unsigned comparisons", ` -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. // -// Emulated, CPU Feature {{.CPUfeatureAVX2if8}} +// Emulated, CPU Feature: {{.CPUfeatureAVX2if8}} func (x {{.VType}}) Greater(y {{.VType}}) Mask{{.WxC}} { a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}() {{- if eq .EWidth 8}} @@ -657,9 +738,9 @@ func (x {{.VType}}) Greater(y {{.VType}}) Mask{{.WxC}} { return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. 
// -// Emulated, CPU Feature {{.CPUfeatureAVX2if8}} +// Emulated, CPU Feature: {{.CPUfeatureAVX2if8}} func (x {{.VType}}) Less(y {{.VType}}) Mask{{.WxC}} { a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}() {{- if eq .EWidth 8}} @@ -671,9 +752,9 @@ func (x {{.VType}}) Less(y {{.VType}}) Mask{{.WxC}} { return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature {{.CPUfeatureAVX2if8}} +// Emulated, CPU Feature: {{.CPUfeatureAVX2if8}} func (x {{.VType}}) GreaterEqual(y {{.VType}}) Mask{{.WxC}} { a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}() ones := x.Equal(x).ToInt{{.WxC}}() @@ -685,9 +766,9 @@ func (x {{.VType}}) GreaterEqual(y {{.VType}}) Mask{{.WxC}} { return b.Xor(signs).Greater(a.Xor(signs)).ToInt{{.WxC}}().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature {{.CPUfeatureAVX2if8}} +// Emulated, CPU Feature: {{.CPUfeatureAVX2if8}} func (x {{.VType}}) LessEqual(y {{.VType}}) Mask{{.WxC}} { a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}() ones := x.Equal(x).ToInt{{.WxC}}() @@ -699,9 +780,9 @@ func (x {{.VType}}) LessEqual(y {{.VType}}) Mask{{.WxC}} { return a.Xor(signs).Greater(b.Xor(signs)).ToInt{{.WxC}}().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature {{.CPUfeature}} +// Emulated, CPU Feature: {{.CPUfeature}} func (x {{.VType}}) NotEqual(y {{.VType}}) Mask{{.WxC}} { a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}() ones := x.Equal(x).ToInt{{.WxC}}() @@ -758,7 +839,7 @@ func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} { {{- end -}} } -// Merge returns x but with elements set to y where m is false. 
+// Merge returns x but with elements set to y where mask is false. func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} { {{- if eq .Base "Int" }} return y.blendMasked(x, mask) @@ -789,7 +870,7 @@ var broadcastTemplate = templateOf("Broadcast functions", ` // Broadcast{{.VType}} returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature {{.CPUfeatureBC}} +// Emulated, CPU Feature: {{.CPUfeatureBC}} func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} { var z {{.As128BitVec }} return z.SetElem(0, x).Broadcast{{.Vwidth}}() @@ -804,7 +885,7 @@ func (from {{.Base}}{{.WxC}}) ToMask() (to Mask{{.WxC}}) { `) var stringTemplate = shapedTemplateOf(allShapes, "String methods", ` -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x {{.VType}}) String() string { var s [{{.Count}}]{{.Etype}} x.Store(&s) @@ -862,7 +943,17 @@ func main() { one(*ush, unsafePrologue, unsafePATemplate) } if *uh != "" { - one(*uh, curryTestPrologue("unary simd methods"), unaryTemplate, unaryToInt32, unaryToUint32, unaryToUint16, unaryFlakyTemplate) + one(*uh, curryTestPrologue("unary simd methods"), unaryTemplate, + unaryToInt8, unaryToUint8, unaryToInt16, unaryToUint16, + unaryToInt32, unaryToUint32, unaryToInt64, unaryToUint64, + unaryToFloat32, unaryToFloat64, + unaryToInt64x2, unaryToInt64x4, + unaryToUint64x2, unaryToUint64x4, + unaryToInt32x4, unaryToInt32x8, + unaryToUint32x4, unaryToUint32x8, + unaryToInt16x8, unaryToUint16x8, + unaryFlakyTemplate, + ) } if *bh != "" { one(*bh, curryTestPrologue("binary simd methods"), binaryTemplate) @@ -871,7 +962,7 @@ func main() { one(*th, curryTestPrologue("ternary simd methods"), ternaryTemplate, ternaryFlakyTemplate) } if *ch != "" { - one(*ch, curryTestPrologue("simd methods that compare two operands"), compareTemplate) + one(*ch, curryTestPrologue("simd methods that compare two operands"), compareTemplate, 
compareUnaryTemplate) } if *cmh != "" { one(*cmh, curryTestPrologue("simd methods that compare two operands under a mask"), compareMaskedTemplate) @@ -1018,7 +1109,7 @@ func nonTemplateRewrites(filename string, prologue func(s string, out io.Writer) out := new(bytes.Buffer) - prologue("go run genfiles.go", out) + prologue("tmplgen", out) for _, rewrite := range rewrites { rewrite(out) } @@ -1054,7 +1145,7 @@ func one(filename string, prologue func(s string, out io.Writer), sats ...shapeA out := new(bytes.Buffer) - prologue("go run genfiles.go", out) + prologue("tmplgen", out) for _, sat := range sats { sat.forTemplates(out) } diff --git a/src/simd/archsimd/compare_gen_amd64.go b/src/simd/archsimd/compare_gen_amd64.go index a8636f0b33..09f8277dc9 100644 --- a/src/simd/archsimd/compare_gen_amd64.go +++ b/src/simd/archsimd/compare_gen_amd64.go @@ -1,278 +1,278 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. //go:build goexperiment.simd package archsimd -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int8x16) Less(y Int8x16) Mask8x16 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int8x16) GreaterEqual(y Int8x16) Mask8x16 { ones := x.Equal(x).ToInt8x16() return y.Greater(x).ToInt8x16().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int8x16) LessEqual(y Int8x16) Mask8x16 { ones := x.Equal(x).ToInt8x16() return x.Greater(y).ToInt8x16().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int8x16) NotEqual(y Int8x16) Mask8x16 { ones := x.Equal(x).ToInt8x16() return x.Equal(y).ToInt8x16().Xor(ones).asMask() } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int16x8) Less(y Int16x8) Mask16x8 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int16x8) GreaterEqual(y Int16x8) Mask16x8 { ones := x.Equal(x).ToInt16x8() return y.Greater(x).ToInt16x8().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int16x8) LessEqual(y Int16x8) Mask16x8 { ones := x.Equal(x).ToInt16x8() return x.Greater(y).ToInt16x8().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int16x8) NotEqual(y Int16x8) Mask16x8 { ones := x.Equal(x).ToInt16x8() return x.Equal(y).ToInt16x8().Xor(ones).asMask() } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int32x4) Less(y Int32x4) Mask32x4 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int32x4) GreaterEqual(y Int32x4) Mask32x4 { ones := x.Equal(x).ToInt32x4() return y.Greater(x).ToInt32x4().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int32x4) LessEqual(y Int32x4) Mask32x4 { ones := x.Equal(x).ToInt32x4() return x.Greater(y).ToInt32x4().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int32x4) NotEqual(y Int32x4) Mask32x4 { ones := x.Equal(x).ToInt32x4() return x.Equal(y).ToInt32x4().Xor(ones).asMask() } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int64x2) Less(y Int64x2) Mask64x2 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int64x2) GreaterEqual(y Int64x2) Mask64x2 { ones := x.Equal(x).ToInt64x2() return y.Greater(x).ToInt64x2().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int64x2) LessEqual(y Int64x2) Mask64x2 { ones := x.Equal(x).ToInt64x2() return x.Greater(y).ToInt64x2().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int64x2) NotEqual(y Int64x2) Mask64x2 { ones := x.Equal(x).ToInt64x2() return x.Equal(y).ToInt64x2().Xor(ones).asMask() } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int8x32) Less(y Int8x32) Mask8x32 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int8x32) GreaterEqual(y Int8x32) Mask8x32 { ones := x.Equal(x).ToInt8x32() return y.Greater(x).ToInt8x32().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int8x32) LessEqual(y Int8x32) Mask8x32 { ones := x.Equal(x).ToInt8x32() return x.Greater(y).ToInt8x32().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int8x32) NotEqual(y Int8x32) Mask8x32 { ones := x.Equal(x).ToInt8x32() return x.Equal(y).ToInt8x32().Xor(ones).asMask() } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int16x16) Less(y Int16x16) Mask16x16 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int16x16) GreaterEqual(y Int16x16) Mask16x16 { ones := x.Equal(x).ToInt16x16() return y.Greater(x).ToInt16x16().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int16x16) LessEqual(y Int16x16) Mask16x16 { ones := x.Equal(x).ToInt16x16() return x.Greater(y).ToInt16x16().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int16x16) NotEqual(y Int16x16) Mask16x16 { ones := x.Equal(x).ToInt16x16() return x.Equal(y).ToInt16x16().Xor(ones).asMask() } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int32x8) Less(y Int32x8) Mask32x8 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int32x8) GreaterEqual(y Int32x8) Mask32x8 { ones := x.Equal(x).ToInt32x8() return y.Greater(x).ToInt32x8().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int32x8) LessEqual(y Int32x8) Mask32x8 { ones := x.Equal(x).ToInt32x8() return x.Greater(y).ToInt32x8().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int32x8) NotEqual(y Int32x8) Mask32x8 { ones := x.Equal(x).ToInt32x8() return x.Equal(y).ToInt32x8().Xor(ones).asMask() } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int64x4) Less(y Int64x4) Mask64x4 { return y.Greater(x) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int64x4) GreaterEqual(y Int64x4) Mask64x4 { ones := x.Equal(x).ToInt64x4() return y.Greater(x).ToInt64x4().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int64x4) LessEqual(y Int64x4) Mask64x4 { ones := x.Equal(x).ToInt64x4() return x.Greater(y).ToInt64x4().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int64x4) NotEqual(y Int64x4) Mask64x4 { ones := x.Equal(x).ToInt64x4() return x.Equal(y).ToInt64x4().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x16) Greater(y Uint8x16) Mask8x16 { a, b := x.AsInt8x16(), y.AsInt8x16() signs := BroadcastInt8x16(-1 << (8 - 1)) return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x16) Less(y Uint8x16) Mask8x16 { a, b := x.AsInt8x16(), y.AsInt8x16() signs := BroadcastInt8x16(-1 << (8 - 1)) return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x16) GreaterEqual(y Uint8x16) Mask8x16 { a, b := x.AsInt8x16(), y.AsInt8x16() ones := x.Equal(x).ToInt8x16() @@ -280,9 +280,9 @@ func (x Uint8x16) GreaterEqual(y Uint8x16) Mask8x16 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt8x16().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x16) LessEqual(y Uint8x16) Mask8x16 { a, b := x.AsInt8x16(), y.AsInt8x16() ones := x.Equal(x).ToInt8x16() @@ -290,18 +290,18 @@ func (x Uint8x16) LessEqual(y Uint8x16) Mask8x16 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt8x16().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint8x16) NotEqual(y Uint8x16) Mask8x16 { a, b := x.AsInt8x16(), y.AsInt8x16() ones := x.Equal(x).ToInt8x16() return a.Equal(b).ToInt8x16().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint16x8) Greater(y Uint16x8) Mask16x8 { a, b := x.AsInt16x8(), y.AsInt16x8() ones := x.Equal(x).ToInt16x8() @@ -309,9 +309,9 @@ func (x Uint16x8) Greater(y Uint16x8) Mask16x8 { return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint16x8) Less(y Uint16x8) Mask16x8 { a, b := x.AsInt16x8(), y.AsInt16x8() ones := x.Equal(x).ToInt16x8() @@ -319,9 +319,9 @@ func (x Uint16x8) Less(y Uint16x8) Mask16x8 { return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint16x8) GreaterEqual(y Uint16x8) Mask16x8 { a, b := x.AsInt16x8(), y.AsInt16x8() ones := x.Equal(x).ToInt16x8() @@ -329,9 +329,9 @@ func (x Uint16x8) GreaterEqual(y Uint16x8) Mask16x8 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt16x8().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint16x8) LessEqual(y Uint16x8) Mask16x8 { a, b := x.AsInt16x8(), y.AsInt16x8() ones := x.Equal(x).ToInt16x8() @@ -339,18 +339,18 @@ func (x Uint16x8) LessEqual(y Uint16x8) Mask16x8 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt16x8().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint16x8) NotEqual(y Uint16x8) Mask16x8 { a, b := x.AsInt16x8(), y.AsInt16x8() ones := x.Equal(x).ToInt16x8() return a.Equal(b).ToInt16x8().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint32x4) Greater(y Uint32x4) Mask32x4 { a, b := x.AsInt32x4(), y.AsInt32x4() ones := x.Equal(x).ToInt32x4() @@ -358,9 +358,9 @@ func (x Uint32x4) Greater(y Uint32x4) Mask32x4 { return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint32x4) Less(y Uint32x4) Mask32x4 { a, b := x.AsInt32x4(), y.AsInt32x4() ones := x.Equal(x).ToInt32x4() @@ -368,9 +368,9 @@ func (x Uint32x4) Less(y Uint32x4) Mask32x4 { return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint32x4) GreaterEqual(y Uint32x4) Mask32x4 { a, b := x.AsInt32x4(), y.AsInt32x4() ones := x.Equal(x).ToInt32x4() @@ -378,9 +378,9 @@ func (x Uint32x4) GreaterEqual(y Uint32x4) Mask32x4 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt32x4().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint32x4) LessEqual(y Uint32x4) Mask32x4 { a, b := x.AsInt32x4(), y.AsInt32x4() ones := x.Equal(x).ToInt32x4() @@ -388,18 +388,18 @@ func (x Uint32x4) LessEqual(y Uint32x4) Mask32x4 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt32x4().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint32x4) NotEqual(y Uint32x4) Mask32x4 { a, b := x.AsInt32x4(), y.AsInt32x4() ones := x.Equal(x).ToInt32x4() return a.Equal(b).ToInt32x4().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint64x2) Greater(y Uint64x2) Mask64x2 { a, b := x.AsInt64x2(), y.AsInt64x2() ones := x.Equal(x).ToInt64x2() @@ -407,9 +407,9 @@ func (x Uint64x2) Greater(y Uint64x2) Mask64x2 { return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint64x2) Less(y Uint64x2) Mask64x2 { a, b := x.AsInt64x2(), y.AsInt64x2() ones := x.Equal(x).ToInt64x2() @@ -417,9 +417,9 @@ func (x Uint64x2) Less(y Uint64x2) Mask64x2 { return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint64x2) GreaterEqual(y Uint64x2) Mask64x2 { a, b := x.AsInt64x2(), y.AsInt64x2() ones := x.Equal(x).ToInt64x2() @@ -427,9 +427,9 @@ func (x Uint64x2) GreaterEqual(y Uint64x2) Mask64x2 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt64x2().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint64x2) LessEqual(y Uint64x2) Mask64x2 { a, b := x.AsInt64x2(), y.AsInt64x2() ones := x.Equal(x).ToInt64x2() @@ -437,36 +437,36 @@ func (x Uint64x2) LessEqual(y Uint64x2) Mask64x2 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt64x2().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint64x2) NotEqual(y Uint64x2) Mask64x2 { a, b := x.AsInt64x2(), y.AsInt64x2() ones := x.Equal(x).ToInt64x2() return a.Equal(b).ToInt64x2().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x32) Greater(y Uint8x32) Mask8x32 { a, b := x.AsInt8x32(), y.AsInt8x32() signs := BroadcastInt8x32(-1 << (8 - 1)) return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x32) Less(y Uint8x32) Mask8x32 { a, b := x.AsInt8x32(), y.AsInt8x32() signs := BroadcastInt8x32(-1 << (8 - 1)) return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x32) GreaterEqual(y Uint8x32) Mask8x32 { a, b := x.AsInt8x32(), y.AsInt8x32() ones := x.Equal(x).ToInt8x32() @@ -474,9 +474,9 @@ func (x Uint8x32) GreaterEqual(y Uint8x32) Mask8x32 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt8x32().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x32) LessEqual(y Uint8x32) Mask8x32 { a, b := x.AsInt8x32(), y.AsInt8x32() ones := x.Equal(x).ToInt8x32() @@ -484,18 +484,18 @@ func (x Uint8x32) LessEqual(y Uint8x32) Mask8x32 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt8x32().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x32) NotEqual(y Uint8x32) Mask8x32 { a, b := x.AsInt8x32(), y.AsInt8x32() ones := x.Equal(x).ToInt8x32() return a.Equal(b).ToInt8x32().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint16x16) Greater(y Uint16x16) Mask16x16 { a, b := x.AsInt16x16(), y.AsInt16x16() ones := x.Equal(x).ToInt16x16() @@ -503,9 +503,9 @@ func (x Uint16x16) Greater(y Uint16x16) Mask16x16 { return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint16x16) Less(y Uint16x16) Mask16x16 { a, b := x.AsInt16x16(), y.AsInt16x16() ones := x.Equal(x).ToInt16x16() @@ -513,9 +513,9 @@ func (x Uint16x16) Less(y Uint16x16) Mask16x16 { return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint16x16) GreaterEqual(y Uint16x16) Mask16x16 { a, b := x.AsInt16x16(), y.AsInt16x16() ones := x.Equal(x).ToInt16x16() @@ -523,9 +523,9 @@ func (x Uint16x16) GreaterEqual(y Uint16x16) Mask16x16 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt16x16().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint16x16) LessEqual(y Uint16x16) Mask16x16 { a, b := x.AsInt16x16(), y.AsInt16x16() ones := x.Equal(x).ToInt16x16() @@ -533,18 +533,18 @@ func (x Uint16x16) LessEqual(y Uint16x16) Mask16x16 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt16x16().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint16x16) NotEqual(y Uint16x16) Mask16x16 { a, b := x.AsInt16x16(), y.AsInt16x16() ones := x.Equal(x).ToInt16x16() return a.Equal(b).ToInt16x16().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint32x8) Greater(y Uint32x8) Mask32x8 { a, b := x.AsInt32x8(), y.AsInt32x8() ones := x.Equal(x).ToInt32x8() @@ -552,9 +552,9 @@ func (x Uint32x8) Greater(y Uint32x8) Mask32x8 { return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint32x8) Less(y Uint32x8) Mask32x8 { a, b := x.AsInt32x8(), y.AsInt32x8() ones := x.Equal(x).ToInt32x8() @@ -562,9 +562,9 @@ func (x Uint32x8) Less(y Uint32x8) Mask32x8 { return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint32x8) GreaterEqual(y Uint32x8) Mask32x8 { a, b := x.AsInt32x8(), y.AsInt32x8() ones := x.Equal(x).ToInt32x8() @@ -572,9 +572,9 @@ func (x Uint32x8) GreaterEqual(y Uint32x8) Mask32x8 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt32x8().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint32x8) LessEqual(y Uint32x8) Mask32x8 { a, b := x.AsInt32x8(), y.AsInt32x8() ones := x.Equal(x).ToInt32x8() @@ -582,18 +582,18 @@ func (x Uint32x8) LessEqual(y Uint32x8) Mask32x8 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt32x8().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint32x8) NotEqual(y Uint32x8) Mask32x8 { a, b := x.AsInt32x8(), y.AsInt32x8() ones := x.Equal(x).ToInt32x8() return a.Equal(b).ToInt32x8().Xor(ones).asMask() } -// Greater returns a mask whose elements indicate whether x > y +// Greater returns a mask whose elements indicate whether x > y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint64x4) Greater(y Uint64x4) Mask64x4 { a, b := x.AsInt64x4(), y.AsInt64x4() ones := x.Equal(x).ToInt64x4() @@ -601,9 +601,9 @@ func (x Uint64x4) Greater(y Uint64x4) Mask64x4 { return a.Xor(signs).Greater(b.Xor(signs)) } -// Less returns a mask whose elements indicate whether x < y +// Less returns a mask whose elements indicate whether x < y. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint64x4) Less(y Uint64x4) Mask64x4 { a, b := x.AsInt64x4(), y.AsInt64x4() ones := x.Equal(x).ToInt64x4() @@ -611,9 +611,9 @@ func (x Uint64x4) Less(y Uint64x4) Mask64x4 { return b.Xor(signs).Greater(a.Xor(signs)) } -// GreaterEqual returns a mask whose elements indicate whether x >= y +// GreaterEqual returns a mask whose elements indicate whether x >= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint64x4) GreaterEqual(y Uint64x4) Mask64x4 { a, b := x.AsInt64x4(), y.AsInt64x4() ones := x.Equal(x).ToInt64x4() @@ -621,9 +621,9 @@ func (x Uint64x4) GreaterEqual(y Uint64x4) Mask64x4 { return b.Xor(signs).Greater(a.Xor(signs)).ToInt64x4().Xor(ones).asMask() } -// LessEqual returns a mask whose elements indicate whether x <= y +// LessEqual returns a mask whose elements indicate whether x <= y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint64x4) LessEqual(y Uint64x4) Mask64x4 { a, b := x.AsInt64x4(), y.AsInt64x4() ones := x.Equal(x).ToInt64x4() @@ -631,9 +631,9 @@ func (x Uint64x4) LessEqual(y Uint64x4) Mask64x4 { return a.Xor(signs).Greater(b.Xor(signs)).ToInt64x4().Xor(ones).asMask() } -// NotEqual returns a mask whose elements indicate whether x != y +// NotEqual returns a mask whose elements indicate whether x != y. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint64x4) NotEqual(y Uint64x4) Mask64x4 { a, b := x.AsInt64x4(), y.AsInt64x4() ones := x.Equal(x).ToInt64x4() diff --git a/src/simd/archsimd/cpu.go b/src/simd/archsimd/cpu.go index bb0ebbc16a..d0c0ff5426 100644 --- a/src/simd/archsimd/cpu.go +++ b/src/simd/archsimd/cpu.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. 
+// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. //go:build goexperiment.simd diff --git a/src/simd/archsimd/extra_amd64.go b/src/simd/archsimd/extra_amd64.go index 921e148f63..b0dba6d234 100644 --- a/src/simd/archsimd/extra_amd64.go +++ b/src/simd/archsimd/extra_amd64.go @@ -19,7 +19,7 @@ func ClearAVXUpperBits() // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int8x16) IsZero() bool @@ -27,7 +27,7 @@ func (x Int8x16) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int8x32) IsZero() bool @@ -35,7 +35,7 @@ func (x Int8x32) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int16x8) IsZero() bool @@ -43,7 +43,7 @@ func (x Int16x8) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int16x16) IsZero() bool @@ -51,7 +51,7 @@ func (x Int16x16) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. 
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int32x4) IsZero() bool @@ -59,7 +59,7 @@ func (x Int32x4) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int32x8) IsZero() bool @@ -67,7 +67,7 @@ func (x Int32x8) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int64x2) IsZero() bool @@ -75,7 +75,7 @@ func (x Int64x2) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Int64x4) IsZero() bool @@ -83,7 +83,7 @@ func (x Int64x4) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint8x16) IsZero() bool @@ -91,7 +91,7 @@ func (x Uint8x16) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. 
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint8x32) IsZero() bool @@ -99,7 +99,7 @@ func (x Uint8x32) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint16x8) IsZero() bool @@ -107,7 +107,7 @@ func (x Uint16x8) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint16x16) IsZero() bool @@ -115,7 +115,7 @@ func (x Uint16x16) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint32x4) IsZero() bool @@ -123,7 +123,7 @@ func (x Uint32x4) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint32x8) IsZero() bool @@ -131,7 +131,7 @@ func (x Uint32x8) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. 
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint64x2) IsZero() bool @@ -139,7 +139,43 @@ func (x Uint64x2) IsZero() bool // IsZero returns true if all elements of x are zeros. // // This method compiles to VPTEST x, x. -// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y. // // Asm: VPTEST, CPU Feature: AVX func (x Uint64x4) IsZero() bool + +// IsNaN returns a mask whose elements indicate whether the corresponding +// elements of x are NaN. +// +// Asm: VCMPPS, CPU Feature: AVX +func (x Float32x4) IsNaN() Mask32x4 + +// IsNaN returns a mask whose elements indicate whether the corresponding +// elements of x are NaN. +// +// Asm: VCMPPS, CPU Feature: AVX +func (x Float32x8) IsNaN() Mask32x8 + +// IsNaN returns a mask whose elements indicate whether the corresponding +// elements of x are NaN. +// +// Asm: VCMPPS, CPU Feature: AVX512 +func (x Float32x16) IsNaN() Mask32x16 + +// IsNaN returns a mask whose elements indicate whether the corresponding +// elements of x are NaN. +// +// Asm: VCMPPD, CPU Feature: AVX +func (x Float64x2) IsNaN() Mask64x2 + +// IsNaN returns a mask whose elements indicate whether the corresponding +// elements of x are NaN. +// +// Asm: VCMPPD, CPU Feature: AVX +func (x Float64x4) IsNaN() Mask64x4 + +// IsNaN returns a mask whose elements indicate whether the corresponding +// elements of x are NaN. 
+// +// Asm: VCMPPD, CPU Feature: AVX512 +func (x Float64x8) IsNaN() Mask64x8 diff --git a/src/simd/archsimd/internal/simd_test/binary_helpers_test.go b/src/simd/archsimd/internal/simd_test/binary_helpers_test.go index 9c361dbeb9..c725f657f3 100644 --- a/src/simd/archsimd/internal/simd_test/binary_helpers_test.go +++ b/src/simd/archsimd/internal/simd_test/binary_helpers_test.go @@ -1,6 +1,6 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. -//go:build goexperiment.simd +//go:build goexperiment.simd && amd64 // This file contains functions testing binary simd methods. // Each function in this file is specialized for a diff --git a/src/simd/archsimd/internal/simd_test/binary_test.go b/src/simd/archsimd/internal/simd_test/binary_test.go index fa2b9511ca..28efdcb52f 100644 --- a/src/simd/archsimd/internal/simd_test/binary_test.go +++ b/src/simd/archsimd/internal/simd_test/binary_test.go @@ -17,23 +17,29 @@ func TestAdd(t *testing.T) { testFloat64x2Binary(t, archsimd.Float64x2.Add, addSlice[float64]) testFloat64x4Binary(t, archsimd.Float64x4.Add, addSlice[float64]) - testInt16x16Binary(t, archsimd.Int16x16.Add, addSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.Add, addSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.Add, addSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.Add, addSlice[int32]) testInt64x2Binary(t, archsimd.Int64x2.Add, addSlice[int64]) - testInt64x4Binary(t, archsimd.Int64x4.Add, addSlice[int64]) testInt8x16Binary(t, archsimd.Int8x16.Add, addSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.Add, addSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.Add, addSlice[uint16]) - testUint16x8Binary(t, archsimd.Uint16x8.Add, addSlice[uint16]) testUint32x4Binary(t, archsimd.Uint32x4.Add, addSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.Add, addSlice[uint32]) testUint64x2Binary(t, archsimd.Uint64x2.Add, addSlice[uint64]) - testUint64x4Binary(t, archsimd.Uint64x4.Add, 
addSlice[uint64]) + testUint16x8Binary(t, archsimd.Uint16x8.Add, addSlice[uint16]) testUint8x16Binary(t, archsimd.Uint8x16.Add, addSlice[uint8]) - testUint8x32Binary(t, archsimd.Uint8x32.Add, addSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.Add, addSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.Add, addSlice[uint32]) + testUint64x4Binary(t, archsimd.Uint64x4.Add, addSlice[uint64]) + testUint8x32Binary(t, archsimd.Uint8x32.Add, addSlice[uint8]) + } + + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.Add, addSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.Add, addSlice[int32]) + testInt64x4Binary(t, archsimd.Int64x4.Add, addSlice[int64]) + testInt8x32Binary(t, archsimd.Int8x32.Add, addSlice[int8]) + } if archsimd.X86.AVX512() { testFloat32x16Binary(t, archsimd.Float32x16.Add, addSlice[float32]) @@ -55,23 +61,29 @@ func TestSub(t *testing.T) { testFloat64x2Binary(t, archsimd.Float64x2.Sub, subSlice[float64]) testFloat64x4Binary(t, archsimd.Float64x4.Sub, subSlice[float64]) - testInt16x16Binary(t, archsimd.Int16x16.Sub, subSlice[int16]) - testInt16x8Binary(t, archsimd.Int16x8.Sub, subSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.Sub, subSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.Sub, subSlice[int32]) + testInt16x8Binary(t, archsimd.Int16x8.Sub, subSlice[int16]) testInt64x2Binary(t, archsimd.Int64x2.Sub, subSlice[int64]) - testInt64x4Binary(t, archsimd.Int64x4.Sub, subSlice[int64]) testInt8x16Binary(t, archsimd.Int8x16.Sub, subSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.Sub, subSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.Sub, subSlice[uint16]) - testUint16x8Binary(t, archsimd.Uint16x8.Sub, subSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.Sub, subSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.Sub, subSlice[int32]) + testInt64x4Binary(t, archsimd.Int64x4.Sub, subSlice[int64]) + testInt8x32Binary(t, 
archsimd.Int8x32.Sub, subSlice[int8]) + } + testUint32x4Binary(t, archsimd.Uint32x4.Sub, subSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.Sub, subSlice[uint32]) + testUint16x8Binary(t, archsimd.Uint16x8.Sub, subSlice[uint16]) testUint64x2Binary(t, archsimd.Uint64x2.Sub, subSlice[uint64]) - testUint64x4Binary(t, archsimd.Uint64x4.Sub, subSlice[uint64]) testUint8x16Binary(t, archsimd.Uint8x16.Sub, subSlice[uint8]) - testUint8x32Binary(t, archsimd.Uint8x32.Sub, subSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.Sub, subSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.Sub, subSlice[uint32]) + testUint64x4Binary(t, archsimd.Uint64x4.Sub, subSlice[uint64]) + testUint8x32Binary(t, archsimd.Uint8x32.Sub, subSlice[uint8]) + } if archsimd.X86.AVX512() { testFloat32x16Binary(t, archsimd.Float32x16.Sub, subSlice[float32]) @@ -93,10 +105,13 @@ func TestMax(t *testing.T) { // testFloat64x2Binary(t, archsimd.Float64x2.Max, maxSlice[float64]) // nan is wrong // testFloat64x4Binary(t, archsimd.Float64x4.Max, maxSlice[float64]) // nan is wrong - testInt16x16Binary(t, archsimd.Int16x16.Max, maxSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.Max, maxSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.Max, maxSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.Max, maxSlice[int32]) + + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.Max, maxSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.Max, maxSlice[int32]) + } if archsimd.X86.AVX512() { testInt64x2Binary(t, archsimd.Int64x2.Max, maxSlice[int64]) @@ -104,12 +119,18 @@ func TestMax(t *testing.T) { } testInt8x16Binary(t, archsimd.Int8x16.Max, maxSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.Max, maxSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.Max, maxSlice[uint16]) + if archsimd.X86.AVX2() { + testInt8x32Binary(t, archsimd.Int8x32.Max, maxSlice[int8]) + } + testUint16x8Binary(t, archsimd.Uint16x8.Max, maxSlice[uint16]) 
testUint32x4Binary(t, archsimd.Uint32x4.Max, maxSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.Max, maxSlice[uint32]) + + if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.Max, maxSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.Max, maxSlice[uint32]) + } if archsimd.X86.AVX512() { testUint64x2Binary(t, archsimd.Uint64x2.Max, maxSlice[uint64]) @@ -117,7 +138,10 @@ func TestMax(t *testing.T) { } testUint8x16Binary(t, archsimd.Uint8x16.Max, maxSlice[uint8]) - testUint8x32Binary(t, archsimd.Uint8x32.Max, maxSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint8x32Binary(t, archsimd.Uint8x32.Max, maxSlice[uint8]) + } if archsimd.X86.AVX512() { // testFloat32x16Binary(t, archsimd.Float32x16.Max, maxSlice[float32]) // nan is wrong @@ -139,10 +163,13 @@ func TestMin(t *testing.T) { // testFloat64x2Binary(t, archsimd.Float64x2.Min, minSlice[float64]) // nan is wrong // testFloat64x4Binary(t, archsimd.Float64x4.Min, minSlice[float64]) // nan is wrong - testInt16x16Binary(t, archsimd.Int16x16.Min, minSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.Min, minSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.Min, minSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.Min, minSlice[int32]) + + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.Min, minSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.Min, minSlice[int32]) + } if archsimd.X86.AVX512() { testInt64x2Binary(t, archsimd.Int64x2.Min, minSlice[int64]) @@ -150,12 +177,18 @@ func TestMin(t *testing.T) { } testInt8x16Binary(t, archsimd.Int8x16.Min, minSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.Min, minSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.Min, minSlice[uint16]) + if archsimd.X86.AVX2() { + testInt8x32Binary(t, archsimd.Int8x32.Min, minSlice[int8]) + } + testUint16x8Binary(t, archsimd.Uint16x8.Min, minSlice[uint16]) testUint32x4Binary(t, archsimd.Uint32x4.Min, minSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.Min, 
minSlice[uint32]) + + if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.Min, minSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.Min, minSlice[uint32]) + } if archsimd.X86.AVX512() { testUint64x2Binary(t, archsimd.Uint64x2.Min, minSlice[uint64]) @@ -163,7 +196,10 @@ func TestMin(t *testing.T) { } testUint8x16Binary(t, archsimd.Uint8x16.Min, minSlice[uint8]) - testUint8x32Binary(t, archsimd.Uint8x32.Min, minSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint8x32Binary(t, archsimd.Uint8x32.Min, minSlice[uint8]) + } if archsimd.X86.AVX512() { // testFloat32x16Binary(t, archsimd.Float32x16.Min, minSlice[float32]) // nan is wrong @@ -180,23 +216,29 @@ func TestMin(t *testing.T) { } func TestAnd(t *testing.T) { - testInt16x16Binary(t, archsimd.Int16x16.And, andSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.And, andSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.And, andSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.And, andSlice[int32]) testInt64x2Binary(t, archsimd.Int64x2.And, andSlice[int64]) - testInt64x4Binary(t, archsimd.Int64x4.And, andSlice[int64]) testInt8x16Binary(t, archsimd.Int8x16.And, andSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.And, andSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.And, andSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.And, andSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.And, andSlice[int32]) + testInt64x4Binary(t, archsimd.Int64x4.And, andSlice[int64]) + testInt8x32Binary(t, archsimd.Int8x32.And, andSlice[int8]) + } + testUint16x8Binary(t, archsimd.Uint16x8.And, andSlice[uint16]) testUint32x4Binary(t, archsimd.Uint32x4.And, andSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.And, andSlice[uint32]) testUint64x2Binary(t, archsimd.Uint64x2.And, andSlice[uint64]) - testUint64x4Binary(t, archsimd.Uint64x4.And, andSlice[uint64]) testUint8x16Binary(t, archsimd.Uint8x16.And, andSlice[uint8]) - testUint8x32Binary(t, 
archsimd.Uint8x32.And, andSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.And, andSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.And, andSlice[uint32]) + testUint64x4Binary(t, archsimd.Uint64x4.And, andSlice[uint64]) + testUint8x32Binary(t, archsimd.Uint8x32.And, andSlice[uint8]) + } if archsimd.X86.AVX512() { // testInt8x64Binary(t, archsimd.Int8x64.And, andISlice[int8]) // missing @@ -211,23 +253,29 @@ func TestAnd(t *testing.T) { } func TestAndNot(t *testing.T) { - testInt16x16Binary(t, archsimd.Int16x16.AndNot, andNotSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.AndNot, andNotSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.AndNot, andNotSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.AndNot, andNotSlice[int32]) testInt64x2Binary(t, archsimd.Int64x2.AndNot, andNotSlice[int64]) - testInt64x4Binary(t, archsimd.Int64x4.AndNot, andNotSlice[int64]) testInt8x16Binary(t, archsimd.Int8x16.AndNot, andNotSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.AndNot, andNotSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.AndNot, andNotSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.AndNot, andNotSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.AndNot, andNotSlice[int32]) + testInt64x4Binary(t, archsimd.Int64x4.AndNot, andNotSlice[int64]) + testInt8x32Binary(t, archsimd.Int8x32.AndNot, andNotSlice[int8]) + } + + testUint8x16Binary(t, archsimd.Uint8x16.AndNot, andNotSlice[uint8]) testUint16x8Binary(t, archsimd.Uint16x8.AndNot, andNotSlice[uint16]) testUint32x4Binary(t, archsimd.Uint32x4.AndNot, andNotSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.AndNot, andNotSlice[uint32]) testUint64x2Binary(t, archsimd.Uint64x2.AndNot, andNotSlice[uint64]) - testUint64x4Binary(t, archsimd.Uint64x4.AndNot, andNotSlice[uint64]) - testUint8x16Binary(t, archsimd.Uint8x16.AndNot, andNotSlice[uint8]) - testUint8x32Binary(t, archsimd.Uint8x32.AndNot, andNotSlice[uint8]) + + 
if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.AndNot, andNotSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.AndNot, andNotSlice[uint32]) + testUint64x4Binary(t, archsimd.Uint64x4.AndNot, andNotSlice[uint64]) + testUint8x32Binary(t, archsimd.Uint8x32.AndNot, andNotSlice[uint8]) + } if archsimd.X86.AVX512() { testInt8x64Binary(t, archsimd.Int8x64.AndNot, andNotSlice[int8]) @@ -242,23 +290,29 @@ func TestAndNot(t *testing.T) { } func TestXor(t *testing.T) { - testInt16x16Binary(t, archsimd.Int16x16.Xor, xorSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.Xor, xorSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.Xor, xorSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.Xor, xorSlice[int32]) testInt64x2Binary(t, archsimd.Int64x2.Xor, xorSlice[int64]) - testInt64x4Binary(t, archsimd.Int64x4.Xor, xorSlice[int64]) testInt8x16Binary(t, archsimd.Int8x16.Xor, xorSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.Xor, xorSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.Xor, xorSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.Xor, xorSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.Xor, xorSlice[int32]) + testInt64x4Binary(t, archsimd.Int64x4.Xor, xorSlice[int64]) + testInt8x32Binary(t, archsimd.Int8x32.Xor, xorSlice[int8]) + } + testUint16x8Binary(t, archsimd.Uint16x8.Xor, xorSlice[uint16]) testUint32x4Binary(t, archsimd.Uint32x4.Xor, xorSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.Xor, xorSlice[uint32]) testUint64x2Binary(t, archsimd.Uint64x2.Xor, xorSlice[uint64]) - testUint64x4Binary(t, archsimd.Uint64x4.Xor, xorSlice[uint64]) testUint8x16Binary(t, archsimd.Uint8x16.Xor, xorSlice[uint8]) - testUint8x32Binary(t, archsimd.Uint8x32.Xor, xorSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.Xor, xorSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.Xor, xorSlice[uint32]) + testUint64x4Binary(t, archsimd.Uint64x4.Xor, xorSlice[uint64]) 
+ testUint8x32Binary(t, archsimd.Uint8x32.Xor, xorSlice[uint8]) + } if archsimd.X86.AVX512() { // testInt8x64Binary(t, archsimd.Int8x64.Xor, andISlice[int8]) // missing @@ -273,23 +327,29 @@ func TestXor(t *testing.T) { } func TestOr(t *testing.T) { - testInt16x16Binary(t, archsimd.Int16x16.Or, orSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.Or, orSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.Or, orSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.Or, orSlice[int32]) testInt64x2Binary(t, archsimd.Int64x2.Or, orSlice[int64]) - testInt64x4Binary(t, archsimd.Int64x4.Or, orSlice[int64]) testInt8x16Binary(t, archsimd.Int8x16.Or, orSlice[int8]) - testInt8x32Binary(t, archsimd.Int8x32.Or, orSlice[int8]) - testUint16x16Binary(t, archsimd.Uint16x16.Or, orSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.Or, orSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.Or, orSlice[int32]) + testInt64x4Binary(t, archsimd.Int64x4.Or, orSlice[int64]) + testInt8x32Binary(t, archsimd.Int8x32.Or, orSlice[int8]) + } + testUint16x8Binary(t, archsimd.Uint16x8.Or, orSlice[uint16]) testUint32x4Binary(t, archsimd.Uint32x4.Or, orSlice[uint32]) - testUint32x8Binary(t, archsimd.Uint32x8.Or, orSlice[uint32]) testUint64x2Binary(t, archsimd.Uint64x2.Or, orSlice[uint64]) - testUint64x4Binary(t, archsimd.Uint64x4.Or, orSlice[uint64]) testUint8x16Binary(t, archsimd.Uint8x16.Or, orSlice[uint8]) - testUint8x32Binary(t, archsimd.Uint8x32.Or, orSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Binary(t, archsimd.Uint16x16.Or, orSlice[uint16]) + testUint32x8Binary(t, archsimd.Uint32x8.Or, orSlice[uint32]) + testUint64x4Binary(t, archsimd.Uint64x4.Or, orSlice[uint64]) + testUint8x32Binary(t, archsimd.Uint8x32.Or, orSlice[uint8]) + } if archsimd.X86.AVX512() { // testInt8x64Binary(t, archsimd.Int8x64.Or, andISlice[int8]) // missing @@ -309,10 +369,13 @@ func TestMul(t *testing.T) { testFloat64x2Binary(t, archsimd.Float64x2.Mul, mulSlice[float64]) 
testFloat64x4Binary(t, archsimd.Float64x4.Mul, mulSlice[float64]) - testInt16x16Binary(t, archsimd.Int16x16.Mul, mulSlice[int16]) testInt16x8Binary(t, archsimd.Int16x8.Mul, mulSlice[int16]) testInt32x4Binary(t, archsimd.Int32x4.Mul, mulSlice[int32]) - testInt32x8Binary(t, archsimd.Int32x8.Mul, mulSlice[int32]) + + if archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.Mul, mulSlice[int16]) + testInt32x8Binary(t, archsimd.Int32x8.Mul, mulSlice[int32]) + } // testInt8x16Binary(t, archsimd.Int8x16.Mul, mulSlice[int8]) // nope // testInt8x32Binary(t, archsimd.Int8x32.Mul, mulSlice[int8]) diff --git a/src/simd/archsimd/internal/simd_test/compare_helpers_test.go b/src/simd/archsimd/internal/simd_test/compare_helpers_test.go index 279fdc7155..7a33f0ffa4 100644 --- a/src/simd/archsimd/internal/simd_test/compare_helpers_test.go +++ b/src/simd/archsimd/internal/simd_test/compare_helpers_test.go @@ -1,6 +1,6 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. -//go:build goexperiment.simd +//go:build goexperiment.simd && amd64 // This file contains functions testing simd methods that compare two operands. 
// Each function in this file is specialized for a @@ -462,3 +462,87 @@ func testFloat64x8Compare(t *testing.T, f func(_, _ archsimd.Float64x8) archsimd return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) }) }) } + +// testFloat32x4UnaryCompare tests the simd unary comparison method f against the expected behavior generated by want +func testFloat32x4UnaryCompare(t *testing.T, f func(x archsimd.Float32x4) archsimd.Mask32x4, want func(x []float32) []int64) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]int32, n) + f(a).ToInt32x4().StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2UnaryCompare tests the simd unary comparison method f against the expected behavior generated by want +func testFloat64x2UnaryCompare(t *testing.T, f func(x archsimd.Float64x2) archsimd.Mask64x2, want func(x []float64) []int64) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]int64, n) + f(a).ToInt64x2().StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8UnaryCompare tests the simd unary comparison method f against the expected behavior generated by want +func testFloat32x8UnaryCompare(t *testing.T, f func(x archsimd.Float32x8) archsimd.Mask32x8, want func(x []float32) []int64) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]int32, n) + f(a).ToInt32x8().StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4UnaryCompare tests the simd unary comparison method f against the expected behavior 
generated by want +func testFloat64x4UnaryCompare(t *testing.T, f func(x archsimd.Float64x4) archsimd.Mask64x4, want func(x []float64) []int64) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]int64, n) + f(a).ToInt64x4().StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16UnaryCompare tests the simd unary comparison method f against the expected behavior generated by want +func testFloat32x16UnaryCompare(t *testing.T, f func(x archsimd.Float32x16) archsimd.Mask32x16, want func(x []float32) []int64) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]int32, n) + f(a).ToInt32x16().StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8UnaryCompare tests the simd unary comparison method f against the expected behavior generated by want +func testFloat64x8UnaryCompare(t *testing.T, f func(x archsimd.Float64x8) archsimd.Mask64x8, want func(x []float64) []int64) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]int64, n) + f(a).ToInt64x8().StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} diff --git a/src/simd/archsimd/internal/simd_test/compare_test.go b/src/simd/archsimd/internal/simd_test/compare_test.go index 4485e9bdaa..ea8514ac93 100644 --- a/src/simd/archsimd/internal/simd_test/compare_test.go +++ b/src/simd/archsimd/internal/simd_test/compare_test.go @@ -21,32 +21,39 @@ func TestLess(t *testing.T) { testFloat64x2Compare(t, archsimd.Float64x2.Less, lessSlice[float64]) testFloat64x4Compare(t, archsimd.Float64x4.Less, lessSlice[float64]) 
- testInt16x16Compare(t, archsimd.Int16x16.Less, lessSlice[int16]) testInt16x8Compare(t, archsimd.Int16x8.Less, lessSlice[int16]) testInt32x4Compare(t, archsimd.Int32x4.Less, lessSlice[int32]) - testInt32x8Compare(t, archsimd.Int32x8.Less, lessSlice[int32]) testInt64x2Compare(t, archsimd.Int64x2.Less, lessSlice[int64]) - testInt64x4Compare(t, archsimd.Int64x4.Less, lessSlice[int64]) testInt8x16Compare(t, archsimd.Int8x16.Less, lessSlice[int8]) - testInt8x32Compare(t, archsimd.Int8x32.Less, lessSlice[int8]) - testInt16x16Compare(t, archsimd.Int16x16.Less, lessSlice[int16]) + if archsimd.X86.AVX2() { + testInt16x16Compare(t, archsimd.Int16x16.Less, lessSlice[int16]) + testInt32x8Compare(t, archsimd.Int32x8.Less, lessSlice[int32]) + testInt64x4Compare(t, archsimd.Int64x4.Less, lessSlice[int64]) + testInt8x32Compare(t, archsimd.Int8x32.Less, lessSlice[int8]) + + testInt16x16Compare(t, archsimd.Int16x16.Less, lessSlice[int16]) + testInt32x8Compare(t, archsimd.Int32x8.Less, lessSlice[int32]) + testInt64x4Compare(t, archsimd.Int64x4.Less, lessSlice[int64]) + testInt8x32Compare(t, archsimd.Int8x32.Less, lessSlice[int8]) + } + testInt16x8Compare(t, archsimd.Int16x8.Less, lessSlice[int16]) testInt32x4Compare(t, archsimd.Int32x4.Less, lessSlice[int32]) - testInt32x8Compare(t, archsimd.Int32x8.Less, lessSlice[int32]) testInt64x2Compare(t, archsimd.Int64x2.Less, lessSlice[int64]) - testInt64x4Compare(t, archsimd.Int64x4.Less, lessSlice[int64]) testInt8x16Compare(t, archsimd.Int8x16.Less, lessSlice[int8]) - testInt8x32Compare(t, archsimd.Int8x32.Less, lessSlice[int8]) - testUint16x16Compare(t, archsimd.Uint16x16.Less, lessSlice[uint16]) testUint16x8Compare(t, archsimd.Uint16x8.Less, lessSlice[uint16]) testUint32x4Compare(t, archsimd.Uint32x4.Less, lessSlice[uint32]) - testUint32x8Compare(t, archsimd.Uint32x8.Less, lessSlice[uint32]) testUint64x2Compare(t, archsimd.Uint64x2.Less, lessSlice[uint64]) - testUint64x4Compare(t, archsimd.Uint64x4.Less, lessSlice[uint64]) 
testUint8x16Compare(t, archsimd.Uint8x16.Less, lessSlice[uint8]) - testUint8x32Compare(t, archsimd.Uint8x32.Less, lessSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Compare(t, archsimd.Uint16x16.Less, lessSlice[uint16]) + testUint32x8Compare(t, archsimd.Uint32x8.Less, lessSlice[uint32]) + testUint64x4Compare(t, archsimd.Uint64x4.Less, lessSlice[uint64]) + testUint8x32Compare(t, archsimd.Uint8x32.Less, lessSlice[uint8]) + } if archsimd.X86.AVX512() { testUint16x16Compare(t, archsimd.Uint16x16.Less, lessSlice[uint16]) @@ -77,23 +84,29 @@ func TestLessEqual(t *testing.T) { testFloat64x2Compare(t, archsimd.Float64x2.LessEqual, lessEqualSlice[float64]) testFloat64x4Compare(t, archsimd.Float64x4.LessEqual, lessEqualSlice[float64]) - testInt16x16Compare(t, archsimd.Int16x16.LessEqual, lessEqualSlice[int16]) testInt16x8Compare(t, archsimd.Int16x8.LessEqual, lessEqualSlice[int16]) testInt32x4Compare(t, archsimd.Int32x4.LessEqual, lessEqualSlice[int32]) - testInt32x8Compare(t, archsimd.Int32x8.LessEqual, lessEqualSlice[int32]) testInt64x2Compare(t, archsimd.Int64x2.LessEqual, lessEqualSlice[int64]) - testInt64x4Compare(t, archsimd.Int64x4.LessEqual, lessEqualSlice[int64]) testInt8x16Compare(t, archsimd.Int8x16.LessEqual, lessEqualSlice[int8]) - testInt8x32Compare(t, archsimd.Int8x32.LessEqual, lessEqualSlice[int8]) - testUint16x16Compare(t, archsimd.Uint16x16.LessEqual, lessEqualSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Compare(t, archsimd.Int16x16.LessEqual, lessEqualSlice[int16]) + testInt32x8Compare(t, archsimd.Int32x8.LessEqual, lessEqualSlice[int32]) + testInt64x4Compare(t, archsimd.Int64x4.LessEqual, lessEqualSlice[int64]) + testInt8x32Compare(t, archsimd.Int8x32.LessEqual, lessEqualSlice[int8]) + } + testUint16x8Compare(t, archsimd.Uint16x8.LessEqual, lessEqualSlice[uint16]) testUint32x4Compare(t, archsimd.Uint32x4.LessEqual, lessEqualSlice[uint32]) - testUint32x8Compare(t, archsimd.Uint32x8.LessEqual, lessEqualSlice[uint32]) 
testUint64x2Compare(t, archsimd.Uint64x2.LessEqual, lessEqualSlice[uint64]) - testUint64x4Compare(t, archsimd.Uint64x4.LessEqual, lessEqualSlice[uint64]) testUint8x16Compare(t, archsimd.Uint8x16.LessEqual, lessEqualSlice[uint8]) - testUint8x32Compare(t, archsimd.Uint8x32.LessEqual, lessEqualSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Compare(t, archsimd.Uint16x16.LessEqual, lessEqualSlice[uint16]) + testUint32x8Compare(t, archsimd.Uint32x8.LessEqual, lessEqualSlice[uint32]) + testUint64x4Compare(t, archsimd.Uint64x4.LessEqual, lessEqualSlice[uint64]) + testUint8x32Compare(t, archsimd.Uint8x32.LessEqual, lessEqualSlice[uint8]) + } if archsimd.X86.AVX512() { testFloat32x16Compare(t, archsimd.Float32x16.LessEqual, lessEqualSlice[float32]) @@ -115,25 +128,29 @@ func TestGreater(t *testing.T) { testFloat64x2Compare(t, archsimd.Float64x2.Greater, greaterSlice[float64]) testFloat64x4Compare(t, archsimd.Float64x4.Greater, greaterSlice[float64]) - testInt16x16Compare(t, archsimd.Int16x16.Greater, greaterSlice[int16]) testInt16x8Compare(t, archsimd.Int16x8.Greater, greaterSlice[int16]) testInt32x4Compare(t, archsimd.Int32x4.Greater, greaterSlice[int32]) - testInt32x8Compare(t, archsimd.Int32x8.Greater, greaterSlice[int32]) - testInt64x2Compare(t, archsimd.Int64x2.Greater, greaterSlice[int64]) - testInt64x4Compare(t, archsimd.Int64x4.Greater, greaterSlice[int64]) testInt8x16Compare(t, archsimd.Int8x16.Greater, greaterSlice[int8]) - testInt8x32Compare(t, archsimd.Int8x32.Greater, greaterSlice[int8]) - testUint16x16Compare(t, archsimd.Uint16x16.Greater, greaterSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Compare(t, archsimd.Int16x16.Greater, greaterSlice[int16]) + testInt32x8Compare(t, archsimd.Int32x8.Greater, greaterSlice[int32]) + testInt64x4Compare(t, archsimd.Int64x4.Greater, greaterSlice[int64]) + testInt8x32Compare(t, archsimd.Int8x32.Greater, greaterSlice[int8]) + } + testUint16x8Compare(t, archsimd.Uint16x8.Greater, greaterSlice[uint16]) 
testUint32x4Compare(t, archsimd.Uint32x4.Greater, greaterSlice[uint32]) - testUint32x8Compare(t, archsimd.Uint32x8.Greater, greaterSlice[uint32]) - testUint64x2Compare(t, archsimd.Uint64x2.Greater, greaterSlice[uint64]) - testUint64x4Compare(t, archsimd.Uint64x4.Greater, greaterSlice[uint64]) testUint8x16Compare(t, archsimd.Uint8x16.Greater, greaterSlice[uint8]) - testUint8x32Compare(t, archsimd.Uint8x32.Greater, greaterSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Compare(t, archsimd.Uint16x16.Greater, greaterSlice[uint16]) + testUint32x8Compare(t, archsimd.Uint32x8.Greater, greaterSlice[uint32]) + testUint64x4Compare(t, archsimd.Uint64x4.Greater, greaterSlice[uint64]) + testUint8x32Compare(t, archsimd.Uint8x32.Greater, greaterSlice[uint8]) + } if archsimd.X86.AVX512() { @@ -156,23 +173,29 @@ func TestGreaterEqual(t *testing.T) { testFloat64x2Compare(t, archsimd.Float64x2.GreaterEqual, greaterEqualSlice[float64]) testFloat64x4Compare(t, archsimd.Float64x4.GreaterEqual, greaterEqualSlice[float64]) - testInt16x16Compare(t, archsimd.Int16x16.GreaterEqual, greaterEqualSlice[int16]) testInt16x8Compare(t, archsimd.Int16x8.GreaterEqual, greaterEqualSlice[int16]) testInt32x4Compare(t, archsimd.Int32x4.GreaterEqual, greaterEqualSlice[int32]) - testInt32x8Compare(t, archsimd.Int32x8.GreaterEqual, greaterEqualSlice[int32]) testInt64x2Compare(t, archsimd.Int64x2.GreaterEqual, greaterEqualSlice[int64]) - testInt64x4Compare(t, archsimd.Int64x4.GreaterEqual, greaterEqualSlice[int64]) testInt8x16Compare(t, archsimd.Int8x16.GreaterEqual, greaterEqualSlice[int8]) - testInt8x32Compare(t, archsimd.Int8x32.GreaterEqual, greaterEqualSlice[int8]) - testUint16x16Compare(t, archsimd.Uint16x16.GreaterEqual, greaterEqualSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Compare(t, archsimd.Int16x16.GreaterEqual, greaterEqualSlice[int16]) + testInt32x8Compare(t, archsimd.Int32x8.GreaterEqual, greaterEqualSlice[int32]) + testInt64x4Compare(t, archsimd.Int64x4.GreaterEqual, 
greaterEqualSlice[int64]) + testInt8x32Compare(t, archsimd.Int8x32.GreaterEqual, greaterEqualSlice[int8]) + } + testUint16x8Compare(t, archsimd.Uint16x8.GreaterEqual, greaterEqualSlice[uint16]) testUint32x4Compare(t, archsimd.Uint32x4.GreaterEqual, greaterEqualSlice[uint32]) - testUint32x8Compare(t, archsimd.Uint32x8.GreaterEqual, greaterEqualSlice[uint32]) testUint64x2Compare(t, archsimd.Uint64x2.GreaterEqual, greaterEqualSlice[uint64]) - testUint64x4Compare(t, archsimd.Uint64x4.GreaterEqual, greaterEqualSlice[uint64]) testUint8x16Compare(t, archsimd.Uint8x16.GreaterEqual, greaterEqualSlice[uint8]) - testUint8x32Compare(t, archsimd.Uint8x32.GreaterEqual, greaterEqualSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Compare(t, archsimd.Uint16x16.GreaterEqual, greaterEqualSlice[uint16]) + testUint32x8Compare(t, archsimd.Uint32x8.GreaterEqual, greaterEqualSlice[uint32]) + testUint64x4Compare(t, archsimd.Uint64x4.GreaterEqual, greaterEqualSlice[uint64]) + testUint8x32Compare(t, archsimd.Uint8x32.GreaterEqual, greaterEqualSlice[uint8]) + } if archsimd.X86.AVX512() { testFloat32x16Compare(t, archsimd.Float32x16.GreaterEqual, greaterEqualSlice[float32]) @@ -194,23 +217,29 @@ func TestEqual(t *testing.T) { testFloat64x2Compare(t, archsimd.Float64x2.Equal, equalSlice[float64]) testFloat64x4Compare(t, archsimd.Float64x4.Equal, equalSlice[float64]) - testInt16x16Compare(t, archsimd.Int16x16.Equal, equalSlice[int16]) testInt16x8Compare(t, archsimd.Int16x8.Equal, equalSlice[int16]) testInt32x4Compare(t, archsimd.Int32x4.Equal, equalSlice[int32]) - testInt32x8Compare(t, archsimd.Int32x8.Equal, equalSlice[int32]) testInt64x2Compare(t, archsimd.Int64x2.Equal, equalSlice[int64]) - testInt64x4Compare(t, archsimd.Int64x4.Equal, equalSlice[int64]) testInt8x16Compare(t, archsimd.Int8x16.Equal, equalSlice[int8]) - testInt8x32Compare(t, archsimd.Int8x32.Equal, equalSlice[int8]) - testUint16x16Compare(t, archsimd.Uint16x16.Equal, equalSlice[uint16]) + if archsimd.X86.AVX2() { + 
testInt16x16Compare(t, archsimd.Int16x16.Equal, equalSlice[int16]) + testInt32x8Compare(t, archsimd.Int32x8.Equal, equalSlice[int32]) + testInt64x4Compare(t, archsimd.Int64x4.Equal, equalSlice[int64]) + testInt8x32Compare(t, archsimd.Int8x32.Equal, equalSlice[int8]) + } + testUint16x8Compare(t, archsimd.Uint16x8.Equal, equalSlice[uint16]) testUint32x4Compare(t, archsimd.Uint32x4.Equal, equalSlice[uint32]) - testUint32x8Compare(t, archsimd.Uint32x8.Equal, equalSlice[uint32]) testUint64x2Compare(t, archsimd.Uint64x2.Equal, equalSlice[uint64]) - testUint64x4Compare(t, archsimd.Uint64x4.Equal, equalSlice[uint64]) testUint8x16Compare(t, archsimd.Uint8x16.Equal, equalSlice[uint8]) - testUint8x32Compare(t, archsimd.Uint8x32.Equal, equalSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Compare(t, archsimd.Uint16x16.Equal, equalSlice[uint16]) + testUint32x8Compare(t, archsimd.Uint32x8.Equal, equalSlice[uint32]) + testUint64x4Compare(t, archsimd.Uint64x4.Equal, equalSlice[uint64]) + testUint8x32Compare(t, archsimd.Uint8x32.Equal, equalSlice[uint8]) + } if archsimd.X86.AVX512() { testFloat32x16Compare(t, archsimd.Float32x16.Equal, equalSlice[float32]) @@ -232,23 +261,29 @@ func TestNotEqual(t *testing.T) { testFloat64x2Compare(t, archsimd.Float64x2.NotEqual, notEqualSlice[float64]) testFloat64x4Compare(t, archsimd.Float64x4.NotEqual, notEqualSlice[float64]) - testInt16x16Compare(t, archsimd.Int16x16.NotEqual, notEqualSlice[int16]) testInt16x8Compare(t, archsimd.Int16x8.NotEqual, notEqualSlice[int16]) testInt32x4Compare(t, archsimd.Int32x4.NotEqual, notEqualSlice[int32]) - testInt32x8Compare(t, archsimd.Int32x8.NotEqual, notEqualSlice[int32]) testInt64x2Compare(t, archsimd.Int64x2.NotEqual, notEqualSlice[int64]) - testInt64x4Compare(t, archsimd.Int64x4.NotEqual, notEqualSlice[int64]) testInt8x16Compare(t, archsimd.Int8x16.NotEqual, notEqualSlice[int8]) - testInt8x32Compare(t, archsimd.Int8x32.NotEqual, notEqualSlice[int8]) - testUint16x16Compare(t, 
archsimd.Uint16x16.NotEqual, notEqualSlice[uint16]) + if archsimd.X86.AVX2() { + testInt16x16Compare(t, archsimd.Int16x16.NotEqual, notEqualSlice[int16]) + testInt32x8Compare(t, archsimd.Int32x8.NotEqual, notEqualSlice[int32]) + testInt64x4Compare(t, archsimd.Int64x4.NotEqual, notEqualSlice[int64]) + testInt8x32Compare(t, archsimd.Int8x32.NotEqual, notEqualSlice[int8]) + } + testUint16x8Compare(t, archsimd.Uint16x8.NotEqual, notEqualSlice[uint16]) testUint32x4Compare(t, archsimd.Uint32x4.NotEqual, notEqualSlice[uint32]) - testUint32x8Compare(t, archsimd.Uint32x8.NotEqual, notEqualSlice[uint32]) testUint64x2Compare(t, archsimd.Uint64x2.NotEqual, notEqualSlice[uint64]) - testUint64x4Compare(t, archsimd.Uint64x4.NotEqual, notEqualSlice[uint64]) testUint8x16Compare(t, archsimd.Uint8x16.NotEqual, notEqualSlice[uint8]) - testUint8x32Compare(t, archsimd.Uint8x32.NotEqual, notEqualSlice[uint8]) + + if archsimd.X86.AVX2() { + testUint16x16Compare(t, archsimd.Uint16x16.NotEqual, notEqualSlice[uint16]) + testUint32x8Compare(t, archsimd.Uint32x8.NotEqual, notEqualSlice[uint32]) + testUint64x4Compare(t, archsimd.Uint64x4.NotEqual, notEqualSlice[uint64]) + testUint8x32Compare(t, archsimd.Uint8x32.NotEqual, notEqualSlice[uint8]) + } if archsimd.X86.AVX512() { testFloat32x16Compare(t, archsimd.Float32x16.NotEqual, notEqualSlice[float32]) @@ -263,3 +298,49 @@ func TestNotEqual(t *testing.T) { testUint64x8Compare(t, archsimd.Uint64x8.NotEqual, notEqualSlice[uint64]) } } + +func TestIsNaN(t *testing.T) { + testFloat32x4UnaryCompare(t, archsimd.Float32x4.IsNaN, isNaNSlice[float32]) + testFloat32x8UnaryCompare(t, archsimd.Float32x8.IsNaN, isNaNSlice[float32]) + testFloat64x2UnaryCompare(t, archsimd.Float64x2.IsNaN, isNaNSlice[float64]) + testFloat64x4UnaryCompare(t, archsimd.Float64x4.IsNaN, isNaNSlice[float64]) + + if archsimd.X86.AVX512() { + testFloat32x16UnaryCompare(t, archsimd.Float32x16.IsNaN, isNaNSlice[float32]) + testFloat64x8UnaryCompare(t, archsimd.Float64x8.IsNaN, 
isNaNSlice[float64]) + } + + // Test x.IsNaN().Or(y.IsNaN()), which is optimized to VCMPP(S|D) $3, x, y. + want32 := mapCompare(func(x, y float32) bool { return x != x || y != y }) + want64 := mapCompare(func(x, y float64) bool { return x != x || y != y }) + testFloat32x4Compare(t, + func(x, y archsimd.Float32x4) archsimd.Mask32x4 { + return x.IsNaN().Or(y.IsNaN()) + }, want32) + testFloat64x2Compare(t, + func(x, y archsimd.Float64x2) archsimd.Mask64x2 { + return x.IsNaN().Or(y.IsNaN()) + }, want64) + + if archsimd.X86.AVX2() { + testFloat32x8Compare(t, + func(x, y archsimd.Float32x8) archsimd.Mask32x8 { + return x.IsNaN().Or(y.IsNaN()) + }, want32) + testFloat64x4Compare(t, + func(x, y archsimd.Float64x4) archsimd.Mask64x4 { + return x.IsNaN().Or(y.IsNaN()) + }, want64) + } + + if archsimd.X86.AVX512() { + testFloat32x16Compare(t, + func(x, y archsimd.Float32x16) archsimd.Mask32x16 { + return x.IsNaN().Or(y.IsNaN()) + }, want32) + testFloat64x8Compare(t, + func(x, y archsimd.Float64x8) archsimd.Mask64x8 { + return x.IsNaN().Or(y.IsNaN()) + }, want64) + } +} diff --git a/src/simd/archsimd/internal/simd_test/comparemasked_helpers_test.go b/src/simd/archsimd/internal/simd_test/comparemasked_helpers_test.go index 7ceee652a9..c7197568ed 100644 --- a/src/simd/archsimd/internal/simd_test/comparemasked_helpers_test.go +++ b/src/simd/archsimd/internal/simd_test/comparemasked_helpers_test.go @@ -1,6 +1,6 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. -//go:build goexperiment.simd +//go:build goexperiment.simd && amd64 // This file contains functions testing simd methods that compare two operands under a mask. 
// Each function in this file is specialized for a diff --git a/src/simd/archsimd/internal/simd_test/generate.go b/src/simd/archsimd/internal/simd_test/generate.go index e744a5299f..4bc24fea2d 100644 --- a/src/simd/archsimd/internal/simd_test/generate.go +++ b/src/simd/archsimd/internal/simd_test/generate.go @@ -7,5 +7,6 @@ package simd // Invoke code generators. +// The test helpers are generated by tmplgen from the archsimd package. -//go:generate go run -C ../.. genfiles.go +//go:generate go -C ../.. generate diff --git a/src/simd/archsimd/internal/simd_test/helpers_test.go b/src/simd/archsimd/internal/simd_test/helpers_test.go index b9d5098dba..d6963586c0 100644 --- a/src/simd/archsimd/internal/simd_test/helpers_test.go +++ b/src/simd/archsimd/internal/simd_test/helpers_test.go @@ -126,8 +126,22 @@ func map1[T, U any](elem func(x T) U) func(x []T) []U { } } -// map1 returns a function that returns the slice of the results of applying -// comparison function elem to the respective elements of its two slice inputs. +// map1n returns a function that returns the slice of the results of applying +// input parameter elem to the respective elements of its single slice input, +// extended (with zero values) or truncated to length n. +func map1n[T, U any](elem func(x T) U, n int) func(x []T) []U { + return func(x []T) []U { + s := make([]U, n) + for i := range min(len(x), n) { + s[i] = elem(x[i]) + } + return s + } +} + +// mapCompare returns a function that returns the slice of the results of applying +// comparison function elem to the respective elements of its two slice inputs, +// and returns -1 if the comparison is true, 0 otherwise. 
func mapCompare[T number](elem func(x, y T) bool) func(x, y []T) []int64 { return func(x, y []T) []int64 { s := make([]int64, len(x)) @@ -168,12 +182,14 @@ var nzero = -zero var inf = 1 / zero var ninf = -1 / zero var nan = math.NaN() +var snan32 = math.Float32frombits(0x7f800001) +var snan64 = math.Float64frombits(0x7ff0000000000001) // N controls how large the test vectors are const N = 144 -var float32s = nOf(N, []float32{float32(inf), float32(ninf), 1, float32(nan), float32(zero), 2, float32(nan), float32(zero), 3, float32(-zero), float32(1.0 / zero), float32(-1.0 / zero), 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 1000, 1.0 / 1000000, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat32, 1 / math.MaxFloat32, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -160, -3200, -64, -4, -8, -16, -32, -64}) -var float64s = nOf(N, []float64{inf, ninf, nan, zero, -zero, 1 / zero, -1 / zero, 0.0001, 0.0000001, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat64, 1.0 / math.MaxFloat64, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -16, -32, -64}) +var float32s = nOf(N, []float32{float32(inf), float32(ninf), 1, float32(nan), snan32, -float32(nan), -snan32, float32(zero), 2, float32(nan), float32(zero), 3, float32(-zero), float32(1.0 / zero), float32(-1.0 / zero), 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 1000, 1.0 / 1000000, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat32, 1 / math.MaxFloat32, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -160, -3200, -64, -4, -8, -16, -32, -64}) +var float64s = nOf(N, []float64{inf, ninf, nan, snan64, -nan, -snan64, zero, -zero, 1 / zero, -1 / zero, 0.0001, 0.0000001, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat64, 1.0 / math.MaxFloat64, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -16, -32, -64}) var int32s = nOf(N, []int32{1, -1, 0, 2, 4, 8, 1024, 0xffffff, -0xffffff, 0x55555, 0x77777, 0xccccc, -0x55555, -0x77777, -0xccccc, -4, -8, -16, -32, -64}) var uint32s = nOf(N, []uint32{1, 0, 2, 4, 8, 1024, 0xffffff, ^uint32(0xffffff), 0x55555, 0x77777, 0xccccc, ^uint32(0x55555), ^uint32(0x77777), 
^uint32(0xccccc)}) diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go index 83925ae789..36bde92455 100644 --- a/src/simd/archsimd/internal/simd_test/simd_test.go +++ b/src/simd/archsimd/internal/simd_test/simd_test.go @@ -13,6 +13,7 @@ import ( "simd/archsimd" "slices" "testing" + "unsafe" ) func TestMain(m *testing.M) { @@ -225,6 +226,10 @@ func TestShiftAll(t *testing.T) { } func TestSlicesInt8(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} v := archsimd.LoadInt8x32Slice(a) @@ -258,6 +263,10 @@ func TestSlicesInt8GetElem(t *testing.T) { } func TestSlicesInt8TooShortLoad(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } defer func() { if r := recover(); r != nil { t.Logf("Saw EXPECTED panic %v", r) @@ -274,6 +283,10 @@ func TestSlicesInt8TooShortLoad(t *testing.T) { } func TestSlicesInt8TooShortStore(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } defer func() { if r := recover(); r != nil { t.Logf("Saw EXPECTED panic %v", r) @@ -303,6 +316,10 @@ func TestSlicesFloat64(t *testing.T) { // TODO: try to reduce this test to be smaller. 
func TestMergeLocals(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } testMergeLocalswrapper(t, archsimd.Int64x4.Add) } @@ -362,12 +379,79 @@ func TestBitMaskFromBitsLoad(t *testing.T) { } func TestBitMaskToBits(t *testing.T) { - if !archsimd.X86.AVX512() { - t.Skip("Test requires X86.AVX512, not available on this hardware") - return + int8s := []int8{ + 0, 1, 1, 0, 0, 1, 0, 1, + 1, 0, 1, 1, 0, 0, 1, 0, + 1, 0, 0, 1, 1, 0, 1, 0, + 0, 1, 1, 0, 0, 1, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 0, 1, 1, 0, 0, 1, + 1, 0, 1, 0, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, + } + int16s := make([]int16, 32) + for i := range int16s { + int16s[i] = int16(int8s[i]) + } + int32s := make([]int32, 16) + for i := range int32s { + int32s[i] = int32(int8s[i]) + } + int64s := make([]int64, 8) + for i := range int64s { + int64s[i] = int64(int8s[i]) + } + want64 := uint64(0) + for i := range int8s { + want64 |= uint64(int8s[i]) << i + } + want32 := uint32(want64) + want16 := uint16(want64) + want8 := uint8(want64) + want4 := want8 & 0b1111 + want2 := want4 & 0b11 + + if v := archsimd.LoadInt8x16Slice(int8s[:16]).ToMask().ToBits(); v != want16 { + t.Errorf("want %b, got %b", want16, v) + } + if v := archsimd.LoadInt32x4Slice(int32s[:4]).ToMask().ToBits(); v != want4 { + t.Errorf("want %b, got %b", want4, v) + } + if v := archsimd.LoadInt32x8Slice(int32s[:8]).ToMask().ToBits(); v != want8 { + t.Errorf("want %b, got %b", want8, v) + } + if v := archsimd.LoadInt64x2Slice(int64s[:2]).ToMask().ToBits(); v != want2 { + t.Errorf("want %b, got %b", want2, v) + } + if v := archsimd.LoadInt64x4Slice(int64s[:4]).ToMask().ToBits(); v != want4 { + t.Errorf("want %b, got %b", want4, v) } - if v := archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 { - t.Errorf("Want 0b101, got %b", v) + + if archsimd.X86.AVX2() { + if v := archsimd.LoadInt8x32Slice(int8s[:32]).ToMask().ToBits(); v != want32 { + 
t.Errorf("want %b, got %b", want32, v) + } + } + + if archsimd.X86.AVX512() { + if v := archsimd.LoadInt8x64Slice(int8s).ToMask().ToBits(); v != want64 { + t.Errorf("want %b, got %b", want64, v) + } + if v := archsimd.LoadInt16x8Slice(int16s[:8]).ToMask().ToBits(); v != want8 { + t.Errorf("want %b, got %b", want8, v) + } + if v := archsimd.LoadInt16x16Slice(int16s[:16]).ToMask().ToBits(); v != want16 { + t.Errorf("want %b, got %b", want16, v) + } + if v := archsimd.LoadInt16x32Slice(int16s).ToMask().ToBits(); v != want32 { + t.Errorf("want %b, got %b", want32, v) + } + if v := archsimd.LoadInt32x16Slice(int32s).ToMask().ToBits(); v != want16 { + t.Errorf("want %b, got %b", want16, v) + } + if v := archsimd.LoadInt64x8Slice(int64s).ToMask().ToBits(); v != want8 { + t.Errorf("want %b, got %b", want8, v) + } } } @@ -385,6 +469,10 @@ func TestBitMaskToBitsStore(t *testing.T) { } func TestMergeFloat(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } k := make([]int64, 4, 4) s := make([]float64, 4, 4) @@ -472,6 +560,10 @@ func TestBroadcastUint16x8(t *testing.T) { } func TestBroadcastInt8x32(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } s := make([]int8, 32, 32) archsimd.BroadcastInt8x32(-123).StoreSlice(s) checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123, @@ -1105,6 +1197,10 @@ func TestSelectTernOptInt32x16(t *testing.T) { } func TestMaskedMerge(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } x := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4}) y := archsimd.LoadInt64x4Slice([]int64{5, 6, 1, 1}) z := archsimd.LoadInt64x4Slice([]int64{-1, -2, -3, -4}) @@ -1123,40 +1219,6 @@ func TestMaskedMerge(t *testing.T) { } } -func TestDotProductQuadruple(t *testing.T) { - if !archsimd.X86.AVXVNNI() { - t.Skip("Test requires X86.AVXVNNI, 
not available on this hardware") - return - } - xd := make([]int8, 16) - yd := make([]uint8, 16) - zd := make([]int32, 4) - wanted1 := make([]int32, 4) - wanted2 := make([]int32, 4) - res1 := make([]int32, 4) - res2 := make([]int32, 4) - for i := range 4 { - xd[i] = 5 - yd[i] = 6 - zd[i] = 3 - wanted1[i] = 30 - wanted2[i] = 30 - } - x := archsimd.LoadInt8x16Slice(xd) - y := archsimd.LoadUint8x16Slice(yd) - z := archsimd.LoadInt32x4Slice(zd) - x.DotProductQuadruple(y).StoreSlice(res1) - x.DotProductQuadruple(y).Add(z).StoreSlice(res1) - for i := range 4 { - if res1[i] != wanted1[i] { - t.Errorf("got %d wanted %d", res1[i], wanted1[i]) - } - if res2[i] != wanted2[i] { - t.Errorf("got %d wanted %d", res2[i], wanted2[i]) - } - } -} - func TestPermuteScalars(t *testing.T) { x := []int32{11, 12, 13, 14} want := []int32{12, 13, 14, 11} @@ -1166,6 +1228,10 @@ func TestPermuteScalars(t *testing.T) { } func TestPermuteScalarsGrouped(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } x := []int32{11, 12, 13, 14, 21, 22, 23, 24} want := []int32{12, 13, 14, 11, 22, 23, 24, 21} got := make([]int32, 8) @@ -1190,6 +1256,10 @@ func TestPermuteScalarsLo(t *testing.T) { } func TestPermuteScalarsHiGrouped(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114} want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111} got := make([]int16, len(x)) @@ -1198,6 +1268,10 @@ func TestPermuteScalarsHiGrouped(t *testing.T) { } func TestPermuteScalarsLoGrouped(t *testing.T) { + if !archsimd.X86.AVX2() { + t.Skip("Test requires X86.AVX2, not available on this hardware") + return + } x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17} want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17} got := 
make([]int16, len(x)) @@ -1222,3 +1296,147 @@ func TestClMul(t *testing.T) { foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0}) } + +func addPairsSlice[T number](a, b []T) []T { + r := make([]T, len(a)) + for i := range len(a) / 2 { + r[i] = a[2*i] + a[2*i+1] + r[i+len(a)/2] = b[2*i] + b[2*i+1] + } + return r +} + +func subPairsSlice[T number](a, b []T) []T { + r := make([]T, len(a)) + for i := range len(a) / 2 { + r[i] = a[2*i] - a[2*i+1] + r[i+len(a)/2] = b[2*i] - b[2*i+1] + } + return r +} + +func addPairsGroupedSlice[T number](a, b []T) []T { + group := int(128 / unsafe.Sizeof(a[0])) + r := make([]T, 0, len(a)) + for i := range len(a) / group { + r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...) + } + return r +} + +func subPairsGroupedSlice[T number](a, b []T) []T { + group := int(128 / unsafe.Sizeof(a[0])) + r := make([]T, 0, len(a)) + for i := range len(a) / group { + r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...) + } + return r +} + +func TestAddSubPairs(t *testing.T) { + testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16]) + testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16]) + testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16]) + testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16]) + testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32]) + testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32]) + testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32]) + testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32]) + testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32]) + testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32]) + testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64]) + testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64]) + + // Grouped versions + if 
archsimd.X86.AVX2() { + testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16]) + testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16]) + testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16]) + testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16]) + testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32]) + testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32]) + testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32]) + testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32]) + testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32]) + testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32]) + testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64]) + testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64]) + } +} + +func convConcatSlice[T, U number](a, b []T, conv func(T) U) []U { + r := make([]U, len(a)+len(b)) + for i, v := range a { + r[i] = conv(v) + } + for i, v := range b { + r[len(a)+i] = conv(v) + } + return r +} + +func convConcatGroupedSlice[T, U number](a, b []T, conv func(T) U) []U { + group := int(128 / unsafe.Sizeof(a[0])) + r := make([]U, 0, len(a)+len(b)) + for i := 0; i < len(a)/group; i++ { + r = append(r, convConcatSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group], conv)...) 
+ } + return r +} + +func TestSaturateConcat(t *testing.T) { + // Int32x4.SaturateToInt16Concat + forSlicePair(t, int32s, 4, func(x, y []int32) bool { + a, b := archsimd.LoadInt32x4Slice(x), archsimd.LoadInt32x4Slice(y) + var out [8]int16 + a.SaturateToInt16Concat(b).Store(&out) + want := convConcatSlice(x, y, satToInt16) + return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) }) + }) + // Int32x4.SaturateToUint16Concat + forSlicePair(t, int32s, 4, func(x, y []int32) bool { + a, b := archsimd.LoadInt32x4Slice(x), archsimd.LoadInt32x4Slice(y) + var out [8]uint16 + a.SaturateToUint16Concat(b).Store(&out) + want := convConcatSlice(x, y, satToUint16) + return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) }) + }) + + if archsimd.X86.AVX2() { + // Int32x8.SaturateToInt16ConcatGrouped + forSlicePair(t, int32s, 8, func(x, y []int32) bool { + a, b := archsimd.LoadInt32x8Slice(x), archsimd.LoadInt32x8Slice(y) + var out [16]int16 + a.SaturateToInt16ConcatGrouped(b).Store(&out) + want := convConcatGroupedSlice(x, y, satToInt16) + return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) }) + }) + // Int32x8.SaturateToUint16ConcatGrouped + forSlicePair(t, int32s, 8, func(x, y []int32) bool { + a, b := archsimd.LoadInt32x8Slice(x), archsimd.LoadInt32x8Slice(y) + var out [16]uint16 + a.SaturateToUint16ConcatGrouped(b).Store(&out) + want := convConcatGroupedSlice(x, y, satToUint16) + return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) }) + }) + } + + if archsimd.X86.AVX512() { + // Int32x16.SaturateToInt16ConcatGrouped + forSlicePair(t, int32s, 16, func(x, y []int32) bool { + a, b := archsimd.LoadInt32x16Slice(x), archsimd.LoadInt32x16Slice(y) + var out [32]int16 + a.SaturateToInt16ConcatGrouped(b).Store(&out) + want := convConcatGroupedSlice(x, y, satToInt16) + return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) }) + }) + // 
Int32x16.SaturateToUint16ConcatGrouped + forSlicePair(t, int32s, 16, func(x, y []int32) bool { + a, b := archsimd.LoadInt32x16Slice(x), archsimd.LoadInt32x16Slice(y) + var out [32]uint16 + a.SaturateToUint16ConcatGrouped(b).Store(&out) + want := convConcatGroupedSlice(x, y, satToUint16) + return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) }) + }) + } +} diff --git a/src/simd/archsimd/internal/simd_test/simulation_helpers_test.go b/src/simd/archsimd/internal/simd_test/simulation_helpers_test.go index 2f040ffb3e..ac60b6d377 100644 --- a/src/simd/archsimd/internal/simd_test/simulation_helpers_test.go +++ b/src/simd/archsimd/internal/simd_test/simulation_helpers_test.go @@ -29,6 +29,10 @@ func notEqual[T number](x, y T) bool { return x != y } +func isNaN[T float](x T) bool { + return x != x +} + func abs[T number](x T) T { // TODO this will need a non-standard FP-equality test. if x == 0 { // true if x is -0. @@ -121,16 +125,6 @@ func toUint64[T number](x T) uint64 { } func toUint32[T number](x T) uint32 { - switch y := (any(x)).(type) { - case float32: - if y < 0 || y > float32(math.MaxUint32) || y != y { - return math.MaxUint32 - } - case float64: - if y < 0 || y > float64(math.MaxUint32) || y != y { - return math.MaxUint32 - } - } return uint32(x) } @@ -158,6 +152,74 @@ func toFloat64[T number](x T) float64 { return float64(x) } +// X86 specific behavior for conversion from float to int32. +// If the value cannot be represented as int32, it returns -0x80000000. +func floatToInt32_x86[T float](x T) int32 { + switch y := (any(x)).(type) { + case float32: + if y != y || y < math.MinInt32 || + y >= math.MaxInt32 { // float32(MaxInt32) == 0x80000000, actually overflows + return -0x80000000 + } + case float64: + if y != y || y < math.MinInt32 || + y > math.MaxInt32 { // float64(MaxInt32) is exact, no overflow + return -0x80000000 + } + } + return int32(x) +} + +// X86 specific behavior for conversion from float to int64. 
+// If the value cannot be represented as int64, it returns -0x80000000_00000000. +func floatToInt64_x86[T float](x T) int64 { + switch y := (any(x)).(type) { + case float32: + if y != y || y < math.MinInt64 || + y >= math.MaxInt64 { // float32(MaxInt64) == 0x80000000_00000000, actually overflows + return -0x80000000_00000000 + } + case float64: + if y != y || y < math.MinInt64 || + y >= math.MaxInt64 { // float64(MaxInt64) == 0x80000000_00000000, also overflows + return -0x80000000_00000000 + } + } + return int64(x) +} + +// X86 specific behavior for conversion from float to uint32. +// If the value cannot be represented as uint32, it returns 1<<32 - 1. +func floatToUint32_x86[T float](x T) uint32 { + switch y := (any(x)).(type) { + case float32: + if y < 0 || y > math.MaxUint32 || y != y { + return 1<<32 - 1 + } + case float64: + if y < 0 || y > math.MaxUint32 || y != y { + return 1<<32 - 1 + } + } + return uint32(x) +} + +// X86 specific behavior for conversion from float to uint64. +// If the value cannot be represented as uint64, it returns 1<<64 - 1. 
+func floatToUint64_x86[T float](x T) uint64 { + switch y := (any(x)).(type) { + case float32: + if y < 0 || y > math.MaxUint64 || y != y { + return 1<<64 - 1 + } + case float64: + if y < 0 || y > math.MaxUint64 || y != y { + return 1<<64 - 1 + } + } + return uint64(x) +} + func ceilResidueForPrecision[T float](i int) func(T) T { f := 1.0 for i > 0 { @@ -241,6 +303,15 @@ func notEqualSlice[T number](x, y []T) []int64 { return mapCompare[T](notEqual)(x, y) } +func isNaNSlice[T float](x []T) []int64 { + return map1[T](func(x T) int64 { + if isNaN(x) { + return -1 + } + return 0 + })(x) +} + func ceilSlice[T float](x []T) []T { return map1[T](ceil)(x) } @@ -272,3 +343,90 @@ func imaSlice[T integer](x, y, z []T) []T { func fmaSlice[T float](x, y, z []T) []T { return map3[T](fma)(x, y, z) } + +func satToInt8[T integer](x T) int8 { + var m int8 = -128 + var M int8 = 127 + if T(M) < T(m) { // expecting T being a larger type + panic("bad input type") + } + if x < T(m) { + return m + } + if x > T(M) { + return M + } + return int8(x) +} + +func satToUint8[T integer](x T) uint8 { + var M uint8 = 255 + if T(M) < 0 { // expecting T being a larger type + panic("bad input type") + } + if x < 0 { + return 0 + } + if x > T(M) { + return M + } + return uint8(x) +} + +func satToInt16[T integer](x T) int16 { + var m int16 = -32768 + var M int16 = 32767 + if T(M) < T(m) { // expecting T being a larger type + panic("bad input type") + } + if x < T(m) { + return m + } + if x > T(M) { + return M + } + return int16(x) +} + +func satToUint16[T integer](x T) uint16 { + var M uint16 = 65535 + if T(M) < 0 { // expecting T being a larger type + panic("bad input type") + } + if x < 0 { + return 0 + } + if x > T(M) { + return M + } + return uint16(x) +} + +func satToInt32[T integer](x T) int32 { + var m int32 = -1 << 31 + var M int32 = 1<<31 - 1 + if T(M) < T(m) { // expecting T being a larger type + panic("bad input type") + } + if x < T(m) { + return m + } + if x > T(M) { + return M + } + 
return int32(x) +} + +func satToUint32[T integer](x T) uint32 { + var M uint32 = 1<<32 - 1 + if T(M) < 0 { // expecting T being a larger type + panic("bad input type") + } + if x < 0 { + return 0 + } + if x > T(M) { + return M + } + return uint32(x) +} diff --git a/src/simd/archsimd/internal/simd_test/ternary_helpers_test.go b/src/simd/archsimd/internal/simd_test/ternary_helpers_test.go index c37f9ef0ca..2e25010890 100644 --- a/src/simd/archsimd/internal/simd_test/ternary_helpers_test.go +++ b/src/simd/archsimd/internal/simd_test/ternary_helpers_test.go @@ -1,6 +1,6 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. -//go:build goexperiment.simd +//go:build goexperiment.simd && amd64 // This file contains functions testing ternary simd methods. // Each function in this file is specialized for a diff --git a/src/simd/archsimd/internal/simd_test/unary_helpers_test.go b/src/simd/archsimd/internal/simd_test/unary_helpers_test.go index e2610ad98b..5d14c4ff05 100644 --- a/src/simd/archsimd/internal/simd_test/unary_helpers_test.go +++ b/src/simd/archsimd/internal/simd_test/unary_helpers_test.go @@ -1,6 +1,6 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. -//go:build goexperiment.simd +//go:build goexperiment.simd && amd64 // This file contains functions testing unary simd methods. // Each function in this file is specialized for a @@ -433,915 +433,8400 @@ func testFloat64x8Unary(t *testing.T, f func(_ archsimd.Float64x8) archsimd.Floa }) } -// testInt8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt8x16ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToInt8(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int8x16, want func(x []int8) []int8) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToInt8(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int8x16, want func(x []int16) []int8) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x4ConvertToInt8(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int8x16, want func(x []int32) []int8) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToInt8(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int8x16, want func(x []int64) []int8) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x16ConvertToInt8(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int8x16, want func(x []uint8) []int8) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToInt8(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int8x16, want func(x []uint16) []int8) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToInt8(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int8x16, want func(x []uint32) []int8) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x2ConvertToInt8(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int8x16, want func(x []uint64) []int8) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x4ConvertToInt8(t *testing.T, f func(x archsimd.Float32x4) archsimd.Int8x16, want func(x []float32) []int8) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToInt8(t *testing.T, f func(x archsimd.Float64x2) archsimd.Int8x16, want func(x []float64) []int8) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x32ConvertToInt8(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int8x32, want func(x []int8) []int8) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToInt8(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int8x16, want func(x []int16) []int8) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x8ConvertToInt8(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int8x16, want func(x []int32) []int8) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToInt8(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int8x16, want func(x []int64) []int8) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x32ConvertToInt8(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int8x32, want func(x []uint8) []int8) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToInt8(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int8x16, want func(x []uint16) []int8) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToInt8(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int8x16, want func(x []uint32) []int8) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x4ConvertToInt8(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int8x16, want func(x []uint64) []int8) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x8ConvertToInt8(t *testing.T, f func(x archsimd.Float32x8) archsimd.Int8x16, want func(x []float32) []int8) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToInt8(t *testing.T, f func(x archsimd.Float64x4) archsimd.Int8x16, want func(x []float64) []int8) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x64ConvertToInt8(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int8x64, want func(x []int8) []int8) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int8, 64) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToInt8(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int8x32, want func(x []int16) []int8) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]int8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToInt8(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int8x16, want func(x []int32) []int8) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToInt8(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int8x16, want func(x []int64) []int8) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToInt8(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int8x64, want func(x []uint8) []int8) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int8, 64) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToInt8(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int8x32, want func(x []uint16) []int8) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]int8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x16ConvertToInt8(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int8x16, want func(x []uint32) []int8) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x8ConvertToInt8(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int8x16, want func(x []uint64) []int8) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToInt8(t *testing.T, f func(x archsimd.Float32x16) archsimd.Int8x16, want func(x []float32) []int8) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToInt8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToInt8(t *testing.T, f func(x archsimd.Float64x8) archsimd.Int8x16, want func(x []float64) []int8) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]int8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToUint8(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint8x16, want func(x []int8) []uint8) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToUint8(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint8x16, want func(x []int16) []uint8) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x4ConvertToUint8(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint8x16, want func(x []int32) []uint8) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToUint8(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint8x16, want func(x []int64) []uint8) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x16ConvertToUint8(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint8x16, want func(x []uint8) []uint8) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToUint8(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint8x16, want func(x []uint16) []uint8) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToUint8(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint8x16, want func(x []uint32) []uint8) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x2ConvertToUint8(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint8x16, want func(x []uint64) []uint8) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x4ConvertToUint8(t *testing.T, f func(x archsimd.Float32x4) archsimd.Uint8x16, want func(x []float32) []uint8) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToUint8(t *testing.T, f func(x archsimd.Float64x2) archsimd.Uint8x16, want func(x []float64) []uint8) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x32ConvertToUint8(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint8x32, want func(x []int8) []uint8) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]uint8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToUint8(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint8x16, want func(x []int16) []uint8) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x8ConvertToUint8(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint8x16, want func(x []int32) []uint8) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToUint8(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint8x16, want func(x []int64) []uint8) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x32ConvertToUint8(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint8x32, want func(x []uint8) []uint8) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]uint8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToUint8(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint8x16, want func(x []uint16) []uint8) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToUint8(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint8x16, want func(x []uint32) []uint8) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x4ConvertToUint8(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint8x16, want func(x []uint64) []uint8) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x8ConvertToUint8(t *testing.T, f func(x archsimd.Float32x8) archsimd.Uint8x16, want func(x []float32) []uint8) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToUint8(t *testing.T, f func(x archsimd.Float64x4) archsimd.Uint8x16, want func(x []float64) []uint8) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x64ConvertToUint8(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint8x64, want func(x []int8) []uint8) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]uint8, 64) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToUint8(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint8x32, want func(x []int16) []uint8) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]uint8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToUint8(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint8x16, want func(x []int32) []uint8) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToUint8(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint8x16, want func(x []int64) []uint8) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToUint8(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint8x64, want func(x []uint8) []uint8) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]uint8, 64) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToUint8(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint8x32, want func(x []uint16) []uint8) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]uint8, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x16ConvertToUint8(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint8x16, want func(x []uint32) []uint8) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x8ConvertToUint8(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint8x16, want func(x []uint64) []uint8) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToUint8(t *testing.T, f func(x archsimd.Float32x16) archsimd.Uint8x16, want func(x []float32) []uint8) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToUint8 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToUint8(t *testing.T, f func(x archsimd.Float64x8) archsimd.Uint8x16, want func(x []float64) []uint8) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]uint8, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToInt16(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int16x16, want func(x []int8) []int16) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]int16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToInt16(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int16x8, want func(x []int16) []int16) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x4ConvertToInt16(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int16x8, want func(x []int32) []int16) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToInt16(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int16x8, want func(x []int64) []int16) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x16ConvertToInt16(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int16x16, want func(x []uint8) []int16) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]int16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToInt16(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int16x8, want func(x []uint16) []int16) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToInt16(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int16x8, want func(x []uint32) []int16) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x2ConvertToInt16(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int16x8, want func(x []uint64) []int16) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x4ConvertToInt16(t *testing.T, f func(x archsimd.Float32x4) archsimd.Int16x8, want func(x []float32) []int16) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToInt16(t *testing.T, f func(x archsimd.Float64x2) archsimd.Int16x8, want func(x []float64) []int16) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x32ConvertToInt16(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int16x32, want func(x []int8) []int16) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToInt16(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int16x16, want func(x []int16) []int16) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]int16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x8ConvertToInt16(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int16x8, want func(x []int32) []int16) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToInt16(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int16x8, want func(x []int64) []int16) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x32ConvertToInt16(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int16x32, want func(x []uint8) []int16) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToInt16(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int16x16, want func(x []uint16) []int16) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]int16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToInt16(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int16x8, want func(x []uint32) []int16) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x4ConvertToInt16(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int16x8, want func(x []uint64) []int16) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x8ConvertToInt16(t *testing.T, f func(x archsimd.Float32x8) archsimd.Int16x8, want func(x []float32) []int16) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToInt16(t *testing.T, f func(x archsimd.Float64x4) archsimd.Int16x8, want func(x []float64) []int16) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x64ConvertToInt16(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int16x32, want func(x []int8) []int16) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToInt16(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int16x32, want func(x []int16) []int16) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]int16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToInt16(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int16x16, want func(x []int32) []int16) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]int16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToInt16(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int16x8, want func(x []int64) []int16) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToInt16(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int16x32, want func(x []uint8) []int16) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToInt16(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int16x32, want func(x []uint16) []int16) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]int16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x16ConvertToInt16(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int16x16, want func(x []uint32) []int16) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]int16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x8ConvertToInt16(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int16x8, want func(x []uint64) []int16) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToInt16(t *testing.T, f func(x archsimd.Float32x16) archsimd.Int16x16, want func(x []float32) []int16) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]int16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToInt16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToInt16(t *testing.T, f func(x archsimd.Float64x8) archsimd.Int16x8, want func(x []float64) []int16) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToUint16(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint16x16, want func(x []int8) []uint16) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]uint16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToUint16(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint16x8, want func(x []int16) []uint16) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x4ConvertToUint16(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint16x8, want func(x []int32) []uint16) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToUint16(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint16x8, want func(x []int64) []uint16) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint16x16, want func(x []uint8) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]uint16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint16x8, want func(x []uint16) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToUint16(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint16x8, want func(x []uint32) []uint16) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x2ConvertToUint16(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint16x8, want func(x []uint64) []uint16) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x4ConvertToUint16(t *testing.T, f func(x archsimd.Float32x4) archsimd.Uint16x8, want func(x []float32) []uint16) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToUint16(t *testing.T, f func(x archsimd.Float64x2) archsimd.Uint16x8, want func(x []float64) []uint16) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x32ConvertToUint16(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint16x32, want func(x []int8) []uint16) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]uint16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToUint16(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint16x16, want func(x []int16) []uint16) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]uint16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x8ConvertToUint16(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint16x8, want func(x []int32) []uint16) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToUint16(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint16x8, want func(x []int64) []uint16) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x32ConvertToUint16(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint16x32, want func(x []uint8) []uint16) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]uint16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint16x16, want func(x []uint16) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]uint16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint16x8, want func(x []uint32) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x4ConvertToUint16(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint16x8, want func(x []uint64) []uint16) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x8ConvertToUint16(t *testing.T, f func(x archsimd.Float32x8) archsimd.Uint16x8, want func(x []float32) []uint16) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToUint16(t *testing.T, f func(x archsimd.Float64x4) archsimd.Uint16x8, want func(x []float64) []uint16) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x64ConvertToUint16(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint16x32, want func(x []int8) []uint16) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]uint16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToUint16(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint16x32, want func(x []int16) []uint16) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]uint16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToUint16(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint16x16, want func(x []int32) []uint16) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]uint16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToUint16(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint16x8, want func(x []int64) []uint16) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToUint16(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint16x32, want func(x []uint8) []uint16) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]uint16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint16x32, want func(x []uint16) []uint16) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]uint16, 32) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint16x16, want func(x []uint32) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]uint16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint16x8, want func(x []uint64) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToUint16(t *testing.T, f func(x archsimd.Float32x16) archsimd.Uint16x16, want func(x []float32) []uint16) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]uint16, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToUint16(t *testing.T, f func(x archsimd.Float64x8) archsimd.Uint16x8, want func(x []float64) []uint16) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt8x16ConvertToInt32(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int32x16, want func(x []int8) []int32) { n := 16 t.Helper() forSlice(t, int8s, n, func(x []int8) bool { t.Helper() a := archsimd.LoadInt8x16Slice(x) - g := make([]int32, n) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt16x8ConvertToInt32(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int32x8, want func(x []int16) []int32) { n := 8 t.Helper() forSlice(t, int16s, n, func(x []int16) bool { t.Helper() a := archsimd.LoadInt16x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt32x4ConvertToInt32(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int32x4, want func(x []int32) []int32) { n := 4 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x4Slice(x) - g := make([]int32, n) + g := make([]int32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt64x2ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToInt32(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int32x4, want func(x []int64) []int32) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint8x16ConvertToInt32(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int32x16, want func(x []uint8) []int32) { n := 16 t.Helper() forSlice(t, uint8s, n, func(x []uint8) bool { t.Helper() a := archsimd.LoadUint8x16Slice(x) - g := make([]int32, n) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint16x8ConvertToInt32(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int32x8, want func(x []uint16) []int32) { n := 8 t.Helper() forSlice(t, uint16s, n, func(x []uint16) bool { t.Helper() a := archsimd.LoadUint16x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint32x4ConvertToInt32(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int32x4, want func(x []uint32) []int32) { n := 4 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x4Slice(x) - g := make([]int32, n) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint64x2ConvertToInt32(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int32x4, want func(x []uint64) []int32) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testFloat32x4ConvertToInt32(t *testing.T, f func(x archsimd.Float32x4) archsimd.Int32x4, want func(x []float32) []int32) { n := 4 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x4Slice(x) - g := make([]int32, n) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToInt32(t *testing.T, f func(x archsimd.Float64x2) archsimd.Int32x4, want func(x []float64) []int32) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt8x32ConvertToInt32(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int32x16, want func(x []int8) []int32) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt16x16ConvertToInt32(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int32x16, want func(x []int16) []int32) { n := 16 t.Helper() forSlice(t, int16s, n, func(x []int16) bool { t.Helper() a := archsimd.LoadInt16x16Slice(x) - g := make([]int32, n) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testInt32x8ConvertToInt32(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int32x8, want func(x []int32) []int32) { n := 8 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt64x4ConvertToInt32(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int32x4, want func(x []int64) []int32) { n := 4 t.Helper() forSlice(t, int64s, n, func(x []int64) bool { t.Helper() a := archsimd.LoadInt64x4Slice(x) - g := make([]int32, n) + g := make([]int32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint8x32ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint8x32ConvertToInt32(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int32x16, want func(x []uint8) []int32) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint16x16ConvertToInt32(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int32x16, want func(x []uint16) []int32) { n := 16 t.Helper() forSlice(t, uint16s, n, func(x []uint16) bool { t.Helper() a := archsimd.LoadUint16x16Slice(x) - g := make([]int32, n) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint32x8ConvertToInt32(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int32x8, want func(x []uint32) []int32) { n := 8 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint64x4ConvertToInt32(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int32x4, want func(x []uint64) []int32) { n := 4 t.Helper() forSlice(t, uint64s, n, func(x []uint64) bool { t.Helper() a := archsimd.LoadUint64x4Slice(x) - g := make([]int32, n) + g := make([]int32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testFloat32x8ConvertToInt32(t *testing.T, f func(x archsimd.Float32x8) archsimd.Int32x8, want func(x []float32) []int32) { n := 8 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testFloat64x4ConvertToInt32(t *testing.T, f func(x archsimd.Float64x4) archsimd.Int32x4, want func(x []float64) []int32) { n := 4 t.Helper() forSlice(t, float64s, n, func(x []float64) bool { t.Helper() a := archsimd.LoadFloat64x4Slice(x) - g := make([]int32, n) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt8x64ConvertToInt32(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int32x16, want func(x []int8) []int32) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToInt32(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int32x16, want func(x []int16) []int32) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testInt32x16ConvertToInt32(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int32x16, want func(x []int32) []int32) { n := 16 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x16Slice(x) - g := make([]int32, n) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt64x8ConvertToInt32(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int32x8, want func(x []int64) []int32) { n := 8 t.Helper() forSlice(t, int64s, n, func(x []int64) bool { t.Helper() a := archsimd.LoadInt64x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint8x64ConvertToInt32(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int32x16, want func(x []uint8) []int32) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToInt32(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int32x16, want func(x []uint16) []int32) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint32x16ConvertToInt32(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int32x16, want func(x []uint32) []int32) { n := 16 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x16Slice(x) - g := make([]int32, n) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint64x8ConvertToInt32(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int32x8, want func(x []uint64) []int32) { n := 8 t.Helper() forSlice(t, uint64s, n, func(x []uint64) bool { t.Helper() a := archsimd.LoadUint64x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testFloat32x16ConvertToInt32(t *testing.T, f func(x archsimd.Float32x16) archsimd.Int32x16, want func(x []float32) []int32) { n := 16 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x16Slice(x) - g := make([]int32, n) + g := make([]int32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testFloat64x8ConvertToInt32(t *testing.T, f func(x archsimd.Float64x8) archsimd.Int32x8, want func(x []float64) []int32) { n := 8 t.Helper() forSlice(t, float64s, n, func(x []float64) bool { t.Helper() a := archsimd.LoadFloat64x8Slice(x) - g := make([]int32, n) + g := make([]int32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testInt8x16ConvertToUint32(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint32x16, want func(x []int8) []uint32) { n := 16 t.Helper() forSlice(t, int8s, n, func(x []int8) bool { t.Helper() a := archsimd.LoadInt8x16Slice(x) - g := make([]uint32, n) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt16x8ConvertToUint32(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint32x8, want func(x []int16) []uint32) { n := 8 t.Helper() forSlice(t, int16s, n, func(x []int16) bool { t.Helper() a := archsimd.LoadInt16x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testInt32x4ConvertToUint32(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint32x4, want func(x []int32) []uint32) { n := 4 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x4Slice(x) - g := make([]uint32, n) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToUint32(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint32x4, want func(x []int64) []uint32) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]uint32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint8x16ConvertToUint32(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint32x16, want func(x []uint8) []uint32) { n := 16 t.Helper() forSlice(t, uint8s, n, func(x []uint8) bool { t.Helper() a := archsimd.LoadUint8x16Slice(x) - g := make([]uint32, n) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint16x8ConvertToUint32(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint32x8, want func(x []uint16) []uint32) { n := 8 t.Helper() forSlice(t, uint16s, n, func(x []uint16) bool { t.Helper() a := archsimd.LoadUint16x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint32x4ConvertToUint32(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint32x4, want func(x []uint32) []uint32) { n := 4 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x4Slice(x) - g := make([]uint32, n) + g := make([]uint32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint64x2ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint64x2ConvertToUint32(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint32x4, want func(x []uint64) []uint32) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testFloat32x4ConvertToUint32(t *testing.T, f func(x archsimd.Float32x4) archsimd.Uint32x4, want func(x []float32) []uint32) { n := 4 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x4Slice(x) - g := make([]uint32, n) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToUint32(t *testing.T, f func(x archsimd.Float64x2) archsimd.Uint32x4, want func(x []float64) []uint32) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt8x32ConvertToUint32(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint32x16, want func(x []int8) []uint32) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt16x16ConvertToUint32(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint32x16, want func(x []int16) []uint32) { n := 16 t.Helper() forSlice(t, int16s, n, func(x []int16) bool { t.Helper() a := archsimd.LoadInt16x16Slice(x) - g := make([]uint32, n) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testInt32x8ConvertToUint32(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint32x8, want func(x []int32) []uint32) { n := 8 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt64x4ConvertToUint32(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint32x4, want func(x []int64) []uint32) { n := 4 t.Helper() forSlice(t, int64s, n, func(x []int64) bool { t.Helper() a := archsimd.LoadInt64x4Slice(x) - g := make([]uint32, n) + g := make([]uint32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint8x32ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint8x32ConvertToUint32(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint32x16, want func(x []uint8) []uint32) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]uint32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint16x16ConvertToUint32(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint32x16, want func(x []uint16) []uint32) { n := 16 t.Helper() forSlice(t, uint16s, n, func(x []uint16) bool { t.Helper() a := archsimd.LoadUint16x16Slice(x) - g := make([]uint32, n) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint32x8ConvertToUint32(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint32x8, want func(x []uint32) []uint32) { n := 8 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint64x4ConvertToUint32(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint32x4, want func(x []uint64) []uint32) { n := 4 t.Helper() forSlice(t, uint64s, n, func(x []uint64) bool { t.Helper() a := archsimd.LoadUint64x4Slice(x) - g := make([]uint32, n) + g := make([]uint32, 4) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testFloat32x8ConvertToUint32(t *testing.T, f func(x archsimd.Float32x8) archsimd.Uint32x8, want func(x []float32) []uint32) { n := 8 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testFloat64x4ConvertToUint32(t *testing.T, f func(x archsimd.Float64x4) archsimd.Uint32x4, want func(x []float64) []uint32) { n := 4 t.Helper() forSlice(t, float64s, n, func(x []float64) bool { t.Helper() a := archsimd.LoadFloat64x4Slice(x) - g := make([]uint32, n) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt8x64ConvertToUint32(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint32x16, want func(x []int8) []uint32) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt16x32ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToUint32(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint32x16, want func(x []int16) []uint32) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]uint32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testInt32x16ConvertToUint32(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint32x16, want func(x []int32) []uint32) { n := 16 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x16Slice(x) - g := make([]uint32, n) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testInt64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testInt64x8ConvertToUint32(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint32x8, want func(x []int64) []uint32) { n := 8 t.Helper() forSlice(t, int64s, n, func(x []int64) bool { t.Helper() a := archsimd.LoadInt64x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint8x64ConvertToUint32(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint32x16, want func(x []uint8) []uint32) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]uint32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToUint32(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint32x16, want func(x []uint16) []uint32) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testUint32x16ConvertToUint32(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint32x16, want func(x []uint32) []uint32) { n := 16 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x16Slice(x) - g := make([]uint32, n) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testUint64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testUint64x8ConvertToUint32(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint32x8, want func(x []uint64) []uint32) { n := 8 t.Helper() forSlice(t, uint64s, n, func(x []uint64) bool { t.Helper() a := archsimd.LoadUint64x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
func testFloat32x16ConvertToUint32(t *testing.T, f func(x archsimd.Float32x16) archsimd.Uint32x16, want func(x []float32) []uint32) { n := 16 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x16Slice(x) - g := make([]uint32, n) + g := make([]uint32, 16) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. +// testFloat64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). func testFloat64x8ConvertToUint32(t *testing.T, f func(x archsimd.Float64x8) archsimd.Uint32x8, want func(x []float64) []uint32) { n := 8 t.Helper() forSlice(t, float64s, n, func(x []float64) bool { t.Helper() a := archsimd.LoadFloat64x8Slice(x) - g := make([]uint32, n) + g := make([]uint32, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt8x16ConvertToUint16(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint16x16, want func(x []int8) []uint16) { +// testInt8x16ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToInt64(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int64x8, want func(x []int8) []int64) { n := 16 t.Helper() forSlice(t, int8s, n, func(x []int8) bool { t.Helper() a := archsimd.LoadInt8x16Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt16x8ConvertToUint16(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint16x8, want func(x []int16) []uint16) { +// testInt16x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToInt64(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int64x8, want func(x []int16) []int64) { n := 8 t.Helper() forSlice(t, int16s, n, func(x []int16) bool { t.Helper() a := archsimd.LoadInt16x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. 
-func testUint8x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint16x16, want func(x []uint8) []uint16) { +// testInt32x4ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt32x4ConvertToInt64(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int64x4, want func(x []int32) []int64) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToInt64(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int64x2, want func(x []int64) []int64) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint8x16ConvertToInt64(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int64x8, want func(x []uint8) []int64) { n := 16 t.Helper() forSlice(t, uint8s, n, func(x []uint8) bool { t.Helper() a := archsimd.LoadUint8x16Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testUint16x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint16x8, want func(x []uint16) []uint16) { +// testUint16x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToInt64(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int64x8, want func(x []uint16) []int64) { n := 8 t.Helper() forSlice(t, uint16s, n, func(x []uint16) bool { t.Helper() a := archsimd.LoadUint16x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt8x32ConvertToUint16(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint16x32, want func(x []int8) []uint16) { +// testUint32x4ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToInt64(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int64x4, want func(x []uint32) []int64) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint64x2ConvertToInt64(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int64x2, want func(x []uint64) []int64) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testFloat32x4ConvertToInt64(t *testing.T, f func(x archsimd.Float32x4) archsimd.Int64x4, want func(x []float32) []int64) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToInt64(t *testing.T, f func(x archsimd.Float64x2) archsimd.Int64x2, want func(x []float64) []int64) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt8x32ConvertToInt64(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int64x8, want func(x []int8) []int64) { n := 32 t.Helper() forSlice(t, int8s, n, func(x []int8) bool { t.Helper() a := archsimd.LoadInt8x32Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt16x16ConvertToUint16(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint16x16, want func(x []int16) []uint16) { +// testInt16x16ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToInt64(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int64x8, want func(x []int16) []int64) { n := 16 t.Helper() forSlice(t, int16s, n, func(x []int16) bool { t.Helper() a := archsimd.LoadInt16x16Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt32x8ConvertToUint16(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint16x8, want func(x []int32) []uint16) { +// testInt32x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt32x8ConvertToInt64(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int64x8, want func(x []int32) []int64) { n := 8 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testUint8x32ConvertToUint16(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint16x32, want func(x []uint8) []uint16) { +// testInt64x4ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToInt64(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int64x4, want func(x []int64) []int64) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint8x32ConvertToInt64(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int64x8, want func(x []uint8) []int64) { n := 32 t.Helper() forSlice(t, uint8s, n, func(x []uint8) bool { t.Helper() a := archsimd.LoadUint8x32Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testUint16x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint16x16, want func(x []uint16) []uint16) { +// testUint16x16ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToInt64(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int64x8, want func(x []uint16) []int64) { n := 16 t.Helper() forSlice(t, uint16s, n, func(x []uint16) bool { t.Helper() a := archsimd.LoadUint16x16Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testUint32x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint16x8, want func(x []uint32) []uint16) { +// testUint32x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToInt64(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int64x8, want func(x []uint32) []int64) { n := 8 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testFloat32x8ConvertToUint16(t *testing.T, f func(x archsimd.Float32x8) archsimd.Uint16x8, want func(x []float32) []uint16) { +// testUint64x4ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint64x4ConvertToInt64(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int64x4, want func(x []uint64) []int64) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testFloat32x8ConvertToInt64(t *testing.T, f func(x archsimd.Float32x8) archsimd.Int64x8, want func(x []float32) []int64) { n := 8 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt16x32ConvertToUint16(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint16x32, want func(x []int16) []uint16) { +// testFloat64x4ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToInt64(t *testing.T, f func(x archsimd.Float64x4) archsimd.Int64x4, want func(x []float64) []int64) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt8x64ConvertToInt64(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int64x8, want func(x []int8) []int64) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToInt64(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int64x8, want func(x []int16) []int64) { n := 32 t.Helper() forSlice(t, int16s, n, func(x []int16) bool { t.Helper() a := archsimd.LoadInt16x32Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt32x16ConvertToUint16(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint16x16, want func(x []int32) []uint16) { +// testInt32x16ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToInt64(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int64x8, want func(x []int32) []int64) { n := 16 t.Helper() forSlice(t, int32s, n, func(x []int32) bool { t.Helper() a := archsimd.LoadInt32x16Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testInt64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testInt64x8ConvertToUint16(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint16x8, want func(x []int64) []uint16) { +// testInt64x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToInt64(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int64x8, want func(x []int64) []int64) { n := 8 t.Helper() forSlice(t, int64s, n, func(x []int64) bool { t.Helper() a := archsimd.LoadInt64x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testUint16x32ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint16x32, want func(x []uint16) []uint16) { +// testUint8x64ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToInt64(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int64x8, want func(x []uint8) []int64) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToInt64(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int64x8, want func(x []uint16) []int64) { n := 32 t.Helper() forSlice(t, uint16s, n, func(x []uint16) bool { t.Helper() a := archsimd.LoadUint16x32Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testUint32x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint16x16, want func(x []uint32) []uint16) { +// testUint32x16ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint32x16ConvertToInt64(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int64x8, want func(x []uint32) []int64) { n := 16 t.Helper() forSlice(t, uint32s, n, func(x []uint32) bool { t.Helper() a := archsimd.LoadUint32x16Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testUint64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testUint64x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint16x8, want func(x []uint64) []uint16) { +// testUint64x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint64x8ConvertToInt64(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int64x8, want func(x []uint64) []int64) { n := 8 t.Helper() forSlice(t, uint64s, n, func(x []uint64) bool { t.Helper() a := archsimd.LoadUint64x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testFloat32x16ConvertToUint16(t *testing.T, f func(x archsimd.Float32x16) archsimd.Uint16x16, want func(x []float32) []uint16) { +// testFloat32x16ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToInt64(t *testing.T, f func(x archsimd.Float32x16) archsimd.Int64x8, want func(x []float32) []int64) { n := 16 t.Helper() forSlice(t, float32s, n, func(x []float32) bool { t.Helper() a := archsimd.LoadFloat32x16Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) }) } -// testFloat64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want -// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width. -func testFloat64x8ConvertToUint16(t *testing.T, f func(x archsimd.Float64x8) archsimd.Uint16x8, want func(x []float64) []uint16) { +// testFloat64x8ConvertToInt64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToInt64(t *testing.T, f func(x archsimd.Float64x8) archsimd.Int64x8, want func(x []float64) []int64) { n := 8 t.Helper() forSlice(t, float64s, n, func(x []float64) bool { t.Helper() a := archsimd.LoadFloat64x8Slice(x) - g := make([]uint16, n) + g := make([]int64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToUint64(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint64x8, want func(x []int8) []uint64) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToUint64(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint64x8, want func(x []int16) []uint64) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x4ConvertToUint64(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint64x4, want func(x []int32) []uint64) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]uint64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToUint64(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint64x2, want func(x []int64) []uint64) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]uint64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x16ConvertToUint64(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint64x8, want func(x []uint8) []uint64) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToUint64(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint64x8, want func(x []uint16) []uint64) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToUint64(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint64x4, want func(x []uint32) []uint64) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]uint64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x2ConvertToUint64(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint64x2, want func(x []uint64) []uint64) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]uint64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x4ConvertToUint64(t *testing.T, f func(x archsimd.Float32x4) archsimd.Uint64x4, want func(x []float32) []uint64) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]uint64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToUint64(t *testing.T, f func(x archsimd.Float64x2) archsimd.Uint64x2, want func(x []float64) []uint64) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]uint64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x32ConvertToUint64(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint64x8, want func(x []int8) []uint64) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToUint64(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint64x8, want func(x []int16) []uint64) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x8ConvertToUint64(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint64x8, want func(x []int32) []uint64) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToUint64(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint64x4, want func(x []int64) []uint64) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]uint64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x32ConvertToUint64(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint64x8, want func(x []uint8) []uint64) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToUint64(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint64x8, want func(x []uint16) []uint64) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToUint64(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint64x8, want func(x []uint32) []uint64) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x4ConvertToUint64(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint64x4, want func(x []uint64) []uint64) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]uint64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x8ConvertToUint64(t *testing.T, f func(x archsimd.Float32x8) archsimd.Uint64x8, want func(x []float32) []uint64) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToUint64(t *testing.T, f func(x archsimd.Float64x4) archsimd.Uint64x4, want func(x []float64) []uint64) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]uint64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x64ConvertToUint64(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint64x8, want func(x []int8) []uint64) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToUint64(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint64x8, want func(x []int16) []uint64) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToUint64(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint64x8, want func(x []int32) []uint64) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToUint64(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint64x8, want func(x []int64) []uint64) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToUint64(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint64x8, want func(x []uint8) []uint64) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToUint64(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint64x8, want func(x []uint16) []uint64) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x16ConvertToUint64(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint64x8, want func(x []uint32) []uint64) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x8ConvertToUint64(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint64x8, want func(x []uint64) []uint64) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToUint64(t *testing.T, f func(x archsimd.Float32x16) archsimd.Uint64x8, want func(x []float32) []uint64) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToUint64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToUint64(t *testing.T, f func(x archsimd.Float64x8) archsimd.Uint64x8, want func(x []float64) []uint64) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]uint64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToFloat32(t *testing.T, f func(x archsimd.Int8x16) archsimd.Float32x16, want func(x []int8) []float32) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToFloat32(t *testing.T, f func(x archsimd.Int16x8) archsimd.Float32x8, want func(x []int16) []float32) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x4ConvertToFloat32(t *testing.T, f func(x archsimd.Int32x4) archsimd.Float32x4, want func(x []int32) []float32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToFloat32(t *testing.T, f func(x archsimd.Int64x2) archsimd.Float32x4, want func(x []int64) []float32) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x16ConvertToFloat32(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Float32x16, want func(x []uint8) []float32) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToFloat32(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Float32x8, want func(x []uint16) []float32) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToFloat32(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Float32x4, want func(x []uint32) []float32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x2ConvertToFloat32(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Float32x4, want func(x []uint64) []float32) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x4ConvertToFloat32(t *testing.T, f func(x archsimd.Float32x4) archsimd.Float32x4, want func(x []float32) []float32) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToFloat32(t *testing.T, f func(x archsimd.Float64x2) archsimd.Float32x4, want func(x []float64) []float32) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x32ConvertToFloat32(t *testing.T, f func(x archsimd.Int8x32) archsimd.Float32x16, want func(x []int8) []float32) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToFloat32(t *testing.T, f func(x archsimd.Int16x16) archsimd.Float32x16, want func(x []int16) []float32) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x8ConvertToFloat32(t *testing.T, f func(x archsimd.Int32x8) archsimd.Float32x8, want func(x []int32) []float32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToFloat32(t *testing.T, f func(x archsimd.Int64x4) archsimd.Float32x4, want func(x []int64) []float32) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x32ConvertToFloat32(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Float32x16, want func(x []uint8) []float32) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToFloat32(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Float32x16, want func(x []uint16) []float32) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToFloat32(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Float32x8, want func(x []uint32) []float32) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x4ConvertToFloat32(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Float32x4, want func(x []uint64) []float32) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x8ConvertToFloat32(t *testing.T, f func(x archsimd.Float32x8) archsimd.Float32x8, want func(x []float32) []float32) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToFloat32(t *testing.T, f func(x archsimd.Float64x4) archsimd.Float32x4, want func(x []float64) []float32) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]float32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x64ConvertToFloat32(t *testing.T, f func(x archsimd.Int8x64) archsimd.Float32x16, want func(x []int8) []float32) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToFloat32(t *testing.T, f func(x archsimd.Int16x32) archsimd.Float32x16, want func(x []int16) []float32) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToFloat32(t *testing.T, f func(x archsimd.Int32x16) archsimd.Float32x16, want func(x []int32) []float32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToFloat32(t *testing.T, f func(x archsimd.Int64x8) archsimd.Float32x8, want func(x []int64) []float32) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToFloat32(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Float32x16, want func(x []uint8) []float32) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToFloat32(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Float32x16, want func(x []uint16) []float32) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x16ConvertToFloat32(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Float32x16, want func(x []uint32) []float32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x8ConvertToFloat32(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Float32x8, want func(x []uint64) []float32) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToFloat32(t *testing.T, f func(x archsimd.Float32x16) archsimd.Float32x16, want func(x []float32) []float32) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]float32, 16) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToFloat32 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToFloat32(t *testing.T, f func(x archsimd.Float64x8) archsimd.Float32x8, want func(x []float64) []float32) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]float32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x16ConvertToFloat64(t *testing.T, f func(x archsimd.Int8x16) archsimd.Float64x8, want func(x []int8) []float64) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x8ConvertToFloat64(t *testing.T, f func(x archsimd.Int16x8) archsimd.Float64x8, want func(x []int16) []float64) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x4ConvertToFloat64(t *testing.T, f func(x archsimd.Int32x4) archsimd.Float64x4, want func(x []int32) []float64) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]float64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x2ConvertToFloat64(t *testing.T, f func(x archsimd.Int64x2) archsimd.Float64x2, want func(x []int64) []float64) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]float64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x16ConvertToFloat64(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Float64x8, want func(x []uint8) []float64) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x8ConvertToFloat64(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Float64x8, want func(x []uint16) []float64) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x4ConvertToFloat64(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Float64x4, want func(x []uint32) []float64) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]float64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x2ConvertToFloat64(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Float64x2, want func(x []uint64) []float64) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]float64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x4ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x4ConvertToFloat64(t *testing.T, f func(x archsimd.Float32x4) archsimd.Float64x4, want func(x []float32) []float64) { + n := 4 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x4Slice(x) + g := make([]float64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x2ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x2ConvertToFloat64(t *testing.T, f func(x archsimd.Float64x2) archsimd.Float64x2, want func(x []float64) []float64) { + n := 2 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x2Slice(x) + g := make([]float64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x32ConvertToFloat64(t *testing.T, f func(x archsimd.Int8x32) archsimd.Float64x8, want func(x []int8) []float64) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x16ConvertToFloat64(t *testing.T, f func(x archsimd.Int16x16) archsimd.Float64x8, want func(x []int16) []float64) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x8ConvertToFloat64(t *testing.T, f func(x archsimd.Int32x8) archsimd.Float64x8, want func(x []int32) []float64) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x4ConvertToFloat64(t *testing.T, f func(x archsimd.Int64x4) archsimd.Float64x4, want func(x []int64) []float64) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]float64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x32ConvertToFloat64(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Float64x8, want func(x []uint8) []float64) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x16ConvertToFloat64(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Float64x8, want func(x []uint16) []float64) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x8ConvertToFloat64(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Float64x8, want func(x []uint32) []float64) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x4ConvertToFloat64(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Float64x4, want func(x []uint64) []float64) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]float64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x8ConvertToFloat64(t *testing.T, f func(x archsimd.Float32x8) archsimd.Float64x8, want func(x []float32) []float64) { + n := 8 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x4ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x4ConvertToFloat64(t *testing.T, f func(x archsimd.Float64x4) archsimd.Float64x4, want func(x []float64) []float64) { + n := 4 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x4Slice(x) + g := make([]float64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt8x64ConvertToFloat64(t *testing.T, f func(x archsimd.Int8x64) archsimd.Float64x8, want func(x []int8) []float64) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt16x32ConvertToFloat64(t *testing.T, f func(x archsimd.Int16x32) archsimd.Float64x8, want func(x []int16) []float64) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testInt32x16ConvertToFloat64(t *testing.T, f func(x archsimd.Int32x16) archsimd.Float64x8, want func(x []int32) []float64) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testInt64x8ConvertToFloat64(t *testing.T, f func(x archsimd.Int64x8) archsimd.Float64x8, want func(x []int64) []float64) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint8x64ConvertToFloat64(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Float64x8, want func(x []uint8) []float64) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. 
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint16x32ConvertToFloat64(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Float64x8, want func(x []uint16) []float64) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testUint32x16ConvertToFloat64(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Float64x8, want func(x []uint32) []float64) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). 
+func testUint64x8ConvertToFloat64(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Float64x8, want func(x []uint64) []float64) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat32x16ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat32x16ConvertToFloat64(t *testing.T, f func(x archsimd.Float32x16) archsimd.Float64x8, want func(x []float32) []float64) { + n := 16 + t.Helper() + forSlice(t, float32s, n, func(x []float32) bool { + t.Helper() + a := archsimd.LoadFloat32x16Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testFloat64x8ConvertToFloat64 tests the simd conversion method f against the expected behavior generated by want. +// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width, +// (extended to at least 128 bits, or truncated to at most 512 bits). +func testFloat64x8ConvertToFloat64(t *testing.T, f func(x archsimd.Float64x8) archsimd.Float64x8, want func(x []float64) []float64) { + n := 8 + t.Helper() + forSlice(t, float64s, n, func(x []float64) bool { + t.Helper() + a := archsimd.LoadFloat64x8Slice(x) + g := make([]float64, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. 
+// This converts only the low 2 elements. +func testInt8x16ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int64x2, want func(x []int8) []int64) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt16x8ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int64x2, want func(x []int16) []int64) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt32x4ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int64x2, want func(x []int32) []int64) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. 
+func testInt64x2ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int64x2, want func(x []int64) []int64) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint8x16ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int64x2, want func(x []uint8) []int64) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint16x8ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int64x2, want func(x []uint16) []int64) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. 
+func testUint32x4ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int64x2, want func(x []uint32) []int64) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint64x2ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int64x2, want func(x []uint64) []int64) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt8x32ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int64x2, want func(x []int8) []int64) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. 
+func testInt16x16ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int64x2, want func(x []int16) []int64) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt32x8ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int64x2, want func(x []int32) []int64) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt64x4ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int64x2, want func(x []int64) []int64) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. 
+func testUint8x32ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int64x2, want func(x []uint8) []int64) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint16x16ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int64x2, want func(x []uint16) []int64) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint32x8ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int64x2, want func(x []uint32) []int64) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. 
+func testUint64x4ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int64x2, want func(x []uint64) []int64) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt8x64ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int64x2, want func(x []int8) []int64) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt16x32ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int64x2, want func(x []int16) []int64) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. 
+func testInt32x16ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int64x2, want func(x []int32) []int64) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testInt64x8ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int64x2, want func(x []int64) []int64) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint8x64ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int64x2, want func(x []uint8) []int64) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. 
+func testUint16x32ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int64x2, want func(x []uint16) []int64) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint32x16ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int64x2, want func(x []uint32) []int64) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertLoToInt64x2 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 2 elements. +func testUint64x8ConvertLoToInt64x2(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int64x2, want func(x []uint64) []int64) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]int64, 2) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt8x16ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int64x4, want func(x []int8) []int64) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt16x8ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int64x4, want func(x []int16) []int64) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt32x4ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int64x4, want func(x []int32) []int64) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt64x2ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int64x4, want func(x []int64) []int64) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint8x16ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int64x4, want func(x []uint8) []int64) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint16x8ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int64x4, want func(x []uint16) []int64) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint32x4ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int64x4, want func(x []uint32) []int64) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint64x2ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int64x4, want func(x []uint64) []int64) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt8x32ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int64x4, want func(x []int8) []int64) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt16x16ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int64x4, want func(x []int16) []int64) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt32x8ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int64x4, want func(x []int32) []int64) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt64x4ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int64x4, want func(x []int64) []int64) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint8x32ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int64x4, want func(x []uint8) []int64) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint16x16ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int64x4, want func(x []uint16) []int64) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint32x8ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int64x4, want func(x []uint32) []int64) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]int64, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
func testUint64x4ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int64x4, want func(x []uint64) []int64) {
	n := 4
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x4Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x64ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt8x64ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int64x4, want func(x []int8) []int64) {
	n := 64
	t.Helper()
	forSlice(t, int8s, n, func(x []int8) bool {
		t.Helper()
		a := archsimd.LoadInt8x64Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt16x32ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt16x32ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int64x4, want func(x []int16) []int64) {
	n := 32
	t.Helper()
	forSlice(t, int16s, n, func(x []int16) bool {
		t.Helper()
		a := archsimd.LoadInt16x32Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt32x16ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt32x16ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int64x4, want func(x []int32) []int64) {
	n := 16
	t.Helper()
	forSlice(t, int32s, n, func(x []int32) bool {
		t.Helper()
		a := archsimd.LoadInt32x16Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt64x8ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt64x8ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int64x4, want func(x []int64) []int64) {
	n := 8
	t.Helper()
	forSlice(t, int64s, n, func(x []int64) bool {
		t.Helper()
		a := archsimd.LoadInt64x8Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint8x64ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint8x64ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int64x4, want func(x []uint8) []int64) {
	n := 64
	t.Helper()
	forSlice(t, uint8s, n, func(x []uint8) bool {
		t.Helper()
		a := archsimd.LoadUint8x64Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint16x32ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint16x32ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int64x4, want func(x []uint16) []int64) {
	n := 32
	t.Helper()
	forSlice(t, uint16s, n, func(x []uint16) bool {
		t.Helper()
		a := archsimd.LoadUint16x32Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint32x16ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint32x16ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int64x4, want func(x []uint32) []int64) {
	n := 16
	t.Helper()
	forSlice(t, uint32s, n, func(x []uint32) bool {
		t.Helper()
		a := archsimd.LoadUint32x16Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint64x8ConvertLoToInt64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint64x8ConvertLoToInt64x4(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int64x4, want func(x []uint64) []int64) {
	n := 8
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x8Slice(x)
		g := make([]int64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x16ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt8x16ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint64x2, want func(x []int8) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, int8s, n, func(x []int8) bool {
		t.Helper()
		a := archsimd.LoadInt8x16Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt16x8ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt16x8ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint64x2, want func(x []int16) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, int16s, n, func(x []int16) bool {
		t.Helper()
		a := archsimd.LoadInt16x8Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt32x4ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt32x4ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint64x2, want func(x []int32) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, int32s, n, func(x []int32) bool {
		t.Helper()
		a := archsimd.LoadInt32x4Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt64x2ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt64x2ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint64x2, want func(x []int64) []uint64) {
	n := 2
	t.Helper()
	forSlice(t, int64s, n, func(x []int64) bool {
		t.Helper()
		a := archsimd.LoadInt64x2Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint8x16ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint8x16ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint64x2, want func(x []uint8) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, uint8s, n, func(x []uint8) bool {
		t.Helper()
		a := archsimd.LoadUint8x16Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint16x8ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint16x8ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint64x2, want func(x []uint16) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, uint16s, n, func(x []uint16) bool {
		t.Helper()
		a := archsimd.LoadUint16x8Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint32x4ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint32x4ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint64x2, want func(x []uint32) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, uint32s, n, func(x []uint32) bool {
		t.Helper()
		a := archsimd.LoadUint32x4Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint64x2ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint64x2ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint64x2, want func(x []uint64) []uint64) {
	n := 2
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x2Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x32ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt8x32ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint64x2, want func(x []int8) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, int8s, n, func(x []int8) bool {
		t.Helper()
		a := archsimd.LoadInt8x32Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt16x16ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt16x16ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint64x2, want func(x []int16) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, int16s, n, func(x []int16) bool {
		t.Helper()
		a := archsimd.LoadInt16x16Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt32x8ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt32x8ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint64x2, want func(x []int32) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, int32s, n, func(x []int32) bool {
		t.Helper()
		a := archsimd.LoadInt32x8Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt64x4ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt64x4ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint64x2, want func(x []int64) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, int64s, n, func(x []int64) bool {
		t.Helper()
		a := archsimd.LoadInt64x4Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint8x32ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint8x32ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint64x2, want func(x []uint8) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, uint8s, n, func(x []uint8) bool {
		t.Helper()
		a := archsimd.LoadUint8x32Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint16x16ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint16x16ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint64x2, want func(x []uint16) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, uint16s, n, func(x []uint16) bool {
		t.Helper()
		a := archsimd.LoadUint16x16Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint32x8ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint32x8ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint64x2, want func(x []uint32) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, uint32s, n, func(x []uint32) bool {
		t.Helper()
		a := archsimd.LoadUint32x8Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint64x4ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint64x4ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint64x2, want func(x []uint64) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x4Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x64ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt8x64ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint64x2, want func(x []int8) []uint64) {
	n := 64
	t.Helper()
	forSlice(t, int8s, n, func(x []int8) bool {
		t.Helper()
		a := archsimd.LoadInt8x64Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt16x32ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt16x32ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint64x2, want func(x []int16) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, int16s, n, func(x []int16) bool {
		t.Helper()
		a := archsimd.LoadInt16x32Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt32x16ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt32x16ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint64x2, want func(x []int32) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, int32s, n, func(x []int32) bool {
		t.Helper()
		a := archsimd.LoadInt32x16Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt64x8ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testInt64x8ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint64x2, want func(x []int64) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, int64s, n, func(x []int64) bool {
		t.Helper()
		a := archsimd.LoadInt64x8Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint8x64ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint8x64ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint64x2, want func(x []uint8) []uint64) {
	n := 64
	t.Helper()
	forSlice(t, uint8s, n, func(x []uint8) bool {
		t.Helper()
		a := archsimd.LoadUint8x64Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint16x32ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint16x32ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint64x2, want func(x []uint16) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, uint16s, n, func(x []uint16) bool {
		t.Helper()
		a := archsimd.LoadUint16x32Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint32x16ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint32x16ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint64x2, want func(x []uint32) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, uint32s, n, func(x []uint32) bool {
		t.Helper()
		a := archsimd.LoadUint32x16Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint64x8ConvertLoToUint64x2 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 2 elements.
func testUint64x8ConvertLoToUint64x2(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint64x2, want func(x []uint64) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x8Slice(x)
		g := make([]uint64, 2)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x16ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt8x16ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint64x4, want func(x []int8) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, int8s, n, func(x []int8) bool {
		t.Helper()
		a := archsimd.LoadInt8x16Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt16x8ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt16x8ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint64x4, want func(x []int16) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, int16s, n, func(x []int16) bool {
		t.Helper()
		a := archsimd.LoadInt16x8Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt32x4ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt32x4ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint64x4, want func(x []int32) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, int32s, n, func(x []int32) bool {
		t.Helper()
		a := archsimd.LoadInt32x4Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt64x2ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt64x2ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint64x4, want func(x []int64) []uint64) {
	n := 2
	t.Helper()
	forSlice(t, int64s, n, func(x []int64) bool {
		t.Helper()
		a := archsimd.LoadInt64x2Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint8x16ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint8x16ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint64x4, want func(x []uint8) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, uint8s, n, func(x []uint8) bool {
		t.Helper()
		a := archsimd.LoadUint8x16Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint16x8ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint16x8ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint64x4, want func(x []uint16) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, uint16s, n, func(x []uint16) bool {
		t.Helper()
		a := archsimd.LoadUint16x8Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint32x4ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint32x4ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint64x4, want func(x []uint32) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, uint32s, n, func(x []uint32) bool {
		t.Helper()
		a := archsimd.LoadUint32x4Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint64x2ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint64x2ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint64x4, want func(x []uint64) []uint64) {
	n := 2
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x2Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x32ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt8x32ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint64x4, want func(x []int8) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, int8s, n, func(x []int8) bool {
		t.Helper()
		a := archsimd.LoadInt8x32Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt16x16ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt16x16ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint64x4, want func(x []int16) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, int16s, n, func(x []int16) bool {
		t.Helper()
		a := archsimd.LoadInt16x16Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt32x8ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt32x8ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint64x4, want func(x []int32) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, int32s, n, func(x []int32) bool {
		t.Helper()
		a := archsimd.LoadInt32x8Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt64x4ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt64x4ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint64x4, want func(x []int64) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, int64s, n, func(x []int64) bool {
		t.Helper()
		a := archsimd.LoadInt64x4Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint8x32ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint8x32ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint64x4, want func(x []uint8) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, uint8s, n, func(x []uint8) bool {
		t.Helper()
		a := archsimd.LoadUint8x32Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint16x16ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint16x16ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint64x4, want func(x []uint16) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, uint16s, n, func(x []uint16) bool {
		t.Helper()
		a := archsimd.LoadUint16x16Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint32x8ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint32x8ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint64x4, want func(x []uint32) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, uint32s, n, func(x []uint32) bool {
		t.Helper()
		a := archsimd.LoadUint32x8Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint64x4ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint64x4ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint64x4, want func(x []uint64) []uint64) {
	n := 4
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x4Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x64ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt8x64ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint64x4, want func(x []int8) []uint64) {
	n := 64
	t.Helper()
	forSlice(t, int8s, n, func(x []int8) bool {
		t.Helper()
		a := archsimd.LoadInt8x64Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt16x32ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt16x32ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint64x4, want func(x []int16) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, int16s, n, func(x []int16) bool {
		t.Helper()
		a := archsimd.LoadInt16x32Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt32x16ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt32x16ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint64x4, want func(x []int32) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, int32s, n, func(x []int32) bool {
		t.Helper()
		a := archsimd.LoadInt32x16Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt64x8ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testInt64x8ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint64x4, want func(x []int64) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, int64s, n, func(x []int64) bool {
		t.Helper()
		a := archsimd.LoadInt64x8Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint8x64ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint8x64ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint64x4, want func(x []uint8) []uint64) {
	n := 64
	t.Helper()
	forSlice(t, uint8s, n, func(x []uint8) bool {
		t.Helper()
		a := archsimd.LoadUint8x64Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint16x32ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint16x32ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint64x4, want func(x []uint16) []uint64) {
	n := 32
	t.Helper()
	forSlice(t, uint16s, n, func(x []uint16) bool {
		t.Helper()
		a := archsimd.LoadUint16x32Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint32x16ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint32x16ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint64x4, want func(x []uint32) []uint64) {
	n := 16
	t.Helper()
	forSlice(t, uint32s, n, func(x []uint32) bool {
		t.Helper()
		a := archsimd.LoadUint32x16Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testUint64x8ConvertLoToUint64x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
func testUint64x8ConvertLoToUint64x4(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint64x4, want func(x []uint64) []uint64) {
	n := 8
	t.Helper()
	forSlice(t, uint64s, n, func(x []uint64) bool {
		t.Helper()
		a := archsimd.LoadUint64x8Slice(x)
		g := make([]uint64, 4)
		f(a).StoreSlice(g)
		w := want(x)
		return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
	})
}

// testInt8x16ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want.
// This converts only the low 4 elements.
+func testInt8x16ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int32x4, want func(x []int8) []int32) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt16x8ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int32x4, want func(x []int16) []int32) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt32x4ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int32x4, want func(x []int32) []int32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt64x2ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int32x4, want func(x []int64) []int32) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint8x16ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int32x4, want func(x []uint8) []int32) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint16x8ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int32x4, want func(x []uint16) []int32) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint32x4ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int32x4, want func(x []uint32) []int32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint64x2ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int32x4, want func(x []uint64) []int32) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt8x32ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int32x4, want func(x []int8) []int32) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt16x16ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int32x4, want func(x []int16) []int32) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt32x8ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int32x4, want func(x []int32) []int32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt64x4ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int32x4, want func(x []int64) []int32) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint8x32ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int32x4, want func(x []uint8) []int32) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint16x16ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int32x4, want func(x []uint16) []int32) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint32x8ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int32x4, want func(x []uint32) []int32) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint64x4ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int32x4, want func(x []uint64) []int32) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt8x64ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int32x4, want func(x []int8) []int32) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt16x32ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int32x4, want func(x []int16) []int32) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt32x16ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int32x4, want func(x []int32) []int32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt64x8ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int32x4, want func(x []int64) []int32) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint8x64ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int32x4, want func(x []uint8) []int32) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint16x32ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int32x4, want func(x []uint16) []int32) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint32x16ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int32x4, want func(x []uint32) []int32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertLoToInt32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint64x8ConvertLoToInt32x4(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int32x4, want func(x []uint64) []int32) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]int32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt8x16ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int32x8, want func(x []int8) []int32) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x8ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int32x8, want func(x []int16) []int32) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x4ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int32x8, want func(x []int32) []int32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt64x2ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int32x8, want func(x []int64) []int32) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x16ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int32x8, want func(x []uint8) []int32) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x8ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int32x8, want func(x []uint16) []int32) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint32x4ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int32x8, want func(x []uint32) []int32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint64x2ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int32x8, want func(x []uint64) []int32) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x32ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int32x8, want func(x []int8) []int32) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt16x16ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int32x8, want func(x []int16) []int32) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x8ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int32x8, want func(x []int32) []int32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x4ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int32x8, want func(x []int64) []int32) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint8x32ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int32x8, want func(x []uint8) []int32) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x16ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int32x8, want func(x []uint16) []int32) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x8ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int32x8, want func(x []uint32) []int32) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint64x4ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int32x8, want func(x []uint64) []int32) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x64ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int32x8, want func(x []int8) []int32) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x32ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int32x8, want func(x []int16) []int32) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt32x16ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int32x8, want func(x []int32) []int32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x8ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int32x8, want func(x []int64) []int32) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x64ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int32x8, want func(x []uint8) []int32) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint16x32ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int32x8, want func(x []uint16) []int32) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x16ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int32x8, want func(x []uint32) []int32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertLoToInt32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint64x8ConvertLoToInt32x8(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int32x8, want func(x []uint64) []int32) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]int32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt8x16ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint32x4, want func(x []int8) []uint32) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt16x8ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint32x4, want func(x []int16) []uint32) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt32x4ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint32x4, want func(x []int32) []uint32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt64x2ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint32x4, want func(x []int64) []uint32) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint8x16ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint32x4, want func(x []uint8) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint16x8ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint32x4, want func(x []uint16) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint32x4ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint32x4, want func(x []uint32) []uint32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint64x2ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint32x4, want func(x []uint64) []uint32) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt8x32ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint32x4, want func(x []int8) []uint32) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt16x16ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint32x4, want func(x []int16) []uint32) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt32x8ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint32x4, want func(x []int32) []uint32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt64x4ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint32x4, want func(x []int64) []uint32) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint8x32ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint32x4, want func(x []uint8) []uint32) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint16x16ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint32x4, want func(x []uint16) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint32x8ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint32x4, want func(x []uint32) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint64x4ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint32x4, want func(x []uint64) []uint32) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt8x64ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint32x4, want func(x []int8) []uint32) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt16x32ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint32x4, want func(x []int16) []uint32) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testInt32x16ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint32x4, want func(x []int32) []uint32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testInt64x8ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint32x4, want func(x []int64) []uint32) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint8x64ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint32x4, want func(x []uint8) []uint32) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. 
+func testUint16x32ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint32x4, want func(x []uint16) []uint32) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint32x16ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint32x4, want func(x []uint32) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertLoToUint32x4 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 4 elements. +func testUint64x8ConvertLoToUint32x4(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint32x4, want func(x []uint64) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]uint32, 4) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt8x16ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint32x8, want func(x []int8) []uint32) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x8ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint32x8, want func(x []int16) []uint32) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x4ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint32x8, want func(x []int32) []uint32) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt64x2ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint32x8, want func(x []int64) []uint32) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x16ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint32x8, want func(x []uint8) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x8ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint32x8, want func(x []uint16) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint32x4ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint32x8, want func(x []uint32) []uint32) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint64x2ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint32x8, want func(x []uint64) []uint32) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x32ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint32x8, want func(x []int8) []uint32) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt16x16ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint32x8, want func(x []int16) []uint32) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x8ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint32x8, want func(x []int32) []uint32) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x4ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint32x8, want func(x []int64) []uint32) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint8x32ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint32x8, want func(x []uint8) []uint32) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x16ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint32x8, want func(x []uint16) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x8ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint32x8, want func(x []uint32) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint64x4ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint32x8, want func(x []uint64) []uint32) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x64ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint32x8, want func(x []int8) []uint32) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x32ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint32x8, want func(x []int16) []uint32) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt32x16ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint32x8, want func(x []int32) []uint32) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x8ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint32x8, want func(x []int64) []uint32) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x64ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint32x8, want func(x []uint8) []uint32) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint16x32ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint32x8, want func(x []uint16) []uint32) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x16ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint32x8, want func(x []uint32) []uint32) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertLoToUint32x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint64x8ConvertLoToUint32x8(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint32x8, want func(x []uint64) []uint32) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]uint32, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt8x16ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int16x8, want func(x []int8) []int16) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x8ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int16x8, want func(x []int16) []int16) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x4ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int16x8, want func(x []int32) []int16) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt64x2ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int64x2) archsimd.Int16x8, want func(x []int64) []int16) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x16ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int16x8, want func(x []uint8) []int16) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x8ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int16x8, want func(x []uint16) []int16) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint32x4ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int16x8, want func(x []uint32) []int16) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint64x2ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Int16x8, want func(x []uint64) []int16) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x32ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int8x32) archsimd.Int16x8, want func(x []int8) []int16) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt16x16ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int16x8, want func(x []int16) []int16) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x8ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int16x8, want func(x []int32) []int16) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x4ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int16x8, want func(x []int64) []int16) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint8x32ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Int16x8, want func(x []uint8) []int16) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x16ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int16x8, want func(x []uint16) []int16) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x8ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int16x8, want func(x []uint32) []int16) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint64x4ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int16x8, want func(x []uint64) []int16) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x64ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int8x64) archsimd.Int16x8, want func(x []int8) []int16) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x32ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int16x32) archsimd.Int16x8, want func(x []int16) []int16) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt32x16ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int16x8, want func(x []int32) []int16) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x8ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int16x8, want func(x []int64) []int16) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x64ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Int16x8, want func(x []uint8) []int16) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint16x32ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Int16x8, want func(x []uint16) []int16) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x16ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int16x8, want func(x []uint32) []int16) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertLoToInt16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint64x8ConvertLoToInt16x8(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int16x8, want func(x []uint64) []int16) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]int16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x16ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt8x16ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint16x8, want func(x []int8) []uint16) { + n := 16 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x16Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x8ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x8ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint16x8, want func(x []int16) []uint16) { + n := 8 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x4ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x4ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint16x8, want func(x []int32) []uint16) { + n := 4 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x2ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt64x2ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int64x2) archsimd.Uint16x8, want func(x []int64) []uint16) { + n := 2 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x2Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x16ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x16ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint16x8, want func(x []uint8) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x16Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x8ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x8ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint16x8, want func(x []uint16) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x4ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint32x4ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint16x8, want func(x []uint32) []uint16) { + n := 4 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x2ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint64x2ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint64x2) archsimd.Uint16x8, want func(x []uint64) []uint16) { + n := 2 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x2Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x32ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x32ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint16x8, want func(x []int8) []uint16) { + n := 32 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x32Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x16ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt16x16ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint16x8, want func(x []int16) []uint16) { + n := 16 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x16Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x8ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt32x8ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint16x8, want func(x []int32) []uint16) { + n := 8 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x4ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x4ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint16x8, want func(x []int64) []uint16) { + n := 4 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x32ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint8x32ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint16x8, want func(x []uint8) []uint16) { + n := 32 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x32Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x16ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint16x16ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint16x8, want func(x []uint16) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x16Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x8ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x8ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint16x8, want func(x []uint32) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x4ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint64x4ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint16x8, want func(x []uint64) []uint16) { + n := 4 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x4Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt8x64ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt8x64ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int8x64) archsimd.Uint16x8, want func(x []int8) []uint16) { + n := 64 + t.Helper() + forSlice(t, int8s, n, func(x []int8) bool { + t.Helper() + a := archsimd.LoadInt8x64Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt16x32ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt16x32ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint16x8, want func(x []int16) []uint16) { + n := 32 + t.Helper() + forSlice(t, int16s, n, func(x []int16) bool { + t.Helper() + a := archsimd.LoadInt16x32Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt32x16ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testInt32x16ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint16x8, want func(x []int32) []uint16) { + n := 16 + t.Helper() + forSlice(t, int32s, n, func(x []int32) bool { + t.Helper() + a := archsimd.LoadInt32x16Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testInt64x8ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testInt64x8ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint16x8, want func(x []int64) []uint16) { + n := 8 + t.Helper() + forSlice(t, int64s, n, func(x []int64) bool { + t.Helper() + a := archsimd.LoadInt64x8Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint8x64ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint8x64ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint8x64) archsimd.Uint16x8, want func(x []uint8) []uint16) { + n := 64 + t.Helper() + forSlice(t, uint8s, n, func(x []uint8) bool { + t.Helper() + a := archsimd.LoadUint8x64Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint16x32ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint16x32ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint16x8, want func(x []uint16) []uint16) { + n := 32 + t.Helper() + forSlice(t, uint16s, n, func(x []uint16) bool { + t.Helper() + a := archsimd.LoadUint16x32Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint32x16ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. +func testUint32x16ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint16x8, want func(x []uint32) []uint16) { + n := 16 + t.Helper() + forSlice(t, uint32s, n, func(x []uint32) bool { + t.Helper() + a := archsimd.LoadUint32x16Slice(x) + g := make([]uint16, 8) + f(a).StoreSlice(g) + w := want(x) + return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) + }) +} + +// testUint64x8ConvertLoToUint16x8 tests the simd conversion method f against the expected behavior generated by want. +// This converts only the low 8 elements. 
+func testUint64x8ConvertLoToUint16x8(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint16x8, want func(x []uint64) []uint16) { + n := 8 + t.Helper() + forSlice(t, uint64s, n, func(x []uint64) bool { + t.Helper() + a := archsimd.LoadUint64x8Slice(x) + g := make([]uint16, 8) f(a).StoreSlice(g) w := want(x) return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) }) diff --git a/src/simd/archsimd/internal/simd_test/unary_test.go b/src/simd/archsimd/internal/simd_test/unary_test.go index 6b53669d78..097feb60ed 100644 --- a/src/simd/archsimd/internal/simd_test/unary_test.go +++ b/src/simd/archsimd/internal/simd_test/unary_test.go @@ -69,20 +69,26 @@ func TestSqrt(t *testing.T) { func TestNot(t *testing.T) { testInt8x16Unary(t, archsimd.Int8x16.Not, map1[int8](not)) - testInt8x32Unary(t, archsimd.Int8x32.Not, map1[int8](not)) testInt16x8Unary(t, archsimd.Int16x8.Not, map1[int16](not)) - testInt16x16Unary(t, archsimd.Int16x16.Not, map1[int16](not)) testInt32x4Unary(t, archsimd.Int32x4.Not, map1[int32](not)) - testInt32x8Unary(t, archsimd.Int32x8.Not, map1[int32](not)) + + if archsimd.X86.AVX2() { + testInt8x32Unary(t, archsimd.Int8x32.Not, map1[int8](not)) + testInt16x16Unary(t, archsimd.Int16x16.Not, map1[int16](not)) + testInt32x8Unary(t, archsimd.Int32x8.Not, map1[int32](not)) + } } func TestAbsolute(t *testing.T) { testInt8x16Unary(t, archsimd.Int8x16.Abs, map1[int8](abs)) - testInt8x32Unary(t, archsimd.Int8x32.Abs, map1[int8](abs)) testInt16x8Unary(t, archsimd.Int16x8.Abs, map1[int16](abs)) - testInt16x16Unary(t, archsimd.Int16x16.Abs, map1[int16](abs)) testInt32x4Unary(t, archsimd.Int32x4.Abs, map1[int32](abs)) - testInt32x8Unary(t, archsimd.Int32x8.Abs, map1[int32](abs)) + + if archsimd.X86.AVX2() { + testInt8x32Unary(t, archsimd.Int8x32.Abs, map1[int8](abs)) + testInt16x16Unary(t, archsimd.Int16x16.Abs, map1[int16](abs)) + testInt32x8Unary(t, archsimd.Int32x8.Abs, map1[int32](abs)) + } if archsimd.X86.AVX512() { testInt8x64Unary(t, 
archsimd.Int8x64.Abs, map1[int8](abs)) testInt16x32Unary(t, archsimd.Int16x32.Abs, map1[int16](abs)) @@ -110,28 +116,199 @@ func TestCeilScaledResidue(t *testing.T) { map1[float64](func(x float64) float64 { return x - math.Ceil(x) })) } -func TestToUint32(t *testing.T) { - if !archsimd.X86.AVX512() { - t.Skip("Needs AVX512") +func TestConvert(t *testing.T) { + testFloat64x2ConvertToFloat32(t, archsimd.Float64x2.ConvertToFloat32, map1n[float64](toFloat32, 4)) + testFloat64x4ConvertToFloat32(t, archsimd.Float64x4.ConvertToFloat32, map1[float64](toFloat32)) + testFloat32x4ConvertToFloat64(t, archsimd.Float32x4.ConvertToFloat64, map1[float32](toFloat64)) + + testFloat32x4ConvertToInt32(t, archsimd.Float32x4.ConvertToInt32, map1[float32](floatToInt32_x86)) + testFloat32x8ConvertToInt32(t, archsimd.Float32x8.ConvertToInt32, map1[float32](floatToInt32_x86)) + testFloat64x2ConvertToInt32(t, archsimd.Float64x2.ConvertToInt32, map1n[float64](floatToInt32_x86, 4)) + testFloat64x4ConvertToInt32(t, archsimd.Float64x4.ConvertToInt32, map1[float64](floatToInt32_x86)) + + testInt32x4ConvertToFloat32(t, archsimd.Int32x4.ConvertToFloat32, map1[int32](toFloat32)) + testInt32x8ConvertToFloat32(t, archsimd.Int32x8.ConvertToFloat32, map1[int32](toFloat32)) + testInt32x4ConvertToFloat64(t, archsimd.Int32x4.ConvertToFloat64, map1[int32](toFloat64)) + + if archsimd.X86.AVX512() { + testFloat32x8ConvertToFloat64(t, archsimd.Float32x8.ConvertToFloat64, map1[float32](toFloat64)) + testFloat64x8ConvertToFloat32(t, archsimd.Float64x8.ConvertToFloat32, map1[float64](toFloat32)) + + testFloat32x16ConvertToInt32(t, archsimd.Float32x16.ConvertToInt32, map1[float32](floatToInt32_x86)) + testFloat64x8ConvertToInt32(t, archsimd.Float64x8.ConvertToInt32, map1[float64](floatToInt32_x86)) + testFloat32x4ConvertToInt64(t, archsimd.Float32x4.ConvertToInt64, map1[float32](floatToInt64_x86)) + testFloat32x8ConvertToInt64(t, archsimd.Float32x8.ConvertToInt64, map1[float32](floatToInt64_x86)) + 
testFloat64x2ConvertToInt64(t, archsimd.Float64x2.ConvertToInt64, map1[float64](floatToInt64_x86)) + testFloat64x4ConvertToInt64(t, archsimd.Float64x4.ConvertToInt64, map1[float64](floatToInt64_x86)) + testFloat64x8ConvertToInt64(t, archsimd.Float64x8.ConvertToInt64, map1[float64](floatToInt64_x86)) + + testFloat32x4ConvertToUint32(t, archsimd.Float32x4.ConvertToUint32, map1[float32](floatToUint32_x86)) + testFloat32x8ConvertToUint32(t, archsimd.Float32x8.ConvertToUint32, map1[float32](floatToUint32_x86)) + testFloat32x16ConvertToUint32(t, archsimd.Float32x16.ConvertToUint32, map1[float32](floatToUint32_x86)) + testFloat64x2ConvertToUint32(t, archsimd.Float64x2.ConvertToUint32, map1n[float64](floatToUint32_x86, 4)) + testFloat64x4ConvertToUint32(t, archsimd.Float64x4.ConvertToUint32, map1[float64](floatToUint32_x86)) + testFloat64x8ConvertToUint32(t, archsimd.Float64x8.ConvertToUint32, map1[float64](floatToUint32_x86)) + testFloat32x4ConvertToUint64(t, archsimd.Float32x4.ConvertToUint64, map1[float32](floatToUint64_x86)) + testFloat32x8ConvertToUint64(t, archsimd.Float32x8.ConvertToUint64, map1[float32](floatToUint64_x86)) + testFloat64x2ConvertToUint64(t, archsimd.Float64x2.ConvertToUint64, map1[float64](floatToUint64_x86)) + testFloat64x4ConvertToUint64(t, archsimd.Float64x4.ConvertToUint64, map1[float64](floatToUint64_x86)) + testFloat64x8ConvertToUint64(t, archsimd.Float64x8.ConvertToUint64, map1[float64](floatToUint64_x86)) + + testInt32x16ConvertToFloat32(t, archsimd.Int32x16.ConvertToFloat32, map1[int32](toFloat32)) + testInt64x2ConvertToFloat32(t, archsimd.Int64x2.ConvertToFloat32, map1n[int64](toFloat32, 4)) + testInt64x4ConvertToFloat32(t, archsimd.Int64x4.ConvertToFloat32, map1[int64](toFloat32)) + testInt64x8ConvertToFloat32(t, archsimd.Int64x8.ConvertToFloat32, map1[int64](toFloat32)) + testInt64x2ConvertToFloat64(t, archsimd.Int64x2.ConvertToFloat64, map1[int64](toFloat64)) + testInt64x4ConvertToFloat64(t, archsimd.Int64x4.ConvertToFloat64, 
map1[int64](toFloat64)) + testInt64x8ConvertToFloat64(t, archsimd.Int64x8.ConvertToFloat64, map1[int64](toFloat64)) + + testUint32x4ConvertToFloat32(t, archsimd.Uint32x4.ConvertToFloat32, map1[uint32](toFloat32)) + testUint32x8ConvertToFloat32(t, archsimd.Uint32x8.ConvertToFloat32, map1[uint32](toFloat32)) + testUint32x16ConvertToFloat32(t, archsimd.Uint32x16.ConvertToFloat32, map1[uint32](toFloat32)) + testUint64x2ConvertToFloat32(t, archsimd.Uint64x2.ConvertToFloat32, map1n[uint64](toFloat32, 4)) + testUint64x4ConvertToFloat32(t, archsimd.Uint64x4.ConvertToFloat32, map1[uint64](toFloat32)) + testUint64x8ConvertToFloat32(t, archsimd.Uint64x8.ConvertToFloat32, map1[uint64](toFloat32)) + testUint32x4ConvertToFloat64(t, archsimd.Uint32x4.ConvertToFloat64, map1[uint32](toFloat64)) + testUint32x8ConvertToFloat64(t, archsimd.Uint32x8.ConvertToFloat64, map1[uint32](toFloat64)) + testUint64x2ConvertToFloat64(t, archsimd.Uint64x2.ConvertToFloat64, map1[uint64](toFloat64)) + testUint64x4ConvertToFloat64(t, archsimd.Uint64x4.ConvertToFloat64, map1[uint64](toFloat64)) + testUint64x8ConvertToFloat64(t, archsimd.Uint64x8.ConvertToFloat64, map1[uint64](toFloat64)) + } +} + +func TestExtend(t *testing.T) { + if archsimd.X86.AVX2() { + testInt8x16ConvertToInt16(t, archsimd.Int8x16.ExtendToInt16, map1[int8](toInt16)) + testInt16x8ConvertToInt32(t, archsimd.Int16x8.ExtendToInt32, map1[int16](toInt32)) + testInt32x4ConvertToInt64(t, archsimd.Int32x4.ExtendToInt64, map1[int32](toInt64)) + testUint8x16ConvertToUint16(t, archsimd.Uint8x16.ExtendToUint16, map1[uint8](toUint16)) + testUint16x8ConvertToUint32(t, archsimd.Uint16x8.ExtendToUint32, map1[uint16](toUint32)) + testUint32x4ConvertToUint64(t, archsimd.Uint32x4.ExtendToUint64, map1[uint32](toUint64)) + } + + if archsimd.X86.AVX512() { + testInt8x32ConvertToInt16(t, archsimd.Int8x32.ExtendToInt16, map1[int8](toInt16)) + testInt8x16ConvertToInt32(t, archsimd.Int8x16.ExtendToInt32, map1[int8](toInt32)) + testInt16x16ConvertToInt32(t, 
archsimd.Int16x16.ExtendToInt32, map1[int16](toInt32)) + testInt16x8ConvertToInt64(t, archsimd.Int16x8.ExtendToInt64, map1[int16](toInt64)) + testInt32x8ConvertToInt64(t, archsimd.Int32x8.ExtendToInt64, map1[int32](toInt64)) + testUint8x32ConvertToUint16(t, archsimd.Uint8x32.ExtendToUint16, map1[uint8](toUint16)) + testUint8x16ConvertToUint32(t, archsimd.Uint8x16.ExtendToUint32, map1[uint8](toUint32)) + testUint16x16ConvertToUint32(t, archsimd.Uint16x16.ExtendToUint32, map1[uint16](toUint32)) + testUint16x8ConvertToUint64(t, archsimd.Uint16x8.ExtendToUint64, map1[uint16](toUint64)) + testUint32x8ConvertToUint64(t, archsimd.Uint32x8.ExtendToUint64, map1[uint32](toUint64)) } - testFloat32x4ConvertToUint32(t, archsimd.Float32x4.ConvertToUint32, map1[float32](toUint32)) - testFloat32x8ConvertToUint32(t, archsimd.Float32x8.ConvertToUint32, map1[float32](toUint32)) - testFloat32x16ConvertToUint32(t, archsimd.Float32x16.ConvertToUint32, map1[float32](toUint32)) } -func TestToInt32(t *testing.T) { - testFloat32x4ConvertToInt32(t, archsimd.Float32x4.ConvertToInt32, map1[float32](toInt32)) - testFloat32x8ConvertToInt32(t, archsimd.Float32x8.ConvertToInt32, map1[float32](toInt32)) +func TestExtendLo(t *testing.T) { + testInt8x16ConvertLoToInt64x2(t, archsimd.Int8x16.ExtendLo2ToInt64, map1n[int8](toInt64, 2)) + testInt16x8ConvertLoToInt64x2(t, archsimd.Int16x8.ExtendLo2ToInt64, map1n[int16](toInt64, 2)) + testInt32x4ConvertLoToInt64x2(t, archsimd.Int32x4.ExtendLo2ToInt64, map1n[int32](toInt64, 2)) + testUint8x16ConvertLoToUint64x2(t, archsimd.Uint8x16.ExtendLo2ToUint64, map1n[uint8](toUint64, 2)) + testUint16x8ConvertLoToUint64x2(t, archsimd.Uint16x8.ExtendLo2ToUint64, map1n[uint16](toUint64, 2)) + testUint32x4ConvertLoToUint64x2(t, archsimd.Uint32x4.ExtendLo2ToUint64, map1n[uint32](toUint64, 2)) + testInt8x16ConvertLoToInt32x4(t, archsimd.Int8x16.ExtendLo4ToInt32, map1n[int8](toInt32, 4)) + testInt16x8ConvertLoToInt32x4(t, archsimd.Int16x8.ExtendLo4ToInt32, 
map1n[int16](toInt32, 4)) + testUint8x16ConvertLoToUint32x4(t, archsimd.Uint8x16.ExtendLo4ToUint32, map1n[uint8](toUint32, 4)) + testUint16x8ConvertLoToUint32x4(t, archsimd.Uint16x8.ExtendLo4ToUint32, map1n[uint16](toUint32, 4)) + testInt8x16ConvertLoToInt16x8(t, archsimd.Int8x16.ExtendLo8ToInt16, map1n[int8](toInt16, 8)) + testUint8x16ConvertLoToUint16x8(t, archsimd.Uint8x16.ExtendLo8ToUint16, map1n[uint8](toUint16, 8)) + + if archsimd.X86.AVX2() { + testInt8x16ConvertLoToInt64x4(t, archsimd.Int8x16.ExtendLo4ToInt64, map1n[int8](toInt64, 4)) + testInt16x8ConvertLoToInt64x4(t, archsimd.Int16x8.ExtendLo4ToInt64, map1n[int16](toInt64, 4)) + testUint8x16ConvertLoToUint64x4(t, archsimd.Uint8x16.ExtendLo4ToUint64, map1n[uint8](toUint64, 4)) + testUint16x8ConvertLoToUint64x4(t, archsimd.Uint16x8.ExtendLo4ToUint64, map1n[uint16](toUint64, 4)) + testInt8x16ConvertLoToInt32x8(t, archsimd.Int8x16.ExtendLo8ToInt32, map1n[int8](toInt32, 8)) + testUint8x16ConvertLoToUint32x8(t, archsimd.Uint8x16.ExtendLo8ToUint32, map1n[uint8](toUint32, 8)) + } + + if archsimd.X86.AVX512() { + testInt8x16ConvertToInt64(t, archsimd.Int8x16.ExtendLo8ToInt64, map1n[int8](toInt64, 8)) + testUint8x16ConvertToUint64(t, archsimd.Uint8x16.ExtendLo8ToUint64, map1n[uint8](toUint64, 8)) + } } -func TestConverts(t *testing.T) { - testUint8x16ConvertToUint16(t, archsimd.Uint8x16.ExtendToUint16, map1[uint8](toUint16)) - testUint16x8ConvertToUint32(t, archsimd.Uint16x8.ExtendToUint32, map1[uint16](toUint32)) +func TestTruncate(t *testing.T) { + if archsimd.X86.AVX512() { + testInt16x8ConvertToInt8(t, archsimd.Int16x8.TruncateToInt8, map1n[int16](toInt8, 16)) + testInt16x16ConvertToInt8(t, archsimd.Int16x16.TruncateToInt8, map1[int16](toInt8)) + testInt16x32ConvertToInt8(t, archsimd.Int16x32.TruncateToInt8, map1[int16](toInt8)) + testInt32x4ConvertToInt8(t, archsimd.Int32x4.TruncateToInt8, map1n[int32](toInt8, 16)) + testInt32x8ConvertToInt8(t, archsimd.Int32x8.TruncateToInt8, map1n[int32](toInt8, 16)) + 
testInt32x16ConvertToInt8(t, archsimd.Int32x16.TruncateToInt8, map1[int32](toInt8)) + testInt64x2ConvertToInt8(t, archsimd.Int64x2.TruncateToInt8, map1n[int64](toInt8, 16)) + testInt64x4ConvertToInt8(t, archsimd.Int64x4.TruncateToInt8, map1n[int64](toInt8, 16)) + testInt64x8ConvertToInt8(t, archsimd.Int64x8.TruncateToInt8, map1n[int64](toInt8, 16)) + testInt32x4ConvertToInt16(t, archsimd.Int32x4.TruncateToInt16, map1n[int32](toInt16, 8)) + testInt32x8ConvertToInt16(t, archsimd.Int32x8.TruncateToInt16, map1[int32](toInt16)) + testInt32x16ConvertToInt16(t, archsimd.Int32x16.TruncateToInt16, map1[int32](toInt16)) + testInt64x2ConvertToInt16(t, archsimd.Int64x2.TruncateToInt16, map1n[int64](toInt16, 8)) + testInt64x4ConvertToInt16(t, archsimd.Int64x4.TruncateToInt16, map1n[int64](toInt16, 8)) + testInt64x8ConvertToInt16(t, archsimd.Int64x8.TruncateToInt16, map1[int64](toInt16)) + testInt64x2ConvertToInt32(t, archsimd.Int64x2.TruncateToInt32, map1n[int64](toInt32, 4)) + testInt64x4ConvertToInt32(t, archsimd.Int64x4.TruncateToInt32, map1[int64](toInt32)) + testInt64x8ConvertToInt32(t, archsimd.Int64x8.TruncateToInt32, map1[int64](toInt32)) + + testUint16x8ConvertToUint8(t, archsimd.Uint16x8.TruncateToUint8, map1n[uint16](toUint8, 16)) + testUint16x16ConvertToUint8(t, archsimd.Uint16x16.TruncateToUint8, map1[uint16](toUint8)) + testUint16x32ConvertToUint8(t, archsimd.Uint16x32.TruncateToUint8, map1[uint16](toUint8)) + testUint32x4ConvertToUint8(t, archsimd.Uint32x4.TruncateToUint8, map1n[uint32](toUint8, 16)) + testUint32x8ConvertToUint8(t, archsimd.Uint32x8.TruncateToUint8, map1n[uint32](toUint8, 16)) + testUint32x16ConvertToUint8(t, archsimd.Uint32x16.TruncateToUint8, map1[uint32](toUint8)) + testUint64x2ConvertToUint8(t, archsimd.Uint64x2.TruncateToUint8, map1n[uint64](toUint8, 16)) + testUint64x4ConvertToUint8(t, archsimd.Uint64x4.TruncateToUint8, map1n[uint64](toUint8, 16)) + testUint64x8ConvertToUint8(t, archsimd.Uint64x8.TruncateToUint8, map1n[uint64](toUint8, 16)) 
+ testUint32x4ConvertToUint16(t, archsimd.Uint32x4.TruncateToUint16, map1n[uint32](toUint16, 8)) + testUint32x8ConvertToUint16(t, archsimd.Uint32x8.TruncateToUint16, map1[uint32](toUint16)) + testUint32x16ConvertToUint16(t, archsimd.Uint32x16.TruncateToUint16, map1[uint32](toUint16)) + testUint64x2ConvertToUint16(t, archsimd.Uint64x2.TruncateToUint16, map1n[uint64](toUint16, 8)) + testUint64x4ConvertToUint16(t, archsimd.Uint64x4.TruncateToUint16, map1n[uint64](toUint16, 8)) + testUint64x8ConvertToUint16(t, archsimd.Uint64x8.TruncateToUint16, map1[uint64](toUint16)) + testUint64x2ConvertToUint32(t, archsimd.Uint64x2.TruncateToUint32, map1n[uint64](toUint32, 4)) + testUint64x4ConvertToUint32(t, archsimd.Uint64x4.TruncateToUint32, map1[uint64](toUint32)) + testUint64x8ConvertToUint32(t, archsimd.Uint64x8.TruncateToUint32, map1[uint64](toUint32)) + } } -func TestConvertsAVX512(t *testing.T) { - if !archsimd.X86.AVX512() { - t.Skip("Needs AVX512") +func TestSaturate(t *testing.T) { + if archsimd.X86.AVX512() { + testInt16x8ConvertToInt8(t, archsimd.Int16x8.SaturateToInt8, map1n[int16](satToInt8, 16)) + testInt16x16ConvertToInt8(t, archsimd.Int16x16.SaturateToInt8, map1[int16](satToInt8)) + testInt16x32ConvertToInt8(t, archsimd.Int16x32.SaturateToInt8, map1[int16](satToInt8)) + testInt32x4ConvertToInt8(t, archsimd.Int32x4.SaturateToInt8, map1n[int32](satToInt8, 16)) + testInt32x8ConvertToInt8(t, archsimd.Int32x8.SaturateToInt8, map1n[int32](satToInt8, 16)) + testInt32x16ConvertToInt8(t, archsimd.Int32x16.SaturateToInt8, map1[int32](satToInt8)) + testInt64x2ConvertToInt8(t, archsimd.Int64x2.SaturateToInt8, map1n[int64](satToInt8, 16)) + testInt64x4ConvertToInt8(t, archsimd.Int64x4.SaturateToInt8, map1n[int64](satToInt8, 16)) + testInt64x8ConvertToInt8(t, archsimd.Int64x8.SaturateToInt8, map1n[int64](satToInt8, 16)) + testInt32x4ConvertToInt16(t, archsimd.Int32x4.SaturateToInt16, map1n[int32](satToInt16, 8)) + testInt32x8ConvertToInt16(t, archsimd.Int32x8.SaturateToInt16, 
map1[int32](satToInt16)) + testInt32x16ConvertToInt16(t, archsimd.Int32x16.SaturateToInt16, map1[int32](satToInt16)) + testInt64x2ConvertToInt16(t, archsimd.Int64x2.SaturateToInt16, map1n[int64](satToInt16, 8)) + testInt64x4ConvertToInt16(t, archsimd.Int64x4.SaturateToInt16, map1n[int64](satToInt16, 8)) + testInt64x8ConvertToInt16(t, archsimd.Int64x8.SaturateToInt16, map1[int64](satToInt16)) + testInt64x2ConvertToInt32(t, archsimd.Int64x2.SaturateToInt32, map1n[int64](satToInt32, 4)) + testInt64x4ConvertToInt32(t, archsimd.Int64x4.SaturateToInt32, map1[int64](satToInt32)) + testInt64x8ConvertToInt32(t, archsimd.Int64x8.SaturateToInt32, map1[int64](satToInt32)) + + testUint16x8ConvertToUint8(t, archsimd.Uint16x8.SaturateToUint8, map1n[uint16](satToUint8, 16)) + testUint16x16ConvertToUint8(t, archsimd.Uint16x16.SaturateToUint8, map1[uint16](satToUint8)) + testUint16x32ConvertToUint8(t, archsimd.Uint16x32.SaturateToUint8, map1[uint16](satToUint8)) + testUint32x4ConvertToUint8(t, archsimd.Uint32x4.SaturateToUint8, map1n[uint32](satToUint8, 16)) + testUint32x8ConvertToUint8(t, archsimd.Uint32x8.SaturateToUint8, map1n[uint32](satToUint8, 16)) + testUint32x16ConvertToUint8(t, archsimd.Uint32x16.SaturateToUint8, map1[uint32](satToUint8)) + testUint64x2ConvertToUint8(t, archsimd.Uint64x2.SaturateToUint8, map1n[uint64](satToUint8, 16)) + testUint64x4ConvertToUint8(t, archsimd.Uint64x4.SaturateToUint8, map1n[uint64](satToUint8, 16)) + testUint64x8ConvertToUint8(t, archsimd.Uint64x8.SaturateToUint8, map1n[uint64](satToUint8, 16)) + testUint32x4ConvertToUint16(t, archsimd.Uint32x4.SaturateToUint16, map1n[uint32](satToUint16, 8)) + testUint32x8ConvertToUint16(t, archsimd.Uint32x8.SaturateToUint16, map1[uint32](satToUint16)) + testUint32x16ConvertToUint16(t, archsimd.Uint32x16.SaturateToUint16, map1[uint32](satToUint16)) + testUint64x2ConvertToUint16(t, archsimd.Uint64x2.SaturateToUint16, map1n[uint64](satToUint16, 8)) + testUint64x4ConvertToUint16(t, 
archsimd.Uint64x4.SaturateToUint16, map1n[uint64](satToUint16, 8)) + testUint64x8ConvertToUint16(t, archsimd.Uint64x8.SaturateToUint16, map1[uint64](satToUint16)) + testUint64x2ConvertToUint32(t, archsimd.Uint64x2.SaturateToUint32, map1n[uint64](satToUint32, 4)) + testUint64x4ConvertToUint32(t, archsimd.Uint64x4.SaturateToUint32, map1[uint64](satToUint32)) + testUint64x8ConvertToUint32(t, archsimd.Uint64x8.SaturateToUint32, map1[uint64](satToUint32)) } - testUint8x32ConvertToUint16(t, archsimd.Uint8x32.ExtendToUint16, map1[uint8](toUint16)) } diff --git a/src/simd/archsimd/maskmerge_gen_amd64.go b/src/simd/archsimd/maskmerge_gen_amd64.go index 5e9ea394b3..ad56521714 100644 --- a/src/simd/archsimd/maskmerge_gen_amd64.go +++ b/src/simd/archsimd/maskmerge_gen_amd64.go @@ -1,4 +1,4 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. //go:build goexperiment.simd @@ -286,7 +286,7 @@ func (x Int8x64) Masked(mask Mask8x64) Int8x64 { return im.And(x) } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Int8x64) Merge(y Int8x64, mask Mask8x64) Int8x64 { return y.blendMasked(x, mask) } @@ -297,7 +297,7 @@ func (x Int16x32) Masked(mask Mask16x32) Int16x32 { return im.And(x) } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Int16x32) Merge(y Int16x32, mask Mask16x32) Int16x32 { return y.blendMasked(x, mask) } @@ -308,7 +308,7 @@ func (x Int32x16) Masked(mask Mask32x16) Int32x16 { return im.And(x) } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. 
func (x Int32x16) Merge(y Int32x16, mask Mask32x16) Int32x16 { return y.blendMasked(x, mask) } @@ -319,7 +319,7 @@ func (x Int64x8) Masked(mask Mask64x8) Int64x8 { return im.And(x) } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Int64x8) Merge(y Int64x8, mask Mask64x8) Int64x8 { return y.blendMasked(x, mask) } @@ -330,7 +330,7 @@ func (x Uint8x64) Masked(mask Mask8x64) Uint8x64 { return x.AsInt8x64().And(im).AsUint8x64() } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Uint8x64) Merge(y Uint8x64, mask Mask8x64) Uint8x64 { ix := x.AsInt8x64() iy := y.AsInt8x64() @@ -343,7 +343,7 @@ func (x Uint16x32) Masked(mask Mask16x32) Uint16x32 { return x.AsInt16x32().And(im).AsUint16x32() } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Uint16x32) Merge(y Uint16x32, mask Mask16x32) Uint16x32 { ix := x.AsInt16x32() iy := y.AsInt16x32() @@ -356,7 +356,7 @@ func (x Uint32x16) Masked(mask Mask32x16) Uint32x16 { return x.AsInt32x16().And(im).AsUint32x16() } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Uint32x16) Merge(y Uint32x16, mask Mask32x16) Uint32x16 { ix := x.AsInt32x16() iy := y.AsInt32x16() @@ -369,7 +369,7 @@ func (x Uint64x8) Masked(mask Mask64x8) Uint64x8 { return x.AsInt64x8().And(im).AsUint64x8() } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. 
func (x Uint64x8) Merge(y Uint64x8, mask Mask64x8) Uint64x8 { ix := x.AsInt64x8() iy := y.AsInt64x8() @@ -382,7 +382,7 @@ func (x Float32x16) Masked(mask Mask32x16) Float32x16 { return x.AsInt32x16().And(im).AsFloat32x16() } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Float32x16) Merge(y Float32x16, mask Mask32x16) Float32x16 { ix := x.AsInt32x16() iy := y.AsInt32x16() @@ -395,7 +395,7 @@ func (x Float64x8) Masked(mask Mask64x8) Float64x8 { return x.AsInt64x8().And(im).AsFloat64x8() } -// Merge returns x but with elements set to y where m is false. +// Merge returns x but with elements set to y where mask is false. func (x Float64x8) Merge(y Float64x8, mask Mask64x8) Float64x8 { ix := x.AsInt64x8() iy := y.AsInt64x8() diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go index acd5719e6e..eba340c793 100644 --- a/src/simd/archsimd/ops_amd64.go +++ b/src/simd/archsimd/ops_amd64.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. //go:build goexperiment.simd @@ -349,90 +349,101 @@ func (x Uint64x8) Add(y Uint64x8) Uint64x8 /* AddPairs */ // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VHADDPS, CPU Feature: AVX func (x Float32x4) AddPairs(y Float32x4) Float32x4 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. 
-// -// Asm: VHADDPS, CPU Feature: AVX -func (x Float32x8) AddPairs(y Float32x8) Float32x8 - -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1]. // // Asm: VHADDPD, CPU Feature: AVX func (x Float64x2) AddPairs(y Float64x2) Float64x2 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VHADDPD, CPU Feature: AVX -func (x Float64x4) AddPairs(y Float64x4) Float64x4 +// Asm: VPHADDW, CPU Feature: AVX +func (x Int16x8) AddPairs(y Int16x8) Int16x8 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDW, CPU Feature: AVX -func (x Int16x8) AddPairs(y Int16x8) Int16x8 +// Asm: VPHADDD, CPU Feature: AVX +func (x Int32x4) AddPairs(y Int32x4) Int32x4 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDW, CPU Feature: AVX2 -func (x Int16x16) AddPairs(y Int16x16) Int16x16 +// Asm: VPHADDW, CPU Feature: AVX +func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] 
and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VPHADDD, CPU Feature: AVX -func (x Int32x4) AddPairs(y Int32x4) Int32x4 +func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +/* AddPairsGrouped */ + +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDD, CPU Feature: AVX2 -func (x Int32x8) AddPairs(y Int32x8) Int32x8 +// Asm: VHADDPS, CPU Feature: AVX +func (x Float32x8) AddPairsGrouped(y Float32x8) Float32x8 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1]. // -// Asm: VPHADDW, CPU Feature: AVX -func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8 +// Asm: VHADDPD, CPU Feature: AVX +func (x Float64x4) AddPairsGrouped(y Float64x4) Float64x4 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. 
// // Asm: VPHADDW, CPU Feature: AVX2 -func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16 +func (x Int16x16) AddPairsGrouped(y Int16x16) Int16x16 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDD, CPU Feature: AVX -func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4 +// Asm: VPHADDD, CPU Feature: AVX2 +func (x Int32x8) AddPairsGrouped(y Int32x8) Int32x8 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX2 +func (x Uint16x16) AddPairsGrouped(y Uint16x16) Uint16x16 + +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VPHADDD, CPU Feature: AVX2 -func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8 +func (x Uint32x8) AddPairsGrouped(y Uint32x8) Uint32x8 /* AddPairsSaturated */ // AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. 
// // Asm: VPHADDSW, CPU Feature: AVX func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8 -// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +/* AddPairsSaturatedGrouped */ + +// AddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VPHADDSW, CPU Feature: AVX2 -func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16 +func (x Int16x16) AddPairsSaturatedGrouped(y Int16x16) Int16x16 /* AddSaturated */ @@ -1275,7 +1286,9 @@ func (x Uint64x8) Compress(mask Mask64x8) Uint64x8 /* ConcatPermute */ // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1283,7 +1296,9 @@ func (x Uint64x8) Compress(mask Mask64x8) Uint64x8 func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1291,7 +1306,9 @@ func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16 func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1299,7 +1316,9 @@ func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16 func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1307,7 +1326,9 @@ func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32 func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1315,7 +1336,9 @@ func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32 func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1323,7 +1346,9 @@ func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64 func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1331,7 +1356,9 @@ func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64 func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1339,7 +1366,9 @@ func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8 func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1347,7 +1376,9 @@ func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8 func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1355,7 +1386,9 @@ func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16 func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1363,7 +1396,9 @@ func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16 func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1371,7 +1406,9 @@ func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32 func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1379,7 +1416,9 @@ func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32 func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1387,7 +1426,9 @@ func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4 func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1395,7 +1436,9 @@ func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4 func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1403,7 +1446,9 @@ func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4 func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1411,7 +1456,9 @@ func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8 func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1419,7 +1466,9 @@ func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8 func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1427,7 +1476,9 @@ func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8 func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1435,7 +1486,9 @@ func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16 func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1443,7 +1496,9 @@ func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16 func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1451,7 +1506,9 @@ func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16 func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1459,7 +1516,9 @@ func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2 func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1467,7 +1526,9 @@ func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2 func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1475,7 +1536,9 @@ func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2 func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1483,7 +1546,9 @@ func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4 func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1491,7 +1556,9 @@ func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4 func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1499,7 +1566,9 @@ func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4 func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. 
// @@ -1507,7 +1576,9 @@ func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8 func (x Int64x8) ConcatPermute(y Int64x8, indices Uint64x8) Int64x8 // ConcatPermute performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// +// result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. // @@ -1516,33 +1587,33 @@ func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8 /* ConcatShiftBytesRight */ -// ConcatShiftBytesRight concatenates x and y and shift it right by constant bytes. +// ConcatShiftBytesRight concatenates x and y and shift it right by shift bytes. // The result vector will be the lower half of the concatenated vector. // -// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPALIGNR, CPU Feature: AVX -func (x Uint8x16) ConcatShiftBytesRight(constant uint8, y Uint8x16) Uint8x16 +func (x Uint8x16) ConcatShiftBytesRight(shift uint8, y Uint8x16) Uint8x16 /* ConcatShiftBytesRightGrouped */ -// ConcatShiftBytesRightGrouped concatenates x and y and shift it right by constant bytes. +// ConcatShiftBytesRightGrouped concatenates x and y and shift it right by shift bytes. // The result vector will be the lower half of the concatenated vector. // This operation is performed grouped by each 16 byte. // -// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPALIGNR, CPU Feature: AVX2 -func (x Uint8x32) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x32) Uint8x32 +func (x Uint8x32) ConcatShiftBytesRightGrouped(shift uint8, y Uint8x32) Uint8x32 -// ConcatShiftBytesRightGrouped concatenates x and y and shift it right by constant bytes. +// ConcatShiftBytesRightGrouped concatenates x and y and shift it right by shift bytes. // The result vector will be the lower half of the concatenated vector. // This operation is performed grouped by each 16 byte. // -// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPALIGNR, CPU Feature: AVX512 -func (x Uint8x64) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x64) Uint8x64 +func (x Uint8x64) ConcatShiftBytesRightGrouped(shift uint8, y Uint8x64) Uint8x64 /* ConvertToFloat32 */ @@ -1872,38 +1943,38 @@ func (x Float64x8) ConvertToUint64() Uint64x8 /* CopySign */ -// CopySign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. +// CopySign returns the product of x with -1, 0, or 1, +// whichever constant is nearest to the value of y. // // Asm: VPSIGNB, CPU Feature: AVX func (x Int8x16) CopySign(y Int8x16) Int8x16 -// CopySign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. +// CopySign returns the product of x with -1, 0, or 1, +// whichever constant is nearest to the value of y. // // Asm: VPSIGNB, CPU Feature: AVX2 func (x Int8x32) CopySign(y Int8x32) Int8x32 -// CopySign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. +// CopySign returns the product of x with -1, 0, or 1, +// whichever constant is nearest to the value of y. 
// // Asm: VPSIGNW, CPU Feature: AVX func (x Int16x8) CopySign(y Int16x8) Int16x8 -// CopySign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. +// CopySign returns the product of x with -1, 0, or 1, +// whichever constant is nearest to the value of y. // // Asm: VPSIGNW, CPU Feature: AVX2 func (x Int16x16) CopySign(y Int16x16) Int16x16 -// CopySign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. +// CopySign returns the product of x with -1, 0, or 1, +// whichever constant is nearest to the value of y. // // Asm: VPSIGND, CPU Feature: AVX func (x Int32x4) CopySign(y Int32x4) Int32x4 -// CopySign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. +// CopySign returns the product of x with -1, 0, or 1, +// whichever constant is nearest to the value of y. // // Asm: VPSIGND, CPU Feature: AVX2 func (x Int32x8) CopySign(y Int32x8) Int32x8 @@ -1980,194 +2051,154 @@ func (x Uint8x32) DotProductPairsSaturated(y Int8x32) Int16x16 // Asm: VPMADDUBSW, CPU Feature: AVX512 func (x Uint8x64) DotProductPairsSaturated(y Int8x64) Int16x32 -/* DotProductQuadruple */ - -// DotProductQuadruple performs dot products on groups of 4 elements of x and y. -// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction. -// -// Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int8x16) DotProductQuadruple(y Uint8x16) Int32x4 - -// DotProductQuadruple performs dot products on groups of 4 elements of x and y. -// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction. -// -// Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int8x32) DotProductQuadruple(y Uint8x32) Int32x8 - -// DotProductQuadruple performs dot products on groups of 4 elements of x and y. 
-// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction. -// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int8x64) DotProductQuadruple(y Uint8x64) Int32x16 - -/* DotProductQuadrupleSaturated */ - -// DotProductQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y. -// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction. -// -// Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int8x16) DotProductQuadrupleSaturated(y Uint8x16) Int32x4 - -// DotProductQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y. -// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction. -// -// Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int8x32) DotProductQuadrupleSaturated(y Uint8x32) Int32x8 - -// DotProductQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y. -// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int8x64) DotProductQuadrupleSaturated(y Uint8x64) Int32x16 - /* Equal */ -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQB, CPU Feature: AVX func (x Int8x16) Equal(y Int8x16) Mask8x16 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQB, CPU Feature: AVX2 func (x Int8x32) Equal(y Int8x32) Mask8x32 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQB, CPU Feature: AVX512 func (x Int8x64) Equal(y Int8x64) Mask8x64 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. 
// // Asm: VPCMPEQW, CPU Feature: AVX func (x Int16x8) Equal(y Int16x8) Mask16x8 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQW, CPU Feature: AVX2 func (x Int16x16) Equal(y Int16x16) Mask16x16 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQW, CPU Feature: AVX512 func (x Int16x32) Equal(y Int16x32) Mask16x32 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQD, CPU Feature: AVX func (x Int32x4) Equal(y Int32x4) Mask32x4 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQD, CPU Feature: AVX2 func (x Int32x8) Equal(y Int32x8) Mask32x8 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQD, CPU Feature: AVX512 func (x Int32x16) Equal(y Int32x16) Mask32x16 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQQ, CPU Feature: AVX func (x Int64x2) Equal(y Int64x2) Mask64x2 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQQ, CPU Feature: AVX2 func (x Int64x4) Equal(y Int64x4) Mask64x4 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQQ, CPU Feature: AVX512 func (x Int64x8) Equal(y Int64x8) Mask64x8 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQB, CPU Feature: AVX func (x Uint8x16) Equal(y Uint8x16) Mask8x16 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. 
// // Asm: VPCMPEQB, CPU Feature: AVX2 func (x Uint8x32) Equal(y Uint8x32) Mask8x32 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQB, CPU Feature: AVX512 func (x Uint8x64) Equal(y Uint8x64) Mask8x64 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQW, CPU Feature: AVX func (x Uint16x8) Equal(y Uint16x8) Mask16x8 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQW, CPU Feature: AVX2 func (x Uint16x16) Equal(y Uint16x16) Mask16x16 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQW, CPU Feature: AVX512 func (x Uint16x32) Equal(y Uint16x32) Mask16x32 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQD, CPU Feature: AVX func (x Uint32x4) Equal(y Uint32x4) Mask32x4 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQD, CPU Feature: AVX2 func (x Uint32x8) Equal(y Uint32x8) Mask32x8 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQD, CPU Feature: AVX512 func (x Uint32x16) Equal(y Uint32x16) Mask32x16 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQQ, CPU Feature: AVX func (x Uint64x2) Equal(y Uint64x2) Mask64x2 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VPCMPEQQ, CPU Feature: AVX2 func (x Uint64x4) Equal(y Uint64x4) Mask64x4 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. 
// // Asm: VPCMPEQQ, CPU Feature: AVX512 func (x Uint64x8) Equal(y Uint64x8) Mask64x8 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x4) Equal(y Float32x4) Mask32x4 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x8) Equal(y Float32x8) Mask32x8 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VCMPPS, CPU Feature: AVX512 func (x Float32x16) Equal(y Float32x16) Mask32x16 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x2) Equal(y Float64x2) Mask64x2 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x4) Equal(y Float64x4) Mask64x4 -// Equal returns x equals y, elementwise. +// Equal returns a mask whose elements indicate whether x == y. // // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) Equal(y Float64x8) Mask64x8 @@ -2354,254 +2385,218 @@ func (x Uint64x4) Expand(mask Mask64x4) Uint64x4 // Asm: VPEXPANDQ, CPU Feature: AVX512 func (x Uint64x8) Expand(mask Mask64x8) Uint64x8 -/* ExtendLo2ToInt64x2 */ +/* ExtendLo2ToInt64 */ -// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64. -// The result vector's elements are sign-extended. +// ExtendLo2ToInt64 sign-extends 2 lowest vector element values to int64. // // Asm: VPMOVSXBQ, CPU Feature: AVX -func (x Int8x16) ExtendLo2ToInt64x2() Int64x2 +func (x Int8x16) ExtendLo2ToInt64() Int64x2 -// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64. -// The result vector's elements are sign-extended. +// ExtendLo2ToInt64 sign-extends 2 lowest vector element values to int64. 
// // Asm: VPMOVSXWQ, CPU Feature: AVX -func (x Int16x8) ExtendLo2ToInt64x2() Int64x2 +func (x Int16x8) ExtendLo2ToInt64() Int64x2 -// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64. -// The result vector's elements are sign-extended. +// ExtendLo2ToInt64 sign-extends 2 lowest vector element values to int64. // // Asm: VPMOVSXDQ, CPU Feature: AVX -func (x Int32x4) ExtendLo2ToInt64x2() Int64x2 +func (x Int32x4) ExtendLo2ToInt64() Int64x2 -/* ExtendLo2ToUint64x2 */ +/* ExtendLo2ToUint64 */ -// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64. -// The result vector's elements are zero-extended. +// ExtendLo2ToUint64 zero-extends 2 lowest vector element values to uint64. // // Asm: VPMOVZXBQ, CPU Feature: AVX -func (x Uint8x16) ExtendLo2ToUint64x2() Uint64x2 +func (x Uint8x16) ExtendLo2ToUint64() Uint64x2 -// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64. -// The result vector's elements are zero-extended. +// ExtendLo2ToUint64 zero-extends 2 lowest vector element values to uint64. // // Asm: VPMOVZXWQ, CPU Feature: AVX -func (x Uint16x8) ExtendLo2ToUint64x2() Uint64x2 +func (x Uint16x8) ExtendLo2ToUint64() Uint64x2 -// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64. -// The result vector's elements are zero-extended. +// ExtendLo2ToUint64 zero-extends 2 lowest vector element values to uint64. // // Asm: VPMOVZXDQ, CPU Feature: AVX -func (x Uint32x4) ExtendLo2ToUint64x2() Uint64x2 +func (x Uint32x4) ExtendLo2ToUint64() Uint64x2 -/* ExtendLo4ToInt32x4 */ +/* ExtendLo4ToInt32 */ -// ExtendLo4ToInt32x4 converts 4 lowest vector element values to int32. -// The result vector's elements are sign-extended. +// ExtendLo4ToInt32 sign-extends 4 lowest vector element values to int32. // // Asm: VPMOVSXBD, CPU Feature: AVX -func (x Int8x16) ExtendLo4ToInt32x4() Int32x4 +func (x Int8x16) ExtendLo4ToInt32() Int32x4 -// ExtendLo4ToInt32x4 converts 4 lowest vector element values to int32. 
-// The result vector's elements are sign-extended. +// ExtendLo4ToInt32 sign-extends 4 lowest vector element values to int32. // // Asm: VPMOVSXWD, CPU Feature: AVX -func (x Int16x8) ExtendLo4ToInt32x4() Int32x4 +func (x Int16x8) ExtendLo4ToInt32() Int32x4 -/* ExtendLo4ToInt64x4 */ +/* ExtendLo4ToInt64 */ -// ExtendLo4ToInt64x4 converts 4 lowest vector element values to int64. -// The result vector's elements are sign-extended. +// ExtendLo4ToInt64 sign-extends 4 lowest vector element values to int64. // // Asm: VPMOVSXBQ, CPU Feature: AVX2 -func (x Int8x16) ExtendLo4ToInt64x4() Int64x4 +func (x Int8x16) ExtendLo4ToInt64() Int64x4 -// ExtendLo4ToInt64x4 converts 4 lowest vector element values to int64. -// The result vector's elements are sign-extended. +// ExtendLo4ToInt64 sign-extends 4 lowest vector element values to int64. // // Asm: VPMOVSXWQ, CPU Feature: AVX2 -func (x Int16x8) ExtendLo4ToInt64x4() Int64x4 +func (x Int16x8) ExtendLo4ToInt64() Int64x4 -/* ExtendLo4ToUint32x4 */ +/* ExtendLo4ToUint32 */ -// ExtendLo4ToUint32x4 converts 4 lowest vector element values to uint32. -// The result vector's elements are zero-extended. +// ExtendLo4ToUint32 zero-extends 4 lowest vector element values to uint32. // // Asm: VPMOVZXBD, CPU Feature: AVX -func (x Uint8x16) ExtendLo4ToUint32x4() Uint32x4 +func (x Uint8x16) ExtendLo4ToUint32() Uint32x4 -// ExtendLo4ToUint32x4 converts 4 lowest vector element values to uint32. -// The result vector's elements are zero-extended. +// ExtendLo4ToUint32 zero-extends 4 lowest vector element values to uint32. // // Asm: VPMOVZXWD, CPU Feature: AVX -func (x Uint16x8) ExtendLo4ToUint32x4() Uint32x4 +func (x Uint16x8) ExtendLo4ToUint32() Uint32x4 -/* ExtendLo4ToUint64x4 */ +/* ExtendLo4ToUint64 */ -// ExtendLo4ToUint64x4 converts 4 lowest vector element values to uint64. -// The result vector's elements are zero-extended. +// ExtendLo4ToUint64 zero-extends 4 lowest vector element values to uint64. 
// // Asm: VPMOVZXBQ, CPU Feature: AVX2 -func (x Uint8x16) ExtendLo4ToUint64x4() Uint64x4 +func (x Uint8x16) ExtendLo4ToUint64() Uint64x4 -// ExtendLo4ToUint64x4 converts 4 lowest vector element values to uint64. -// The result vector's elements are zero-extended. +// ExtendLo4ToUint64 zero-extends 4 lowest vector element values to uint64. // // Asm: VPMOVZXWQ, CPU Feature: AVX2 -func (x Uint16x8) ExtendLo4ToUint64x4() Uint64x4 +func (x Uint16x8) ExtendLo4ToUint64() Uint64x4 -/* ExtendLo8ToInt16x8 */ +/* ExtendLo8ToInt16 */ -// ExtendLo8ToInt16x8 converts 8 lowest vector element values to int16. -// The result vector's elements are sign-extended. +// ExtendLo8ToInt16 sign-extends 8 lowest vector element values to int16. // // Asm: VPMOVSXBW, CPU Feature: AVX -func (x Int8x16) ExtendLo8ToInt16x8() Int16x8 +func (x Int8x16) ExtendLo8ToInt16() Int16x8 -/* ExtendLo8ToInt32x8 */ +/* ExtendLo8ToInt32 */ -// ExtendLo8ToInt32x8 converts 8 lowest vector element values to int32. -// The result vector's elements are sign-extended. +// ExtendLo8ToInt32 sign-extends 8 lowest vector element values to int32. // // Asm: VPMOVSXBD, CPU Feature: AVX2 -func (x Int8x16) ExtendLo8ToInt32x8() Int32x8 +func (x Int8x16) ExtendLo8ToInt32() Int32x8 -/* ExtendLo8ToInt64x8 */ +/* ExtendLo8ToInt64 */ -// ExtendLo8ToInt64x8 converts 8 lowest vector element values to int64. -// The result vector's elements are sign-extended. +// ExtendLo8ToInt64 sign-extends 8 lowest vector element values to int64. // // Asm: VPMOVSXBQ, CPU Feature: AVX512 -func (x Int8x16) ExtendLo8ToInt64x8() Int64x8 +func (x Int8x16) ExtendLo8ToInt64() Int64x8 -/* ExtendLo8ToUint16x8 */ +/* ExtendLo8ToUint16 */ -// ExtendLo8ToUint16x8 converts 8 lowest vector element values to uint16. -// The result vector's elements are zero-extended. +// ExtendLo8ToUint16 zero-extends 8 lowest vector element values to uint16. 
// // Asm: VPMOVZXBW, CPU Feature: AVX -func (x Uint8x16) ExtendLo8ToUint16x8() Uint16x8 +func (x Uint8x16) ExtendLo8ToUint16() Uint16x8 -/* ExtendLo8ToUint32x8 */ +/* ExtendLo8ToUint32 */ -// ExtendLo8ToUint32x8 converts 8 lowest vector element values to uint32. -// The result vector's elements are zero-extended. +// ExtendLo8ToUint32 zero-extends 8 lowest vector element values to uint32. // // Asm: VPMOVZXBD, CPU Feature: AVX2 -func (x Uint8x16) ExtendLo8ToUint32x8() Uint32x8 +func (x Uint8x16) ExtendLo8ToUint32() Uint32x8 -/* ExtendLo8ToUint64x8 */ +/* ExtendLo8ToUint64 */ -// ExtendLo8ToUint64x8 converts 8 lowest vector element values to uint64. -// The result vector's elements are zero-extended. +// ExtendLo8ToUint64 zero-extends 8 lowest vector element values to uint64. // // Asm: VPMOVZXBQ, CPU Feature: AVX512 -func (x Uint8x16) ExtendLo8ToUint64x8() Uint64x8 +func (x Uint8x16) ExtendLo8ToUint64() Uint64x8 /* ExtendToInt16 */ -// ExtendToInt16 converts element values to int16. -// The result vector's elements are sign-extended. +// ExtendToInt16 sign-extends element values to int16. // // Asm: VPMOVSXBW, CPU Feature: AVX2 func (x Int8x16) ExtendToInt16() Int16x16 -// ExtendToInt16 converts element values to int16. -// The result vector's elements are sign-extended. +// ExtendToInt16 sign-extends element values to int16. // // Asm: VPMOVSXBW, CPU Feature: AVX512 func (x Int8x32) ExtendToInt16() Int16x32 /* ExtendToInt32 */ -// ExtendToInt32 converts element values to int32. -// The result vector's elements are sign-extended. +// ExtendToInt32 sign-extends element values to int32. // // Asm: VPMOVSXBD, CPU Feature: AVX512 func (x Int8x16) ExtendToInt32() Int32x16 -// ExtendToInt32 converts element values to int32. -// The result vector's elements are sign-extended. +// ExtendToInt32 sign-extends element values to int32. // // Asm: VPMOVSXWD, CPU Feature: AVX2 func (x Int16x8) ExtendToInt32() Int32x8 -// ExtendToInt32 converts element values to int32. 
-// The result vector's elements are sign-extended. +// ExtendToInt32 sign-extends element values to int32. // // Asm: VPMOVSXWD, CPU Feature: AVX512 func (x Int16x16) ExtendToInt32() Int32x16 /* ExtendToInt64 */ -// ExtendToInt64 converts element values to int64. -// The result vector's elements are sign-extended. +// ExtendToInt64 sign-extends element values to int64. // // Asm: VPMOVSXWQ, CPU Feature: AVX512 func (x Int16x8) ExtendToInt64() Int64x8 -// ExtendToInt64 converts element values to int64. -// The result vector's elements are sign-extended. +// ExtendToInt64 sign-extends element values to int64. // // Asm: VPMOVSXDQ, CPU Feature: AVX2 func (x Int32x4) ExtendToInt64() Int64x4 -// ExtendToInt64 converts element values to int64. -// The result vector's elements are sign-extended. +// ExtendToInt64 sign-extends element values to int64. // // Asm: VPMOVSXDQ, CPU Feature: AVX512 func (x Int32x8) ExtendToInt64() Int64x8 /* ExtendToUint16 */ -// ExtendToUint16 converts element values to uint16. -// The result vector's elements are zero-extended. +// ExtendToUint16 zero-extends element values to uint16. // // Asm: VPMOVZXBW, CPU Feature: AVX2 func (x Uint8x16) ExtendToUint16() Uint16x16 -// ExtendToUint16 converts element values to uint16. -// The result vector's elements are zero-extended. +// ExtendToUint16 zero-extends element values to uint16. // // Asm: VPMOVZXBW, CPU Feature: AVX512 func (x Uint8x32) ExtendToUint16() Uint16x32 /* ExtendToUint32 */ -// ExtendToUint32 converts element values to uint32. -// The result vector's elements are zero-extended. +// ExtendToUint32 zero-extends element values to uint32. // // Asm: VPMOVZXBD, CPU Feature: AVX512 func (x Uint8x16) ExtendToUint32() Uint32x16 -// ExtendToUint32 converts element values to uint32. -// The result vector's elements are zero-extended. +// ExtendToUint32 zero-extends element values to uint32. 
// // Asm: VPMOVZXWD, CPU Feature: AVX2 func (x Uint16x8) ExtendToUint32() Uint32x8 -// ExtendToUint32 converts element values to uint32. -// The result vector's elements are zero-extended. +// ExtendToUint32 zero-extends element values to uint32. // // Asm: VPMOVZXWD, CPU Feature: AVX512 func (x Uint16x16) ExtendToUint32() Uint32x16 /* ExtendToUint64 */ -// ExtendToUint64 converts element values to uint64. -// The result vector's elements are zero-extended. +// ExtendToUint64 zero-extends element values to uint64. // // Asm: VPMOVZXWQ, CPU Feature: AVX512 func (x Uint16x8) ExtendToUint64() Uint64x8 -// ExtendToUint64 converts element values to uint64. -// The result vector's elements are zero-extended. +// ExtendToUint64 zero-extends element values to uint64. // // Asm: VPMOVZXDQ, CPU Feature: AVX2 func (x Uint32x4) ExtendToUint64() Uint64x4 -// ExtendToUint64 converts element values to uint64. -// The result vector's elements are zero-extended. +// ExtendToUint64 zero-extends element values to uint64. // // Asm: VPMOVZXDQ, CPU Feature: AVX512 func (x Uint32x8) ExtendToUint64() Uint64x8 @@ -3081,184 +3076,184 @@ func (x Uint64x8) GetLo() Uint64x4 /* Greater */ -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTB, CPU Feature: AVX func (x Int8x16) Greater(y Int8x16) Mask8x16 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTB, CPU Feature: AVX2 func (x Int8x32) Greater(y Int8x32) Mask8x32 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTB, CPU Feature: AVX512 func (x Int8x64) Greater(y Int8x64) Mask8x64 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. 
// // Asm: VPCMPGTW, CPU Feature: AVX func (x Int16x8) Greater(y Int16x8) Mask16x8 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTW, CPU Feature: AVX2 func (x Int16x16) Greater(y Int16x16) Mask16x16 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTW, CPU Feature: AVX512 func (x Int16x32) Greater(y Int16x32) Mask16x32 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTD, CPU Feature: AVX func (x Int32x4) Greater(y Int32x4) Mask32x4 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTD, CPU Feature: AVX2 func (x Int32x8) Greater(y Int32x8) Mask32x8 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTD, CPU Feature: AVX512 func (x Int32x16) Greater(y Int32x16) Mask32x16 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTQ, CPU Feature: AVX func (x Int64x2) Greater(y Int64x2) Mask64x2 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTQ, CPU Feature: AVX2 func (x Int64x4) Greater(y Int64x4) Mask64x4 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPGTQ, CPU Feature: AVX512 func (x Int64x8) Greater(y Int64x8) Mask64x8 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x4) Greater(y Float32x4) Mask32x4 -// Greater returns x greater-than y, elementwise. 
+// Greater returns a mask whose elements indicate whether x > y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x8) Greater(y Float32x8) Mask32x8 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VCMPPS, CPU Feature: AVX512 func (x Float32x16) Greater(y Float32x16) Mask32x16 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x2) Greater(y Float64x2) Mask64x2 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x4) Greater(y Float64x4) Mask64x4 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) Greater(y Float64x8) Mask64x8 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPUB, CPU Feature: AVX512 func (x Uint8x64) Greater(y Uint8x64) Mask8x64 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPUW, CPU Feature: AVX512 func (x Uint16x32) Greater(y Uint16x32) Mask16x32 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPUD, CPU Feature: AVX512 func (x Uint32x16) Greater(y Uint32x16) Mask32x16 -// Greater returns x greater-than y, elementwise. +// Greater returns a mask whose elements indicate whether x > y. // // Asm: VPCMPUQ, CPU Feature: AVX512 func (x Uint64x8) Greater(y Uint64x8) Mask64x8 /* GreaterEqual */ -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. 
// // Asm: VCMPPS, CPU Feature: AVX func (x Float32x4) GreaterEqual(y Float32x4) Mask32x4 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x8) GreaterEqual(y Float32x8) Mask32x8 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VCMPPS, CPU Feature: AVX512 func (x Float32x16) GreaterEqual(y Float32x16) Mask32x16 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x2) GreaterEqual(y Float64x2) Mask64x2 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x4) GreaterEqual(y Float64x4) Mask64x4 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) GreaterEqual(y Float64x8) Mask64x8 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPB, CPU Feature: AVX512 func (x Int8x64) GreaterEqual(y Int8x64) Mask8x64 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPW, CPU Feature: AVX512 func (x Int16x32) GreaterEqual(y Int16x32) Mask16x32 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPD, CPU Feature: AVX512 func (x Int32x16) GreaterEqual(y Int32x16) Mask32x16 -// GreaterEqual returns x greater-than-or-equals y, elementwise. 
+// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPQ, CPU Feature: AVX512 func (x Int64x8) GreaterEqual(y Int64x8) Mask64x8 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPUB, CPU Feature: AVX512 func (x Uint8x64) GreaterEqual(y Uint8x64) Mask8x64 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPUW, CPU Feature: AVX512 func (x Uint16x32) GreaterEqual(y Uint16x32) Mask16x32 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPUD, CPU Feature: AVX512 func (x Uint32x16) GreaterEqual(y Uint32x16) Mask32x16 -// GreaterEqual returns x greater-than-or-equals y, elementwise. +// GreaterEqual returns a mask whose elements indicate whether x >= y. // // Asm: VPCMPUQ, CPU Feature: AVX512 func (x Uint64x8) GreaterEqual(y Uint64x8) Mask64x8 @@ -3451,38 +3446,6 @@ func (x Uint64x4) InterleaveLoGrouped(y Uint64x4) Uint64x4 // Asm: VPUNPCKLQDQ, CPU Feature: AVX512 func (x Uint64x8) InterleaveLoGrouped(y Uint64x8) Uint64x8 -/* IsNan */ - -// IsNan checks if elements are NaN. Use as x.IsNan(x). -// -// Asm: VCMPPS, CPU Feature: AVX -func (x Float32x4) IsNan(y Float32x4) Mask32x4 - -// IsNan checks if elements are NaN. Use as x.IsNan(x). -// -// Asm: VCMPPS, CPU Feature: AVX -func (x Float32x8) IsNan(y Float32x8) Mask32x8 - -// IsNan checks if elements are NaN. Use as x.IsNan(x). -// -// Asm: VCMPPS, CPU Feature: AVX512 -func (x Float32x16) IsNan(y Float32x16) Mask32x16 - -// IsNan checks if elements are NaN. Use as x.IsNan(x). -// -// Asm: VCMPPD, CPU Feature: AVX -func (x Float64x2) IsNan(y Float64x2) Mask64x2 - -// IsNan checks if elements are NaN. Use as x.IsNan(x). 
-// -// Asm: VCMPPD, CPU Feature: AVX -func (x Float64x4) IsNan(y Float64x4) Mask64x4 - -// IsNan checks if elements are NaN. Use as x.IsNan(x). -// -// Asm: VCMPPD, CPU Feature: AVX512 -func (x Float64x8) IsNan(y Float64x8) Mask64x8 - /* LeadingZeros */ // LeadingZeros counts the leading zeros of each element in x. @@ -3547,448 +3510,448 @@ func (x Uint64x8) LeadingZeros() Uint64x8 /* Less */ -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x4) Less(y Float32x4) Mask32x4 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x8) Less(y Float32x8) Mask32x8 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VCMPPS, CPU Feature: AVX512 func (x Float32x16) Less(y Float32x16) Mask32x16 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x2) Less(y Float64x2) Mask64x2 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x4) Less(y Float64x4) Mask64x4 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) Less(y Float64x8) Mask64x8 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPB, CPU Feature: AVX512 func (x Int8x64) Less(y Int8x64) Mask8x64 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPW, CPU Feature: AVX512 func (x Int16x32) Less(y Int16x32) Mask16x32 -// Less returns x less-than y, elementwise. 
+// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPD, CPU Feature: AVX512 func (x Int32x16) Less(y Int32x16) Mask32x16 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPQ, CPU Feature: AVX512 func (x Int64x8) Less(y Int64x8) Mask64x8 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPUB, CPU Feature: AVX512 func (x Uint8x64) Less(y Uint8x64) Mask8x64 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPUW, CPU Feature: AVX512 func (x Uint16x32) Less(y Uint16x32) Mask16x32 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPUD, CPU Feature: AVX512 func (x Uint32x16) Less(y Uint32x16) Mask32x16 -// Less returns x less-than y, elementwise. +// Less returns a mask whose elements indicate whether x < y. // // Asm: VPCMPUQ, CPU Feature: AVX512 func (x Uint64x8) Less(y Uint64x8) Mask64x8 /* LessEqual */ -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x4) LessEqual(y Float32x4) Mask32x4 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x8) LessEqual(y Float32x8) Mask32x8 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VCMPPS, CPU Feature: AVX512 func (x Float32x16) LessEqual(y Float32x16) Mask32x16 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. 
// // Asm: VCMPPD, CPU Feature: AVX func (x Float64x2) LessEqual(y Float64x2) Mask64x2 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x4) LessEqual(y Float64x4) Mask64x4 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) LessEqual(y Float64x8) Mask64x8 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VPCMPB, CPU Feature: AVX512 func (x Int8x64) LessEqual(y Int8x64) Mask8x64 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VPCMPW, CPU Feature: AVX512 func (x Int16x32) LessEqual(y Int16x32) Mask16x32 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VPCMPD, CPU Feature: AVX512 func (x Int32x16) LessEqual(y Int32x16) Mask32x16 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VPCMPQ, CPU Feature: AVX512 func (x Int64x8) LessEqual(y Int64x8) Mask64x8 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VPCMPUB, CPU Feature: AVX512 func (x Uint8x64) LessEqual(y Uint8x64) Mask8x64 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VPCMPUW, CPU Feature: AVX512 func (x Uint16x32) LessEqual(y Uint16x32) Mask16x32 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. 
// // Asm: VPCMPUD, CPU Feature: AVX512 func (x Uint32x16) LessEqual(y Uint32x16) Mask32x16 -// LessEqual returns x less-than-or-equals y, elementwise. +// LessEqual returns a mask whose elements indicate whether x <= y. // // Asm: VPCMPUQ, CPU Feature: AVX512 func (x Uint64x8) LessEqual(y Uint64x8) Mask64x8 /* Max */ -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VMAXPS, CPU Feature: AVX func (x Float32x4) Max(y Float32x4) Float32x4 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VMAXPS, CPU Feature: AVX func (x Float32x8) Max(y Float32x8) Float32x8 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VMAXPS, CPU Feature: AVX512 func (x Float32x16) Max(y Float32x16) Float32x16 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VMAXPD, CPU Feature: AVX func (x Float64x2) Max(y Float64x2) Float64x2 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VMAXPD, CPU Feature: AVX func (x Float64x4) Max(y Float64x4) Float64x4 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VMAXPD, CPU Feature: AVX512 func (x Float64x8) Max(y Float64x8) Float64x8 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSB, CPU Feature: AVX func (x Int8x16) Max(y Int8x16) Int8x16 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. 
// // Asm: VPMAXSB, CPU Feature: AVX2 func (x Int8x32) Max(y Int8x32) Int8x32 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSB, CPU Feature: AVX512 func (x Int8x64) Max(y Int8x64) Int8x64 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSW, CPU Feature: AVX func (x Int16x8) Max(y Int16x8) Int16x8 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSW, CPU Feature: AVX2 func (x Int16x16) Max(y Int16x16) Int16x16 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSW, CPU Feature: AVX512 func (x Int16x32) Max(y Int16x32) Int16x32 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSD, CPU Feature: AVX func (x Int32x4) Max(y Int32x4) Int32x4 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSD, CPU Feature: AVX2 func (x Int32x8) Max(y Int32x8) Int32x8 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSD, CPU Feature: AVX512 func (x Int32x16) Max(y Int32x16) Int32x16 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSQ, CPU Feature: AVX512 func (x Int64x2) Max(y Int64x2) Int64x2 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. 
// // Asm: VPMAXSQ, CPU Feature: AVX512 func (x Int64x4) Max(y Int64x4) Int64x4 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXSQ, CPU Feature: AVX512 func (x Int64x8) Max(y Int64x8) Int64x8 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUB, CPU Feature: AVX func (x Uint8x16) Max(y Uint8x16) Uint8x16 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUB, CPU Feature: AVX2 func (x Uint8x32) Max(y Uint8x32) Uint8x32 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUB, CPU Feature: AVX512 func (x Uint8x64) Max(y Uint8x64) Uint8x64 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUW, CPU Feature: AVX func (x Uint16x8) Max(y Uint16x8) Uint16x8 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUW, CPU Feature: AVX2 func (x Uint16x16) Max(y Uint16x16) Uint16x16 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUW, CPU Feature: AVX512 func (x Uint16x32) Max(y Uint16x32) Uint16x32 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUD, CPU Feature: AVX func (x Uint32x4) Max(y Uint32x4) Uint32x4 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. 
// // Asm: VPMAXUD, CPU Feature: AVX2 func (x Uint32x8) Max(y Uint32x8) Uint32x8 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUD, CPU Feature: AVX512 func (x Uint32x16) Max(y Uint32x16) Uint32x16 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUQ, CPU Feature: AVX512 func (x Uint64x2) Max(y Uint64x2) Uint64x2 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUQ, CPU Feature: AVX512 func (x Uint64x4) Max(y Uint64x4) Uint64x4 -// Max computes the maximum of corresponding elements. +// Max computes the maximum of each pair of corresponding elements in x and y. // // Asm: VPMAXUQ, CPU Feature: AVX512 func (x Uint64x8) Max(y Uint64x8) Uint64x8 /* Min */ -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VMINPS, CPU Feature: AVX func (x Float32x4) Min(y Float32x4) Float32x4 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VMINPS, CPU Feature: AVX func (x Float32x8) Min(y Float32x8) Float32x8 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VMINPS, CPU Feature: AVX512 func (x Float32x16) Min(y Float32x16) Float32x16 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VMINPD, CPU Feature: AVX func (x Float64x2) Min(y Float64x2) Float64x2 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. 
// // Asm: VMINPD, CPU Feature: AVX func (x Float64x4) Min(y Float64x4) Float64x4 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VMINPD, CPU Feature: AVX512 func (x Float64x8) Min(y Float64x8) Float64x8 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSB, CPU Feature: AVX func (x Int8x16) Min(y Int8x16) Int8x16 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSB, CPU Feature: AVX2 func (x Int8x32) Min(y Int8x32) Int8x32 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSB, CPU Feature: AVX512 func (x Int8x64) Min(y Int8x64) Int8x64 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSW, CPU Feature: AVX func (x Int16x8) Min(y Int16x8) Int16x8 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSW, CPU Feature: AVX2 func (x Int16x16) Min(y Int16x16) Int16x16 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSW, CPU Feature: AVX512 func (x Int16x32) Min(y Int16x32) Int16x32 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSD, CPU Feature: AVX func (x Int32x4) Min(y Int32x4) Int32x4 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. 
// // Asm: VPMINSD, CPU Feature: AVX2 func (x Int32x8) Min(y Int32x8) Int32x8 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSD, CPU Feature: AVX512 func (x Int32x16) Min(y Int32x16) Int32x16 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSQ, CPU Feature: AVX512 func (x Int64x2) Min(y Int64x2) Int64x2 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSQ, CPU Feature: AVX512 func (x Int64x4) Min(y Int64x4) Int64x4 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINSQ, CPU Feature: AVX512 func (x Int64x8) Min(y Int64x8) Int64x8 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUB, CPU Feature: AVX func (x Uint8x16) Min(y Uint8x16) Uint8x16 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUB, CPU Feature: AVX2 func (x Uint8x32) Min(y Uint8x32) Uint8x32 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUB, CPU Feature: AVX512 func (x Uint8x64) Min(y Uint8x64) Uint8x64 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUW, CPU Feature: AVX func (x Uint16x8) Min(y Uint16x8) Uint16x8 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. 
// // Asm: VPMINUW, CPU Feature: AVX2 func (x Uint16x16) Min(y Uint16x16) Uint16x16 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUW, CPU Feature: AVX512 func (x Uint16x32) Min(y Uint16x32) Uint16x32 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUD, CPU Feature: AVX func (x Uint32x4) Min(y Uint32x4) Uint32x4 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUD, CPU Feature: AVX2 func (x Uint32x8) Min(y Uint32x8) Uint32x8 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUD, CPU Feature: AVX512 func (x Uint32x16) Min(y Uint32x16) Uint32x16 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUQ, CPU Feature: AVX512 func (x Uint64x2) Min(y Uint64x2) Uint64x2 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUQ, CPU Feature: AVX512 func (x Uint64x4) Min(y Uint64x4) Uint64x4 -// Min computes the minimum of corresponding elements. +// Min computes the minimum of each pair of corresponding elements in x and y. // // Asm: VPMINUQ, CPU Feature: AVX512 func (x Uint64x8) Min(y Uint64x8) Uint64x8 @@ -4182,25 +4145,25 @@ func (x Float64x8) MulAddSub(y Float64x8, z Float64x8) Float64x8 /* MulEvenWiden */ // MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Result[i] = v1[2*i] * v2[2*i]. 
// // Asm: VPMULDQ, CPU Feature: AVX func (x Int32x4) MulEvenWiden(y Int32x4) Int64x2 // MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Result[i] = v1[2*i] * v2[2*i]. // // Asm: VPMULDQ, CPU Feature: AVX2 func (x Int32x8) MulEvenWiden(y Int32x8) Int64x4 // MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Result[i] = v1[2*i] * v2[2*i]. // // Asm: VPMULUDQ, CPU Feature: AVX func (x Uint32x4) MulEvenWiden(y Uint32x4) Uint64x2 // MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Result[i] = v1[2*i] * v2[2*i]. // // Asm: VPMULUDQ, CPU Feature: AVX2 func (x Uint32x8) MulEvenWiden(y Uint32x8) Uint64x4 @@ -4271,72 +4234,72 @@ func (x Float64x8) MulSubAdd(y Float64x8, z Float64x8) Float64x8 /* NotEqual */ -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x4) NotEqual(y Float32x4) Mask32x4 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VCMPPS, CPU Feature: AVX func (x Float32x8) NotEqual(y Float32x8) Mask32x8 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VCMPPS, CPU Feature: AVX512 func (x Float32x16) NotEqual(y Float32x16) Mask32x16 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x2) NotEqual(y Float64x2) Mask64x2 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VCMPPD, CPU Feature: AVX func (x Float64x4) NotEqual(y Float64x4) Mask64x4 -// NotEqual returns x not-equals y, elementwise. 
+// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) NotEqual(y Float64x8) Mask64x8 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VPCMPB, CPU Feature: AVX512 func (x Int8x64) NotEqual(y Int8x64) Mask8x64 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VPCMPW, CPU Feature: AVX512 func (x Int16x32) NotEqual(y Int16x32) Mask16x32 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VPCMPD, CPU Feature: AVX512 func (x Int32x16) NotEqual(y Int32x16) Mask32x16 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VPCMPQ, CPU Feature: AVX512 func (x Int64x8) NotEqual(y Int64x8) Mask64x8 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VPCMPUB, CPU Feature: AVX512 func (x Uint8x64) NotEqual(y Uint8x64) Mask8x64 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VPCMPUW, CPU Feature: AVX512 func (x Uint16x32) NotEqual(y Uint16x32) Mask16x32 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. // // Asm: VPCMPUD, CPU Feature: AVX512 func (x Uint32x16) NotEqual(y Uint32x16) Mask32x16 -// NotEqual returns x not-equals y, elementwise. +// NotEqual returns a mask whose elements indicate whether x != y. 
// // Asm: VPCMPUQ, CPU Feature: AVX512 func (x Uint64x8) NotEqual(y Uint64x8) Mask64x8 @@ -4588,169 +4551,217 @@ func (x Uint64x8) Or(y Uint64x8) Uint64x8 /* Permute */ // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 4 bits (values 0-15) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 4 bits (values 0-15) of each element of indices is used. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x16) Permute(indices Uint8x16) Int8x16 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 4 bits (values 0-15) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 4 bits (values 0-15) of each element of indices is used. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x16) Permute(indices Uint8x16) Uint8x16 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 5 bits (values 0-31) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 5 bits (values 0-31) of each element of indices is used. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x32) Permute(indices Uint8x32) Int8x32 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 5 bits (values 0-31) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 5 bits (values 0-31) of each element of indices is used. 
// // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x32) Permute(indices Uint8x32) Uint8x32 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 6 bits (values 0-63) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 6 bits (values 0-63) of each element of indices is used. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x64) Permute(indices Uint8x64) Int8x64 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 6 bits (values 0-63) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 6 bits (values 0-63) of each element of indices is used. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x64) Permute(indices Uint8x64) Uint8x64 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x8) Permute(indices Uint16x8) Int16x8 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. 
// // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x8) Permute(indices Uint16x8) Uint16x8 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 4 bits (values 0-15) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 4 bits (values 0-15) of each element of indices is used. // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x16) Permute(indices Uint16x16) Int16x16 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 4 bits (values 0-15) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 4 bits (values 0-15) of each element of indices is used. // // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x16) Permute(indices Uint16x16) Uint16x16 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 5 bits (values 0-31) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 5 bits (values 0-31) of each element of indices is used. // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x32) Permute(indices Uint16x32) Int16x32 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 5 bits (values 0-31) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 5 bits (values 0-31) of each element of indices is used. 
// // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x32) Permute(indices Uint16x32) Uint16x32 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. // // Asm: VPERMPS, CPU Feature: AVX2 func (x Float32x8) Permute(indices Uint32x8) Float32x8 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. // // Asm: VPERMD, CPU Feature: AVX2 func (x Int32x8) Permute(indices Uint32x8) Int32x8 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. // // Asm: VPERMD, CPU Feature: AVX2 func (x Uint32x8) Permute(indices Uint32x8) Uint32x8 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 4 bits (values 0-15) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 4 bits (values 0-15) of each element of indices is used. 
// // Asm: VPERMPS, CPU Feature: AVX512 func (x Float32x16) Permute(indices Uint32x16) Float32x16 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 4 bits (values 0-15) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 4 bits (values 0-15) of each element of indices is used. // // Asm: VPERMD, CPU Feature: AVX512 func (x Int32x16) Permute(indices Uint32x16) Int32x16 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 4 bits (values 0-15) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 4 bits (values 0-15) of each element of indices is used. // // Asm: VPERMD, CPU Feature: AVX512 func (x Uint32x16) Permute(indices Uint32x16) Uint32x16 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 2 bits (values 0-3) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 2 bits (values 0-3) of each element of indices is used. // // Asm: VPERMPD, CPU Feature: AVX512 func (x Float64x4) Permute(indices Uint64x4) Float64x4 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 2 bits (values 0-3) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 2 bits (values 0-3) of each element of indices is used. 
// // Asm: VPERMQ, CPU Feature: AVX512 func (x Int64x4) Permute(indices Uint64x4) Int64x4 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 2 bits (values 0-3) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 2 bits (values 0-3) of each element of indices is used. // // Asm: VPERMQ, CPU Feature: AVX512 func (x Uint64x4) Permute(indices Uint64x4) Uint64x4 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. // // Asm: VPERMPD, CPU Feature: AVX512 func (x Float64x8) Permute(indices Uint64x8) Float64x8 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. // // Asm: VPERMQ, CPU Feature: AVX512 func (x Int64x8) Permute(indices Uint64x8) Int64x8 // Permute performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// The low 3 bits (values 0-7) of each element of indices is used +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// The low 3 bits (values 0-7) of each element of indices is used. 
// // Asm: VPERMQ, CPU Feature: AVX512 func (x Uint64x8) Permute(indices Uint64x8) Uint64x8 @@ -4758,7 +4769,9 @@ func (x Uint64x8) Permute(indices Uint64x8) Uint64x8 /* PermuteOrZero */ // PermuteOrZero performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// // The lower four bits of each byte-sized index in indices select an element from x, // unless the index's sign bit is set in which case zero is used instead. // @@ -4766,7 +4779,9 @@ func (x Uint64x8) Permute(indices Uint64x8) Uint64x8 func (x Int8x16) PermuteOrZero(indices Int8x16) Int8x16 // PermuteOrZero performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// +// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// // The lower four bits of each byte-sized index in indices select an element from x, // unless the index's sign bit is set in which case zero is used instead. // @@ -4776,7 +4791,9 @@ func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16 /* PermuteOrZeroGrouped */ // PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: -// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, // unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. 
@@ -4785,7 +4802,9 @@ func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16 func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32 // PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: -// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, // unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. @@ -4794,7 +4813,9 @@ func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32 func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64 // PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: -// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, // unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. 
@@ -4803,7 +4824,9 @@ func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64 func (x Uint8x32) PermuteOrZeroGrouped(indices Int8x32) Uint8x32 // PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: -// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, // unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. @@ -4877,84 +4900,84 @@ func (x Float64x8) ReciprocalSqrt() Float64x8 /* RotateAllLeft */ -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLD, CPU Feature: AVX512 func (x Int32x4) RotateAllLeft(shift uint8) Int32x4 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLD, CPU Feature: AVX512 func (x Int32x8) RotateAllLeft(shift uint8) Int32x8 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPROLD, CPU Feature: AVX512 func (x Int32x16) RotateAllLeft(shift uint8) Int32x16 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLQ, CPU Feature: AVX512 func (x Int64x2) RotateAllLeft(shift uint8) Int64x2 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLQ, CPU Feature: AVX512 func (x Int64x4) RotateAllLeft(shift uint8) Int64x4 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLQ, CPU Feature: AVX512 func (x Int64x8) RotateAllLeft(shift uint8) Int64x8 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLD, CPU Feature: AVX512 func (x Uint32x4) RotateAllLeft(shift uint8) Uint32x4 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. 
// // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLD, CPU Feature: AVX512 func (x Uint32x8) RotateAllLeft(shift uint8) Uint32x8 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLD, CPU Feature: AVX512 func (x Uint32x16) RotateAllLeft(shift uint8) Uint32x16 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLQ, CPU Feature: AVX512 func (x Uint64x2) RotateAllLeft(shift uint8) Uint64x2 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPROLQ, CPU Feature: AVX512 func (x Uint64x4) RotateAllLeft(shift uint8) Uint64x4 -// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate. +// RotateAllLeft rotates each element to the left by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // @@ -4963,84 +4986,84 @@ func (x Uint64x8) RotateAllLeft(shift uint8) Uint64x8 /* RotateAllRight */ -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. 
+// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORD, CPU Feature: AVX512 func (x Int32x4) RotateAllRight(shift uint8) Int32x4 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORD, CPU Feature: AVX512 func (x Int32x8) RotateAllRight(shift uint8) Int32x8 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORD, CPU Feature: AVX512 func (x Int32x16) RotateAllRight(shift uint8) Int32x16 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORQ, CPU Feature: AVX512 func (x Int64x2) RotateAllRight(shift uint8) Int64x2 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPRORQ, CPU Feature: AVX512 func (x Int64x4) RotateAllRight(shift uint8) Int64x4 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORQ, CPU Feature: AVX512 func (x Int64x8) RotateAllRight(shift uint8) Int64x8 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORD, CPU Feature: AVX512 func (x Uint32x4) RotateAllRight(shift uint8) Uint32x4 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORD, CPU Feature: AVX512 func (x Uint32x8) RotateAllRight(shift uint8) Uint32x8 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORD, CPU Feature: AVX512 func (x Uint32x16) RotateAllRight(shift uint8) Uint32x16 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. 
// // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORQ, CPU Feature: AVX512 func (x Uint64x2) RotateAllRight(shift uint8) Uint64x2 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPRORQ, CPU Feature: AVX512 func (x Uint64x4) RotateAllRight(shift uint8) Uint64x4 -// RotateAllRight rotates each element to the right by the number of bits specified by the immediate. +// RotateAllRight rotates each element to the right by the number of bits specified by shift. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // @@ -5173,22 +5196,22 @@ func (x Uint64x8) RotateRight(y Uint64x8) Uint64x8 /* RoundToEven */ -// RoundToEven rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer, rounding ties to even. // // Asm: VROUNDPS, CPU Feature: AVX func (x Float32x4) RoundToEven() Float32x4 -// RoundToEven rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer, rounding ties to even. // // Asm: VROUNDPS, CPU Feature: AVX func (x Float32x8) RoundToEven() Float32x8 -// RoundToEven rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer, rounding ties to even. // // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x2) RoundToEven() Float64x2 -// RoundToEven rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer, rounding ties to even. 
// // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) RoundToEven() Float64x4 @@ -5365,334 +5388,304 @@ func (x Uint32x4) SHA256TwoRounds(y Uint32x4, z Uint32x4) Uint32x4 /* SaturateToInt8 */ -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToInt8 converts element values to int8 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSWB, CPU Feature: AVX512 func (x Int16x8) SaturateToInt8() Int8x16 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToInt8 converts element values to int8 with signed saturation. // // Asm: VPMOVSWB, CPU Feature: AVX512 func (x Int16x16) SaturateToInt8() Int8x16 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. +// SaturateToInt8 converts element values to int8 with signed saturation. // // Asm: VPMOVSWB, CPU Feature: AVX512 func (x Int16x32) SaturateToInt8() Int8x32 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToInt8 converts element values to int8 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSDB, CPU Feature: AVX512 func (x Int32x4) SaturateToInt8() Int8x16 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. 
+// SaturateToInt8 converts element values to int8 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSDB, CPU Feature: AVX512 func (x Int32x8) SaturateToInt8() Int8x16 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToInt8 converts element values to int8 with signed saturation. // // Asm: VPMOVSDB, CPU Feature: AVX512 func (x Int32x16) SaturateToInt8() Int8x16 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToInt8 converts element values to int8 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSQB, CPU Feature: AVX512 func (x Int64x2) SaturateToInt8() Int8x16 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToInt8 converts element values to int8 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSQB, CPU Feature: AVX512 func (x Int64x4) SaturateToInt8() Int8x16 -// SaturateToInt8 converts element values to int8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToInt8 converts element values to int8 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. 
// // Asm: VPMOVSQB, CPU Feature: AVX512 func (x Int64x8) SaturateToInt8() Int8x16 /* SaturateToInt16 */ -// SaturateToInt16 converts element values to int16. -// Conversion is done with saturation on the vector elements. +// SaturateToInt16 converts element values to int16 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSDW, CPU Feature: AVX512 func (x Int32x4) SaturateToInt16() Int16x8 -// SaturateToInt16 converts element values to int16. -// Conversion is done with saturation on the vector elements. +// SaturateToInt16 converts element values to int16 with signed saturation. // // Asm: VPMOVSDW, CPU Feature: AVX512 func (x Int32x8) SaturateToInt16() Int16x8 -// SaturateToInt16 converts element values to int16. -// Conversion is done with saturation on the vector elements. +// SaturateToInt16 converts element values to int16 with signed saturation. // // Asm: VPMOVSDW, CPU Feature: AVX512 func (x Int32x16) SaturateToInt16() Int16x16 -// SaturateToInt16 converts element values to int16. -// Conversion is done with saturation on the vector elements. +// SaturateToInt16 converts element values to int16 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSQW, CPU Feature: AVX512 func (x Int64x2) SaturateToInt16() Int16x8 -// SaturateToInt16 converts element values to int16. -// Conversion is done with saturation on the vector elements. +// SaturateToInt16 converts element values to int16 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSQW, CPU Feature: AVX512 func (x Int64x4) SaturateToInt16() Int16x8 -// SaturateToInt16 converts element values to int16. -// Conversion is done with saturation on the vector elements. +// SaturateToInt16 converts element values to int16 with signed saturation. 
// // Asm: VPMOVSQW, CPU Feature: AVX512 func (x Int64x8) SaturateToInt16() Int16x8 /* SaturateToInt16Concat */ -// SaturateToInt16Concat converts element values to int16. -// With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. -// Conversion is done with saturation on the vector elements. +// SaturateToInt16Concat converts element values to int16 with signed saturation. +// The converted elements from x will be packed to the lower part of the result vector, +// the converted elements from y will be packed to the upper part of the result vector. // // Asm: VPACKSSDW, CPU Feature: AVX func (x Int32x4) SaturateToInt16Concat(y Int32x4) Int16x8 -// SaturateToInt16Concat converts element values to int16. +/* SaturateToInt16ConcatGrouped */ + +// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation. // With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. -// Conversion is done with saturation on the vector elements. +// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKSSDW, CPU Feature: AVX2 -func (x Int32x8) SaturateToInt16Concat(y Int32x8) Int16x16 +func (x Int32x8) SaturateToInt16ConcatGrouped(y Int32x8) Int16x16 -// SaturateToInt16Concat converts element values to int16. +// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation. 
// With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. -// Conversion is done with saturation on the vector elements. +// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKSSDW, CPU Feature: AVX512 -func (x Int32x16) SaturateToInt16Concat(y Int32x16) Int16x32 +func (x Int32x16) SaturateToInt16ConcatGrouped(y Int32x16) Int16x32 /* SaturateToInt32 */ -// SaturateToInt32 converts element values to int32. -// Conversion is done with saturation on the vector elements. +// SaturateToInt32 converts element values to int32 with signed saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVSQD, CPU Feature: AVX512 func (x Int64x2) SaturateToInt32() Int32x4 -// SaturateToInt32 converts element values to int32. -// Conversion is done with saturation on the vector elements. +// SaturateToInt32 converts element values to int32 with signed saturation. // // Asm: VPMOVSQD, CPU Feature: AVX512 func (x Int64x4) SaturateToInt32() Int32x4 -// SaturateToInt32 converts element values to int32. -// Conversion is done with saturation on the vector elements. +// SaturateToInt32 converts element values to int32 with signed saturation. // // Asm: VPMOVSQD, CPU Feature: AVX512 func (x Int64x8) SaturateToInt32() Int32x8 /* SaturateToUint8 */ -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. 
+// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSWB, CPU Feature: AVX512 -func (x Int16x8) SaturateToUint8() Int8x16 +// Asm: VPMOVUSWB, CPU Feature: AVX512 +func (x Uint16x8) SaturateToUint8() Uint8x16 -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. // -// Asm: VPMOVSWB, CPU Feature: AVX512 -func (x Int16x16) SaturateToUint8() Int8x16 +// Asm: VPMOVUSWB, CPU Feature: AVX512 +func (x Uint16x16) SaturateToUint8() Uint8x16 -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. // -// Asm: VPMOVSDB, CPU Feature: AVX512 -func (x Int32x4) SaturateToUint8() Int8x16 +// Asm: VPMOVUSWB, CPU Feature: AVX512 +func (x Uint16x32) SaturateToUint8() Uint8x32 -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSDB, CPU Feature: AVX512 -func (x Int32x8) SaturateToUint8() Int8x16 +// Asm: VPMOVUSDB, CPU Feature: AVX512 +func (x Uint32x4) SaturateToUint8() Uint8x16 -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. 
+// SaturateToUint8 converts element values to uint8 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSDB, CPU Feature: AVX512 -func (x Int32x16) SaturateToUint8() Int8x16 +// Asm: VPMOVUSDB, CPU Feature: AVX512 +func (x Uint32x8) SaturateToUint8() Uint8x16 -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. // -// Asm: VPMOVSQB, CPU Feature: AVX512 -func (x Int64x2) SaturateToUint8() Int8x16 +// Asm: VPMOVUSDB, CPU Feature: AVX512 +func (x Uint32x16) SaturateToUint8() Uint8x16 -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSQB, CPU Feature: AVX512 -func (x Int64x4) SaturateToUint8() Int8x16 +// Asm: VPMOVUSQB, CPU Feature: AVX512 +func (x Uint64x2) SaturateToUint8() Uint8x16 -// SaturateToUint8 converts element values to uint8. -// Conversion is done with saturation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSQB, CPU Feature: AVX512 -func (x Int64x8) SaturateToUint8() Int8x16 +// Asm: VPMOVUSQB, CPU Feature: AVX512 +func (x Uint64x4) SaturateToUint8() Uint8x16 -// SaturateToUint8 converts element values to uint8. 
-// Conversion is done with saturation on the vector elements. +// SaturateToUint8 converts element values to uint8 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVUSWB, CPU Feature: AVX512 -func (x Uint16x32) SaturateToUint8() Uint8x32 +// Asm: VPMOVUSQB, CPU Feature: AVX512 +func (x Uint64x8) SaturateToUint8() Uint8x16 /* SaturateToUint16 */ -// SaturateToUint16 converts element values to uint16. -// Conversion is done with saturation on the vector elements. +// SaturateToUint16 converts element values to uint16 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVUSDW, CPU Feature: AVX512 func (x Uint32x4) SaturateToUint16() Uint16x8 -// SaturateToUint16 converts element values to uint16. -// Conversion is done with saturation on the vector elements. +// SaturateToUint16 converts element values to uint16 with unsigned saturation. // // Asm: VPMOVUSDW, CPU Feature: AVX512 func (x Uint32x8) SaturateToUint16() Uint16x8 -// SaturateToUint16 converts element values to uint16. -// Conversion is done with saturation on the vector elements. +// SaturateToUint16 converts element values to uint16 with unsigned saturation. // // Asm: VPMOVUSDW, CPU Feature: AVX512 func (x Uint32x16) SaturateToUint16() Uint16x16 -// SaturateToUint16 converts element values to uint16. -// Conversion is done with saturation on the vector elements. +// SaturateToUint16 converts element values to uint16 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVUSQW, CPU Feature: AVX512 func (x Uint64x2) SaturateToUint16() Uint16x8 -// SaturateToUint16 converts element values to uint16. -// Conversion is done with saturation on the vector elements. +// SaturateToUint16 converts element values to uint16 with unsigned saturation. 
+// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVUSQW, CPU Feature: AVX512 func (x Uint64x4) SaturateToUint16() Uint16x8 -// SaturateToUint16 converts element values to uint16. -// Conversion is done with saturation on the vector elements. +// SaturateToUint16 converts element values to uint16 with unsigned saturation. // // Asm: VPMOVUSQW, CPU Feature: AVX512 func (x Uint64x8) SaturateToUint16() Uint16x8 /* SaturateToUint16Concat */ -// SaturateToUint16Concat converts element values to uint16. -// With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. -// Conversion is done with saturation on the vector elements. +// SaturateToUint16Concat converts element values to uint16 with unsigned saturation. +// The converted elements from x will be packed to the lower part of the result vector, +// the converted elements from y will be packed to the upper part of the result vector. // // Asm: VPACKUSDW, CPU Feature: AVX -func (x Uint32x4) SaturateToUint16Concat(y Uint32x4) Uint16x8 +func (x Int32x4) SaturateToUint16Concat(y Int32x4) Uint16x8 + +/* SaturateToUint16ConcatGrouped */ -// SaturateToUint16Concat converts element values to uint16. +// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation. // With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. -// Conversion is done with saturation on the vector elements. 
+// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKUSDW, CPU Feature: AVX2 -func (x Uint32x8) SaturateToUint16Concat(y Uint32x8) Uint16x16 +func (x Int32x8) SaturateToUint16ConcatGrouped(y Int32x8) Uint16x16 -// SaturateToUint16Concat converts element values to uint16. +// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation. // With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. -// Conversion is done with saturation on the vector elements. +// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKUSDW, CPU Feature: AVX512 -func (x Uint32x16) SaturateToUint16Concat(y Uint32x16) Uint16x32 +func (x Int32x16) SaturateToUint16ConcatGrouped(y Int32x16) Uint16x32 /* SaturateToUint32 */ -// SaturateToUint32 converts element values to uint32. -// Conversion is done with saturation on the vector elements. +// SaturateToUint32 converts element values to uint32 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVUSQD, CPU Feature: AVX512 func (x Uint64x2) SaturateToUint32() Uint32x4 -// SaturateToUint32 converts element values to uint32. -// Conversion is done with saturation on the vector elements. +// SaturateToUint32 converts element values to uint32 with unsigned saturation. // // Asm: VPMOVUSQD, CPU Feature: AVX512 func (x Uint64x4) SaturateToUint32() Uint32x4 -// SaturateToUint32 converts element values to uint32. 
-// Conversion is done with saturation on the vector elements. +// SaturateToUint32 converts element values to uint32 with unsigned saturation. // // Asm: VPMOVUSQD, CPU Feature: AVX512 func (x Uint64x8) SaturateToUint32() Uint32x8 /* Scale */ -// Scale multiplies elements by a power of 2. +// Scale multiplies each element of x by 2 raised to the power of the +// floor of the corresponding element in y. // // Asm: VSCALEFPS, CPU Feature: AVX512 func (x Float32x4) Scale(y Float32x4) Float32x4 -// Scale multiplies elements by a power of 2. +// Scale multiplies each element of x by 2 raised to the power of the +// floor of the corresponding element in y. // // Asm: VSCALEFPS, CPU Feature: AVX512 func (x Float32x8) Scale(y Float32x8) Float32x8 -// Scale multiplies elements by a power of 2. +// Scale multiplies each element of x by 2 raised to the power of the +// floor of the corresponding element in y. // // Asm: VSCALEFPS, CPU Feature: AVX512 func (x Float32x16) Scale(y Float32x16) Float32x16 -// Scale multiplies elements by a power of 2. +// Scale multiplies each element of x by 2 raised to the power of the +// floor of the corresponding element in y. // // Asm: VSCALEFPD, CPU Feature: AVX512 func (x Float64x2) Scale(y Float64x2) Float64x2 -// Scale multiplies elements by a power of 2. +// Scale multiplies each element of x by 2 raised to the power of the +// floor of the corresponding element in y. // // Asm: VSCALEFPD, CPU Feature: AVX512 func (x Float64x4) Scale(y Float64x4) Float64x4 -// Scale multiplies elements by a power of 2. +// Scale multiplies each element of x by 2 raised to the power of the +// floor of the corresponding element in y. // // Asm: VSCALEFPD, CPU Feature: AVX512 func (x Float64x8) Scale(y Float64x8) Float64x8 @@ -6131,236 +6124,236 @@ func (x Uint64x8) SetLo(y Uint64x4) Uint64x8 /* ShiftAllLeft */ -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. 
+// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLW, CPU Feature: AVX func (x Int16x8) ShiftAllLeft(y uint64) Int16x8 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLW, CPU Feature: AVX2 func (x Int16x16) ShiftAllLeft(y uint64) Int16x16 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLW, CPU Feature: AVX512 func (x Int16x32) ShiftAllLeft(y uint64) Int16x32 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLD, CPU Feature: AVX func (x Int32x4) ShiftAllLeft(y uint64) Int32x4 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLD, CPU Feature: AVX2 func (x Int32x8) ShiftAllLeft(y uint64) Int32x8 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLD, CPU Feature: AVX512 func (x Int32x16) ShiftAllLeft(y uint64) Int32x16 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLQ, CPU Feature: AVX func (x Int64x2) ShiftAllLeft(y uint64) Int64x2 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. 
// // Asm: VPSLLQ, CPU Feature: AVX2 func (x Int64x4) ShiftAllLeft(y uint64) Int64x4 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLQ, CPU Feature: AVX512 func (x Int64x8) ShiftAllLeft(y uint64) Int64x8 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLW, CPU Feature: AVX func (x Uint16x8) ShiftAllLeft(y uint64) Uint16x8 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLW, CPU Feature: AVX2 func (x Uint16x16) ShiftAllLeft(y uint64) Uint16x16 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLW, CPU Feature: AVX512 func (x Uint16x32) ShiftAllLeft(y uint64) Uint16x32 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLD, CPU Feature: AVX func (x Uint32x4) ShiftAllLeft(y uint64) Uint32x4 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLD, CPU Feature: AVX2 func (x Uint32x8) ShiftAllLeft(y uint64) Uint32x8 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLD, CPU Feature: AVX512 func (x Uint32x16) ShiftAllLeft(y uint64) Uint32x16 -// ShiftAllLeft shifts each element to the left by the specified number of bits. 
Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLQ, CPU Feature: AVX func (x Uint64x2) ShiftAllLeft(y uint64) Uint64x2 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLQ, CPU Feature: AVX2 func (x Uint64x4) ShiftAllLeft(y uint64) Uint64x4 -// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed. +// ShiftAllLeft shifts each element to the left by y bits. // // Asm: VPSLLQ, CPU Feature: AVX512 func (x Uint64x8) ShiftAllLeft(y uint64) Uint64x8 /* ShiftAllLeftConcat */ -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDW, CPU Feature: AVX512VBMI2 func (x Int16x8) ShiftAllLeftConcat(shift uint8, y Int16x8) Int16x8 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPSHLDW, CPU Feature: AVX512VBMI2 func (x Int16x16) ShiftAllLeftConcat(shift uint8, y Int16x16) Int16x16 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDW, CPU Feature: AVX512VBMI2 func (x Int16x32) ShiftAllLeftConcat(shift uint8, y Int16x32) Int16x32 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDD, CPU Feature: AVX512VBMI2 func (x Int32x4) ShiftAllLeftConcat(shift uint8, y Int32x4) Int32x4 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. 
// // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDD, CPU Feature: AVX512VBMI2 func (x Int32x8) ShiftAllLeftConcat(shift uint8, y Int32x8) Int32x8 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDD, CPU Feature: AVX512VBMI2 func (x Int32x16) ShiftAllLeftConcat(shift uint8, y Int32x16) Int32x16 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDQ, CPU Feature: AVX512VBMI2 func (x Int64x2) ShiftAllLeftConcat(shift uint8, y Int64x2) Int64x2 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. 
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDQ, CPU Feature: AVX512VBMI2 func (x Int64x4) ShiftAllLeftConcat(shift uint8, y Int64x4) Int64x4 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDQ, CPU Feature: AVX512VBMI2 func (x Int64x8) ShiftAllLeftConcat(shift uint8, y Int64x8) Int64x8 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPSHLDW, CPU Feature: AVX512VBMI2 func (x Uint16x8) ShiftAllLeftConcat(shift uint8, y Uint16x8) Uint16x8 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDW, CPU Feature: AVX512VBMI2 func (x Uint16x16) ShiftAllLeftConcat(shift uint8, y Uint16x16) Uint16x16 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDW, CPU Feature: AVX512VBMI2 func (x Uint16x32) ShiftAllLeftConcat(shift uint8, y Uint16x32) Uint16x32 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. 
// // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDD, CPU Feature: AVX512VBMI2 func (x Uint32x4) ShiftAllLeftConcat(shift uint8, y Uint32x4) Uint32x4 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDD, CPU Feature: AVX512VBMI2 func (x Uint32x8) ShiftAllLeftConcat(shift uint8, y Uint32x8) Uint32x8 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDD, CPU Feature: AVX512VBMI2 func (x Uint32x16) ShiftAllLeftConcat(shift uint8, y Uint32x16) Uint32x16 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. 
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDQ, CPU Feature: AVX512VBMI2 func (x Uint64x2) ShiftAllLeftConcat(shift uint8, y Uint64x2) Uint64x2 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHLDQ, CPU Feature: AVX512VBMI2 func (x Uint64x4) ShiftAllLeftConcat(shift uint8, y Uint64x4) Uint64x4 -// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. +// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // @@ -6369,236 +6362,236 @@ func (x Uint64x8) ShiftAllLeftConcat(shift uint8, y Uint64x8) Uint64x8 /* ShiftAllRight */ -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. 
+// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAW, CPU Feature: AVX func (x Int16x8) ShiftAllRight(y uint64) Int16x8 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAW, CPU Feature: AVX2 func (x Int16x16) ShiftAllRight(y uint64) Int16x16 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAW, CPU Feature: AVX512 func (x Int16x32) ShiftAllRight(y uint64) Int16x32 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAD, CPU Feature: AVX func (x Int32x4) ShiftAllRight(y uint64) Int32x4 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAD, CPU Feature: AVX2 func (x Int32x8) ShiftAllRight(y uint64) Int32x8 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAD, CPU Feature: AVX512 func (x Int32x16) ShiftAllRight(y uint64) Int32x16 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. 
// // Asm: VPSRAQ, CPU Feature: AVX512 func (x Int64x2) ShiftAllRight(y uint64) Int64x2 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAQ, CPU Feature: AVX512 func (x Int64x4) ShiftAllRight(y uint64) Int64x4 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit. +// ShiftAllRight performs a signed right shift on each element by y bits. // // Asm: VPSRAQ, CPU Feature: AVX512 func (x Int64x8) ShiftAllRight(y uint64) Int64x8 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLW, CPU Feature: AVX func (x Uint16x8) ShiftAllRight(y uint64) Uint16x8 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLW, CPU Feature: AVX2 func (x Uint16x16) ShiftAllRight(y uint64) Uint16x16 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLW, CPU Feature: AVX512 func (x Uint16x32) ShiftAllRight(y uint64) Uint16x32 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLD, CPU Feature: AVX func (x Uint32x4) ShiftAllRight(y uint64) Uint32x4 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. 
+// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLD, CPU Feature: AVX2 func (x Uint32x8) ShiftAllRight(y uint64) Uint32x8 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLD, CPU Feature: AVX512 func (x Uint32x16) ShiftAllRight(y uint64) Uint32x16 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLQ, CPU Feature: AVX func (x Uint64x2) ShiftAllRight(y uint64) Uint64x2 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLQ, CPU Feature: AVX2 func (x Uint64x4) ShiftAllRight(y uint64) Uint64x4 -// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed. +// ShiftAllRight performs an unsigned right shift on each element by y bits. // // Asm: VPSRLQ, CPU Feature: AVX512 func (x Uint64x8) ShiftAllRight(y uint64) Uint64x8 /* ShiftAllRightConcat */ -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPSHRDW, CPU Feature: AVX512VBMI2 func (x Int16x8) ShiftAllRightConcat(shift uint8, y Int16x8) Int16x8 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDW, CPU Feature: AVX512VBMI2 func (x Int16x16) ShiftAllRightConcat(shift uint8, y Int16x16) Int16x16 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDW, CPU Feature: AVX512VBMI2 func (x Int16x32) ShiftAllRightConcat(shift uint8, y Int16x32) Int16x32 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. 
// // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDD, CPU Feature: AVX512VBMI2 func (x Int32x4) ShiftAllRightConcat(shift uint8, y Int32x4) Int32x4 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDD, CPU Feature: AVX512VBMI2 func (x Int32x8) ShiftAllRightConcat(shift uint8, y Int32x8) Int32x8 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDD, CPU Feature: AVX512VBMI2 func (x Int32x16) ShiftAllRightConcat(shift uint8, y Int32x16) Int32x16 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. 
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDQ, CPU Feature: AVX512VBMI2 func (x Int64x2) ShiftAllRightConcat(shift uint8, y Int64x2) Int64x2 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDQ, CPU Feature: AVX512VBMI2 func (x Int64x4) ShiftAllRightConcat(shift uint8, y Int64x4) Int64x4 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPSHRDQ, CPU Feature: AVX512VBMI2 func (x Int64x8) ShiftAllRightConcat(shift uint8, y Int64x8) Int64x8 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDW, CPU Feature: AVX512VBMI2 func (x Uint16x8) ShiftAllRightConcat(shift uint8, y Uint16x8) Uint16x8 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDW, CPU Feature: AVX512VBMI2 func (x Uint16x16) ShiftAllRightConcat(shift uint8, y Uint16x16) Uint16x16 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. 
// // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDW, CPU Feature: AVX512VBMI2 func (x Uint16x32) ShiftAllRightConcat(shift uint8, y Uint16x32) Uint16x32 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDD, CPU Feature: AVX512VBMI2 func (x Uint32x4) ShiftAllRightConcat(shift uint8, y Uint32x4) Uint32x4 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDD, CPU Feature: AVX512VBMI2 func (x Uint32x8) ShiftAllRightConcat(shift uint8, y Uint32x8) Uint32x8 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. 
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDD, CPU Feature: AVX512VBMI2 func (x Uint32x16) ShiftAllRightConcat(shift uint8, y Uint32x16) Uint32x16 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // // Asm: VPSHRDQ, CPU Feature: AVX512VBMI2 func (x Uint64x2) ShiftAllRightConcat(shift uint8, y Uint64x2) Uint64x2 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
// // Asm: VPSHRDQ, CPU Feature: AVX512VBMI2 func (x Uint64x4) ShiftAllRightConcat(shift uint8, y Uint64x4) Uint64x4 -// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the -// immediate(only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. +// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by +// shift (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x. // // shift results in better performance when it's a constant, a non-constant value will be translated into a jump table. // @@ -6607,92 +6600,92 @@ func (x Uint64x8) ShiftAllRightConcat(shift uint8, y Uint64x8) Uint64x8 /* ShiftLeft */ -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVW, CPU Feature: AVX512 func (x Int16x8) ShiftLeft(y Int16x8) Int16x8 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVW, CPU Feature: AVX512 func (x Int16x16) ShiftLeft(y Int16x16) Int16x16 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVW, CPU Feature: AVX512 func (x Int16x32) ShiftLeft(y Int16x32) Int16x32 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. 
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVD, CPU Feature: AVX2 func (x Int32x4) ShiftLeft(y Int32x4) Int32x4 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVD, CPU Feature: AVX2 func (x Int32x8) ShiftLeft(y Int32x8) Int32x8 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVD, CPU Feature: AVX512 func (x Int32x16) ShiftLeft(y Int32x16) Int32x16 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVQ, CPU Feature: AVX2 func (x Int64x2) ShiftLeft(y Int64x2) Int64x2 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVQ, CPU Feature: AVX2 func (x Int64x4) ShiftLeft(y Int64x4) Int64x4 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. 
// // Asm: VPSLLVQ, CPU Feature: AVX512 func (x Int64x8) ShiftLeft(y Int64x8) Int64x8 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVW, CPU Feature: AVX512 func (x Uint16x8) ShiftLeft(y Uint16x8) Uint16x8 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVW, CPU Feature: AVX512 func (x Uint16x16) ShiftLeft(y Uint16x16) Uint16x16 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVW, CPU Feature: AVX512 func (x Uint16x32) ShiftLeft(y Uint16x32) Uint16x32 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVD, CPU Feature: AVX2 func (x Uint32x4) ShiftLeft(y Uint32x4) Uint32x4 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVD, CPU Feature: AVX2 func (x Uint32x8) ShiftLeft(y Uint32x8) Uint32x8 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. 
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVD, CPU Feature: AVX512 func (x Uint32x16) ShiftLeft(y Uint32x16) Uint32x16 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVQ, CPU Feature: AVX2 func (x Uint64x2) ShiftLeft(y Uint64x2) Uint64x2 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVQ, CPU Feature: AVX2 func (x Uint64x4) ShiftLeft(y Uint64x4) Uint64x4 -// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed. +// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. // // Asm: VPSLLVQ, CPU Feature: AVX512 func (x Uint64x8) ShiftLeft(y Uint64x8) Uint64x8 @@ -6700,201 +6693,201 @@ func (x Uint64x8) ShiftLeft(y Uint64x8) Uint64x8 /* ShiftLeftConcat */ // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. 
// // Asm: VPSHLDVW, CPU Feature: AVX512VBMI2 func (x Int16x8) ShiftLeftConcat(y Int16x8, z Int16x8) Int16x8 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVW, CPU Feature: AVX512VBMI2 func (x Int16x16) ShiftLeftConcat(y Int16x16, z Int16x16) Int16x16 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVW, CPU Feature: AVX512VBMI2 func (x Int16x32) ShiftLeftConcat(y Int16x32, z Int16x32) Int16x32 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVD, CPU Feature: AVX512VBMI2 func (x Int32x4) ShiftLeftConcat(y Int32x4, z Int32x4) Int32x4 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. 
// // Asm: VPSHLDVD, CPU Feature: AVX512VBMI2 func (x Int32x8) ShiftLeftConcat(y Int32x8, z Int32x8) Int32x8 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVD, CPU Feature: AVX512VBMI2 func (x Int32x16) ShiftLeftConcat(y Int32x16, z Int32x16) Int32x16 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2 func (x Int64x2) ShiftLeftConcat(y Int64x2, z Int64x2) Int64x2 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2 func (x Int64x4) ShiftLeftConcat(y Int64x4, z Int64x4) Int64x4 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. 
// // Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2 func (x Int64x8) ShiftLeftConcat(y Int64x8, z Int64x8) Int64x8 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVW, CPU Feature: AVX512VBMI2 func (x Uint16x8) ShiftLeftConcat(y Uint16x8, z Uint16x8) Uint16x8 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVW, CPU Feature: AVX512VBMI2 func (x Uint16x16) ShiftLeftConcat(y Uint16x16, z Uint16x16) Uint16x16 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVW, CPU Feature: AVX512VBMI2 func (x Uint16x32) ShiftLeftConcat(y Uint16x32, z Uint16x32) Uint16x32 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. 
// // Asm: VPSHLDVD, CPU Feature: AVX512VBMI2 func (x Uint32x4) ShiftLeftConcat(y Uint32x4, z Uint32x4) Uint32x4 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVD, CPU Feature: AVX512VBMI2 func (x Uint32x8) ShiftLeftConcat(y Uint32x8, z Uint32x8) Uint32x8 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVD, CPU Feature: AVX512VBMI2 func (x Uint32x16) ShiftLeftConcat(y Uint32x16, z Uint32x16) Uint32x16 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2 func (x Uint64x2) ShiftLeftConcat(y Uint64x2, z Uint64x2) Uint64x2 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. 
// // Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2 func (x Uint64x4) ShiftLeftConcat(y Uint64x4, z Uint64x4) Uint64x4 // ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x. // // Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2 func (x Uint64x8) ShiftLeftConcat(y Uint64x8, z Uint64x8) Uint64x8 /* ShiftRight */ -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. +// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVW, CPU Feature: AVX512 func (x Int16x8) ShiftRight(y Int16x8) Int16x8 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. +// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVW, CPU Feature: AVX512 func (x Int16x16) ShiftRight(y Int16x16) Int16x16 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. +// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVW, CPU Feature: AVX512 func (x Int16x32) ShiftRight(y Int16x32) Int16x32 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. 
+// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVD, CPU Feature: AVX2 func (x Int32x4) ShiftRight(y Int32x4) Int32x4 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. +// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVD, CPU Feature: AVX2 func (x Int32x8) ShiftRight(y Int32x8) Int32x8 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. +// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVD, CPU Feature: AVX512 func (x Int32x16) ShiftRight(y Int32x16) Int32x16 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. +// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVQ, CPU Feature: AVX512 func (x Int64x2) ShiftRight(y Int64x2) Int64x2 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. +// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVQ, CPU Feature: AVX512 func (x Int64x4) ShiftRight(y Int64x4) Int64x4 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit. 
+// ShiftRight performs a signed right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRAVQ, CPU Feature: AVX512 func (x Int64x8) ShiftRight(y Int64x8) Int64x8 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVW, CPU Feature: AVX512 func (x Uint16x8) ShiftRight(y Uint16x8) Uint16x8 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVW, CPU Feature: AVX512 func (x Uint16x16) ShiftRight(y Uint16x16) Uint16x16 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVW, CPU Feature: AVX512 func (x Uint16x32) ShiftRight(y Uint16x32) Uint16x32 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVD, CPU Feature: AVX2 func (x Uint32x4) ShiftRight(y Uint32x4) Uint32x4 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. 
// // Asm: VPSRLVD, CPU Feature: AVX2 func (x Uint32x8) ShiftRight(y Uint32x8) Uint32x8 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVD, CPU Feature: AVX512 func (x Uint32x16) ShiftRight(y Uint32x16) Uint32x16 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVQ, CPU Feature: AVX2 func (x Uint64x2) ShiftRight(y Uint64x2) Uint64x2 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVQ, CPU Feature: AVX2 func (x Uint64x4) ShiftRight(y Uint64x4) Uint64x4 -// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed. +// ShiftRight performs an unsigned right shift on each element in x by the number of bits specified in y's corresponding elements. // // Asm: VPSRLVQ, CPU Feature: AVX512 func (x Uint64x8) ShiftRight(y Uint64x8) Uint64x8 @@ -6902,109 +6895,109 @@ func (x Uint64x8) ShiftRight(y Uint64x8) Uint64x8 /* ShiftRightConcat */ // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. 
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVW, CPU Feature: AVX512VBMI2 func (x Int16x8) ShiftRightConcat(y Int16x8, z Int16x8) Int16x8 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVW, CPU Feature: AVX512VBMI2 func (x Int16x16) ShiftRightConcat(y Int16x16, z Int16x16) Int16x16 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVW, CPU Feature: AVX512VBMI2 func (x Int16x32) ShiftRightConcat(y Int16x32, z Int16x32) Int16x32 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVD, CPU Feature: AVX512VBMI2 func (x Int32x4) ShiftRightConcat(y Int32x4, z Int32x4) Int32x4 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. 
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVD, CPU Feature: AVX512VBMI2 func (x Int32x8) ShiftRightConcat(y Int32x8, z Int32x8) Int32x8 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVD, CPU Feature: AVX512VBMI2 func (x Int32x16) ShiftRightConcat(y Int32x16, z Int32x16) Int32x16 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2 func (x Int64x2) ShiftRightConcat(y Int64x2, z Int64x2) Int64x2 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2 func (x Int64x4) ShiftRightConcat(y Int64x4, z Int64x4) Int64x4 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. 
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2 func (x Int64x8) ShiftRightConcat(y Int64x8, z Int64x8) Int64x8 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVW, CPU Feature: AVX512VBMI2 func (x Uint16x8) ShiftRightConcat(y Uint16x8, z Uint16x8) Uint16x8 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVW, CPU Feature: AVX512VBMI2 func (x Uint16x16) ShiftRightConcat(y Uint16x16, z Uint16x16) Uint16x16 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. 
// // Asm: VPSHRDVW, CPU Feature: AVX512VBMI2 func (x Uint16x32) ShiftRightConcat(y Uint16x32, z Uint16x32) Uint16x32 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVD, CPU Feature: AVX512VBMI2 func (x Uint32x4) ShiftRightConcat(y Uint32x4, z Uint32x4) Uint32x4 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVD, CPU Feature: AVX512VBMI2 func (x Uint32x8) ShiftRightConcat(y Uint32x8, z Uint32x8) Uint32x8 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVD, CPU Feature: AVX512VBMI2 func (x Uint32x16) ShiftRightConcat(y Uint32x16, z Uint32x16) Uint32x16 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. 
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2 func (x Uint64x2) ShiftRightConcat(y Uint64x2, z Uint64x2) Uint64x2 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2 func (x Uint64x4) ShiftRightConcat(y Uint64x4, z Uint64x4) Uint64x4 // ShiftRightConcat shifts each element of x to the right by the number of bits specified by the -// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. +// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x. // // Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2 func (x Uint64x8) ShiftRightConcat(y Uint64x8, z Uint64x8) Uint64x8 @@ -7196,90 +7189,101 @@ func (x Uint64x8) Sub(y Uint64x8) Uint64x8 /* SubPairs */ // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VHSUBPS, CPU Feature: AVX func (x Float32x4) SubPairs(y Float32x4) Float32x4 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. 
-// -// Asm: VHSUBPS, CPU Feature: AVX -func (x Float32x8) SubPairs(y Float32x8) Float32x8 - -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1]. // // Asm: VHSUBPD, CPU Feature: AVX func (x Float64x2) SubPairs(y Float64x2) Float64x2 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VHSUBPD, CPU Feature: AVX -func (x Float64x4) SubPairs(y Float64x4) Float64x4 +// Asm: VPHSUBW, CPU Feature: AVX +func (x Int16x8) SubPairs(y Int16x8) Int16x8 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBW, CPU Feature: AVX -func (x Int16x8) SubPairs(y Int16x8) Int16x8 +// Asm: VPHSUBD, CPU Feature: AVX +func (x Int32x4) SubPairs(y Int32x4) Int32x4 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBW, CPU Feature: AVX2 -func (x Int16x16) SubPairs(y Int16x16) Int16x16 +// Asm: VPHSUBW, CPU Feature: AVX +func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] 
and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBD, CPU Feature: AVX -func (x Int32x4) SubPairs(y Int32x4) Int32x4 +func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +/* SubPairsGrouped */ + +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBD, CPU Feature: AVX2 -func (x Int32x8) SubPairs(y Int32x8) Int32x8 +// Asm: VHSUBPS, CPU Feature: AVX +func (x Float32x8) SubPairsGrouped(y Float32x8) Float32x8 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1]. // -// Asm: VPHSUBW, CPU Feature: AVX -func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8 +// Asm: VHSUBPD, CPU Feature: AVX +func (x Float64x4) SubPairsGrouped(y Float64x4) Float64x4 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. 
// // Asm: VPHSUBW, CPU Feature: AVX2 -func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16 +func (x Int16x16) SubPairsGrouped(y Int16x16) Int16x16 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBD, CPU Feature: AVX -func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4 +// Asm: VPHSUBD, CPU Feature: AVX2 +func (x Int32x8) SubPairsGrouped(y Int32x8) Int32x8 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. +// +// Asm: VPHSUBW, CPU Feature: AVX2 +func (x Uint16x16) SubPairsGrouped(y Uint16x16) Uint16x16 + +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBD, CPU Feature: AVX2 -func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8 +func (x Uint32x8) SubPairsGrouped(y Uint32x8) Uint32x8 /* SubPairsSaturated */ // SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. 
// // Asm: VPHSUBSW, CPU Feature: AVX func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8 -// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +/* SubPairsSaturatedGrouped */ + +// SubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBSW, CPU Feature: AVX2 -func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16 +func (x Int16x16) SubPairsSaturatedGrouped(y Int16x16) Int16x16 /* SubSaturated */ @@ -7478,244 +7482,212 @@ func (x Float64x8) TruncScaledResidue(prec uint8) Float64x8 /* TruncateToInt8 */ -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVWB, CPU Feature: AVX512 func (x Int16x8) TruncateToInt8() Int8x16 -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. // // Asm: VPMOVWB, CPU Feature: AVX512 func (x Int16x16) TruncateToInt8() Int8x16 -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. +// TruncateToInt8 truncates element values to int8. // // Asm: VPMOVWB, CPU Feature: AVX512 func (x Int16x32) TruncateToInt8() Int8x32 -// TruncateToInt8 converts element values to int8. 
-// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVDB, CPU Feature: AVX512 func (x Int32x4) TruncateToInt8() Int8x16 -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVDB, CPU Feature: AVX512 func (x Int32x8) TruncateToInt8() Int8x16 -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. // // Asm: VPMOVDB, CPU Feature: AVX512 func (x Int32x16) TruncateToInt8() Int8x16 -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQB, CPU Feature: AVX512 func (x Int64x2) TruncateToInt8() Int8x16 -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. 
// // Asm: VPMOVQB, CPU Feature: AVX512 func (x Int64x4) TruncateToInt8() Int8x16 -// TruncateToInt8 converts element values to int8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToInt8 truncates element values to int8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQB, CPU Feature: AVX512 func (x Int64x8) TruncateToInt8() Int8x16 /* TruncateToInt16 */ -// TruncateToInt16 converts element values to int16. -// Conversion is done with truncation on the vector elements. +// TruncateToInt16 truncates element values to int16. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVDW, CPU Feature: AVX512 func (x Int32x4) TruncateToInt16() Int16x8 -// TruncateToInt16 converts element values to int16. -// Conversion is done with truncation on the vector elements. +// TruncateToInt16 truncates element values to int16. // // Asm: VPMOVDW, CPU Feature: AVX512 func (x Int32x8) TruncateToInt16() Int16x8 -// TruncateToInt16 converts element values to int16. -// Conversion is done with truncation on the vector elements. +// TruncateToInt16 truncates element values to int16. // // Asm: VPMOVDW, CPU Feature: AVX512 func (x Int32x16) TruncateToInt16() Int16x16 -// TruncateToInt16 converts element values to int16. -// Conversion is done with truncation on the vector elements. +// TruncateToInt16 truncates element values to int16. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQW, CPU Feature: AVX512 func (x Int64x2) TruncateToInt16() Int16x8 -// TruncateToInt16 converts element values to int16. -// Conversion is done with truncation on the vector elements. +// TruncateToInt16 truncates element values to int16. 
+// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQW, CPU Feature: AVX512 func (x Int64x4) TruncateToInt16() Int16x8 -// TruncateToInt16 converts element values to int16. -// Conversion is done with truncation on the vector elements. +// TruncateToInt16 truncates element values to int16. // // Asm: VPMOVQW, CPU Feature: AVX512 func (x Int64x8) TruncateToInt16() Int16x8 /* TruncateToInt32 */ -// TruncateToInt32 converts element values to int32. -// Conversion is done with truncation on the vector elements. +// TruncateToInt32 truncates element values to int32. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQD, CPU Feature: AVX512 func (x Int64x2) TruncateToInt32() Int32x4 -// TruncateToInt32 converts element values to int32. -// Conversion is done with truncation on the vector elements. +// TruncateToInt32 truncates element values to int32. // // Asm: VPMOVQD, CPU Feature: AVX512 func (x Int64x4) TruncateToInt32() Int32x4 -// TruncateToInt32 converts element values to int32. -// Conversion is done with truncation on the vector elements. +// TruncateToInt32 truncates element values to int32. // // Asm: VPMOVQD, CPU Feature: AVX512 func (x Int64x8) TruncateToInt32() Int32x8 /* TruncateToUint8 */ -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToUint8 truncates element values to uint8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVWB, CPU Feature: AVX512 func (x Uint16x8) TruncateToUint8() Uint8x16 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. 
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToUint8 truncates element values to uint8. // // Asm: VPMOVWB, CPU Feature: AVX512 func (x Uint16x16) TruncateToUint8() Uint8x16 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. +// TruncateToUint8 truncates element values to uint8. // // Asm: VPMOVWB, CPU Feature: AVX512 func (x Uint16x32) TruncateToUint8() Uint8x32 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToUint8 truncates element values to uint8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVDB, CPU Feature: AVX512 func (x Uint32x4) TruncateToUint8() Uint8x16 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToUint8 truncates element values to uint8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVDB, CPU Feature: AVX512 func (x Uint32x8) TruncateToUint8() Uint8x16 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToUint8 truncates element values to uint8. // // Asm: VPMOVDB, CPU Feature: AVX512 func (x Uint32x16) TruncateToUint8() Uint8x16 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. 
+// TruncateToUint8 truncates element values to uint8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQB, CPU Feature: AVX512 func (x Uint64x2) TruncateToUint8() Uint8x16 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToUint8 truncates element values to uint8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQB, CPU Feature: AVX512 func (x Uint64x4) TruncateToUint8() Uint8x16 -// TruncateToUint8 converts element values to uint8. -// Conversion is done with truncation on the vector elements. -// Results are packed to low elements in the returned vector, its upper elements are zero-cleared. +// TruncateToUint8 truncates element values to uint8. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQB, CPU Feature: AVX512 func (x Uint64x8) TruncateToUint8() Uint8x16 /* TruncateToUint16 */ -// TruncateToUint16 converts element values to uint16. -// Conversion is done with truncation on the vector elements. +// TruncateToUint16 truncates element values to uint16. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVDW, CPU Feature: AVX512 func (x Uint32x4) TruncateToUint16() Uint16x8 -// TruncateToUint16 converts element values to uint16. -// Conversion is done with truncation on the vector elements. +// TruncateToUint16 truncates element values to uint16. // // Asm: VPMOVDW, CPU Feature: AVX512 func (x Uint32x8) TruncateToUint16() Uint16x8 -// TruncateToUint16 converts element values to uint16. -// Conversion is done with truncation on the vector elements. +// TruncateToUint16 truncates element values to uint16. 
// // Asm: VPMOVDW, CPU Feature: AVX512 func (x Uint32x16) TruncateToUint16() Uint16x16 -// TruncateToUint16 converts element values to uint16. -// Conversion is done with truncation on the vector elements. +// TruncateToUint16 truncates element values to uint16. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQW, CPU Feature: AVX512 func (x Uint64x2) TruncateToUint16() Uint16x8 -// TruncateToUint16 converts element values to uint16. -// Conversion is done with truncation on the vector elements. +// TruncateToUint16 truncates element values to uint16. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQW, CPU Feature: AVX512 func (x Uint64x4) TruncateToUint16() Uint16x8 -// TruncateToUint16 converts element values to uint16. -// Conversion is done with truncation on the vector elements. +// TruncateToUint16 truncates element values to uint16. // // Asm: VPMOVQW, CPU Feature: AVX512 func (x Uint64x8) TruncateToUint16() Uint16x8 /* TruncateToUint32 */ -// TruncateToUint32 converts element values to uint32. -// Conversion is done with truncation on the vector elements. +// TruncateToUint32 truncates element values to uint32. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // // Asm: VPMOVQD, CPU Feature: AVX512 func (x Uint64x2) TruncateToUint32() Uint32x4 -// TruncateToUint32 converts element values to uint32. -// Conversion is done with truncation on the vector elements. +// TruncateToUint32 truncates element values to uint32. // // Asm: VPMOVQD, CPU Feature: AVX512 func (x Uint64x4) TruncateToUint32() Uint32x4 -// TruncateToUint32 converts element values to uint32. -// Conversion is done with truncation on the vector elements. +// TruncateToUint32 truncates element values to uint32. 
// // Asm: VPMOVQD, CPU Feature: AVX512 func (x Uint64x8) TruncateToUint32() Uint32x8 @@ -7842,930 +7814,930 @@ func (x Uint64x4) Xor(y Uint64x4) Uint64x4 // Asm: VPXORQ, CPU Feature: AVX512 func (x Uint64x8) Xor(y Uint64x8) Uint64x8 -// Float64x2 converts from Float32x4 to Float64x2 -func (from Float32x4) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Float32x4) AsFloat64x2() Float64x2 -// Int8x16 converts from Float32x4 to Int8x16 -func (from Float32x4) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Float32x4) AsInt8x16() Int8x16 -// Int16x8 converts from Float32x4 to Int16x8 -func (from Float32x4) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Float32x4) AsInt16x8() Int16x8 -// Int32x4 converts from Float32x4 to Int32x4 -func (from Float32x4) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Float32x4) AsInt32x4() Int32x4 -// Int64x2 converts from Float32x4 to Int64x2 -func (from Float32x4) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Float32x4) AsInt64x2() Int64x2 -// Uint8x16 converts from Float32x4 to Uint8x16 -func (from Float32x4) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Float32x4) AsUint8x16() Uint8x16 -// Uint16x8 converts from Float32x4 to Uint16x8 -func (from Float32x4) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. +func (x Float32x4) AsUint16x8() Uint16x8 -// Uint32x4 converts from Float32x4 to Uint32x4 -func (from Float32x4) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Float32x4) AsUint32x4() Uint32x4 -// Uint64x2 converts from Float32x4 to Uint64x2 -func (from Float32x4) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Float32x4) AsUint64x2() Uint64x2 -// Float64x4 converts from Float32x8 to Float64x4 -func (from Float32x8) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Float32x8) AsFloat64x4() Float64x4 -// Int8x32 converts from Float32x8 to Int8x32 -func (from Float32x8) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Float32x8) AsInt8x32() Int8x32 -// Int16x16 converts from Float32x8 to Int16x16 -func (from Float32x8) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Float32x8) AsInt16x16() Int16x16 -// Int32x8 converts from Float32x8 to Int32x8 -func (from Float32x8) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Float32x8) AsInt32x8() Int32x8 -// Int64x4 converts from Float32x8 to Int64x4 -func (from Float32x8) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Float32x8) AsInt64x4() Int64x4 -// Uint8x32 converts from Float32x8 to Uint8x32 -func (from Float32x8) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Float32x8) AsUint8x32() Uint8x32 -// Uint16x16 converts from Float32x8 to Uint16x16 -func (from Float32x8) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. +func (x Float32x8) AsUint16x16() Uint16x16 -// Uint32x8 converts from Float32x8 to Uint32x8 -func (from Float32x8) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Float32x8) AsUint32x8() Uint32x8 -// Uint64x4 converts from Float32x8 to Uint64x4 -func (from Float32x8) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Float32x8) AsUint64x4() Uint64x4 -// Float64x8 converts from Float32x16 to Float64x8 -func (from Float32x16) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Float32x16) AsFloat64x8() Float64x8 -// Int8x64 converts from Float32x16 to Int8x64 -func (from Float32x16) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Float32x16) AsInt8x64() Int8x64 -// Int16x32 converts from Float32x16 to Int16x32 -func (from Float32x16) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Float32x16) AsInt16x32() Int16x32 -// Int32x16 converts from Float32x16 to Int32x16 -func (from Float32x16) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Float32x16) AsInt32x16() Int32x16 -// Int64x8 converts from Float32x16 to Int64x8 -func (from Float32x16) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Float32x16) AsInt64x8() Int64x8 -// Uint8x64 converts from Float32x16 to Uint8x64 -func (from Float32x16) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Float32x16) AsUint8x64() Uint8x64 -// Uint16x32 converts from Float32x16 to Uint16x32 -func (from Float32x16) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. +func (x Float32x16) AsUint16x32() Uint16x32 -// Uint32x16 converts from Float32x16 to Uint32x16 -func (from Float32x16) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Float32x16) AsUint32x16() Uint32x16 -// Uint64x8 converts from Float32x16 to Uint64x8 -func (from Float32x16) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Float32x16) AsUint64x8() Uint64x8 -// Float32x4 converts from Float64x2 to Float32x4 -func (from Float64x2) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Float64x2) AsFloat32x4() Float32x4 -// Int8x16 converts from Float64x2 to Int8x16 -func (from Float64x2) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Float64x2) AsInt8x16() Int8x16 -// Int16x8 converts from Float64x2 to Int16x8 -func (from Float64x2) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Float64x2) AsInt16x8() Int16x8 -// Int32x4 converts from Float64x2 to Int32x4 -func (from Float64x2) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Float64x2) AsInt32x4() Int32x4 -// Int64x2 converts from Float64x2 to Int64x2 -func (from Float64x2) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Float64x2) AsInt64x2() Int64x2 -// Uint8x16 converts from Float64x2 to Uint8x16 -func (from Float64x2) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Float64x2) AsUint8x16() Uint8x16 -// Uint16x8 converts from Float64x2 to Uint16x8 -func (from Float64x2) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. +func (x Float64x2) AsUint16x8() Uint16x8 -// Uint32x4 converts from Float64x2 to Uint32x4 -func (from Float64x2) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Float64x2) AsUint32x4() Uint32x4 -// Uint64x2 converts from Float64x2 to Uint64x2 -func (from Float64x2) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Float64x2) AsUint64x2() Uint64x2 -// Float32x8 converts from Float64x4 to Float32x8 -func (from Float64x4) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Float64x4) AsFloat32x8() Float32x8 -// Int8x32 converts from Float64x4 to Int8x32 -func (from Float64x4) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Float64x4) AsInt8x32() Int8x32 -// Int16x16 converts from Float64x4 to Int16x16 -func (from Float64x4) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Float64x4) AsInt16x16() Int16x16 -// Int32x8 converts from Float64x4 to Int32x8 -func (from Float64x4) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Float64x4) AsInt32x8() Int32x8 -// Int64x4 converts from Float64x4 to Int64x4 -func (from Float64x4) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Float64x4) AsInt64x4() Int64x4 -// Uint8x32 converts from Float64x4 to Uint8x32 -func (from Float64x4) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Float64x4) AsUint8x32() Uint8x32 -// Uint16x16 converts from Float64x4 to Uint16x16 -func (from Float64x4) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. +func (x Float64x4) AsUint16x16() Uint16x16 -// Uint32x8 converts from Float64x4 to Uint32x8 -func (from Float64x4) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Float64x4) AsUint32x8() Uint32x8 -// Uint64x4 converts from Float64x4 to Uint64x4 -func (from Float64x4) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Float64x4) AsUint64x4() Uint64x4 -// Float32x16 converts from Float64x8 to Float32x16 -func (from Float64x8) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Float64x8) AsFloat32x16() Float32x16 -// Int8x64 converts from Float64x8 to Int8x64 -func (from Float64x8) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Float64x8) AsInt8x64() Int8x64 -// Int16x32 converts from Float64x8 to Int16x32 -func (from Float64x8) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Float64x8) AsInt16x32() Int16x32 -// Int32x16 converts from Float64x8 to Int32x16 -func (from Float64x8) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Float64x8) AsInt32x16() Int32x16 -// Int64x8 converts from Float64x8 to Int64x8 -func (from Float64x8) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Float64x8) AsInt64x8() Int64x8 -// Uint8x64 converts from Float64x8 to Uint8x64 -func (from Float64x8) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Float64x8) AsUint8x64() Uint8x64 -// Uint16x32 converts from Float64x8 to Uint16x32 -func (from Float64x8) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. +func (x Float64x8) AsUint16x32() Uint16x32 -// Uint32x16 converts from Float64x8 to Uint32x16 -func (from Float64x8) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Float64x8) AsUint32x16() Uint32x16 -// Uint64x8 converts from Float64x8 to Uint64x8 -func (from Float64x8) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Float64x8) AsUint64x8() Uint64x8 -// Float32x4 converts from Int8x16 to Float32x4 -func (from Int8x16) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Int8x16) AsFloat32x4() Float32x4 -// Float64x2 converts from Int8x16 to Float64x2 -func (from Int8x16) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Int8x16) AsFloat64x2() Float64x2 -// Int16x8 converts from Int8x16 to Int16x8 -func (from Int8x16) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Int8x16) AsInt16x8() Int16x8 -// Int32x4 converts from Int8x16 to Int32x4 -func (from Int8x16) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Int8x16) AsInt32x4() Int32x4 -// Int64x2 converts from Int8x16 to Int64x2 -func (from Int8x16) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Int8x16) AsInt64x2() Int64x2 -// Uint8x16 converts from Int8x16 to Uint8x16 -func (from Int8x16) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Int8x16) AsUint8x16() Uint8x16 -// Uint16x8 converts from Int8x16 to Uint16x8 -func (from Int8x16) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. +func (x Int8x16) AsUint16x8() Uint16x8 -// Uint32x4 converts from Int8x16 to Uint32x4 -func (from Int8x16) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Int8x16) AsUint32x4() Uint32x4 -// Uint64x2 converts from Int8x16 to Uint64x2 -func (from Int8x16) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Int8x16) AsUint64x2() Uint64x2 -// Float32x8 converts from Int8x32 to Float32x8 -func (from Int8x32) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Int8x32) AsFloat32x8() Float32x8 -// Float64x4 converts from Int8x32 to Float64x4 -func (from Int8x32) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Int8x32) AsFloat64x4() Float64x4 -// Int16x16 converts from Int8x32 to Int16x16 -func (from Int8x32) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Int8x32) AsInt16x16() Int16x16 -// Int32x8 converts from Int8x32 to Int32x8 -func (from Int8x32) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Int8x32) AsInt32x8() Int32x8 -// Int64x4 converts from Int8x32 to Int64x4 -func (from Int8x32) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Int8x32) AsInt64x4() Int64x4 -// Uint8x32 converts from Int8x32 to Uint8x32 -func (from Int8x32) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Int8x32) AsUint8x32() Uint8x32 -// Uint16x16 converts from Int8x32 to Uint16x16 -func (from Int8x32) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. +func (x Int8x32) AsUint16x16() Uint16x16 -// Uint32x8 converts from Int8x32 to Uint32x8 -func (from Int8x32) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Int8x32) AsUint32x8() Uint32x8 -// Uint64x4 converts from Int8x32 to Uint64x4 -func (from Int8x32) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Int8x32) AsUint64x4() Uint64x4 -// Float32x16 converts from Int8x64 to Float32x16 -func (from Int8x64) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Int8x64) AsFloat32x16() Float32x16 -// Float64x8 converts from Int8x64 to Float64x8 -func (from Int8x64) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Int8x64) AsFloat64x8() Float64x8 -// Int16x32 converts from Int8x64 to Int16x32 -func (from Int8x64) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Int8x64) AsInt16x32() Int16x32 -// Int32x16 converts from Int8x64 to Int32x16 -func (from Int8x64) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Int8x64) AsInt32x16() Int32x16 -// Int64x8 converts from Int8x64 to Int64x8 -func (from Int8x64) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Int8x64) AsInt64x8() Int64x8 -// Uint8x64 converts from Int8x64 to Uint8x64 -func (from Int8x64) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Int8x64) AsUint8x64() Uint8x64 -// Uint16x32 converts from Int8x64 to Uint16x32 -func (from Int8x64) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. +func (x Int8x64) AsUint16x32() Uint16x32 -// Uint32x16 converts from Int8x64 to Uint32x16 -func (from Int8x64) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Int8x64) AsUint32x16() Uint32x16 -// Uint64x8 converts from Int8x64 to Uint64x8 -func (from Int8x64) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Int8x64) AsUint64x8() Uint64x8 -// Float32x4 converts from Int16x8 to Float32x4 -func (from Int16x8) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Int16x8) AsFloat32x4() Float32x4 -// Float64x2 converts from Int16x8 to Float64x2 -func (from Int16x8) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Int16x8) AsFloat64x2() Float64x2 -// Int8x16 converts from Int16x8 to Int8x16 -func (from Int16x8) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Int16x8) AsInt8x16() Int8x16 -// Int32x4 converts from Int16x8 to Int32x4 -func (from Int16x8) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Int16x8) AsInt32x4() Int32x4 -// Int64x2 converts from Int16x8 to Int64x2 -func (from Int16x8) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Int16x8) AsInt64x2() Int64x2 -// Uint8x16 converts from Int16x8 to Uint8x16 -func (from Int16x8) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Int16x8) AsUint8x16() Uint8x16 -// Uint16x8 converts from Int16x8 to Uint16x8 -func (from Int16x8) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. +func (x Int16x8) AsUint16x8() Uint16x8 -// Uint32x4 converts from Int16x8 to Uint32x4 -func (from Int16x8) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Int16x8) AsUint32x4() Uint32x4 -// Uint64x2 converts from Int16x8 to Uint64x2 -func (from Int16x8) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Int16x8) AsUint64x2() Uint64x2 -// Float32x8 converts from Int16x16 to Float32x8 -func (from Int16x16) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Int16x16) AsFloat32x8() Float32x8 -// Float64x4 converts from Int16x16 to Float64x4 -func (from Int16x16) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Int16x16) AsFloat64x4() Float64x4 -// Int8x32 converts from Int16x16 to Int8x32 -func (from Int16x16) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Int16x16) AsInt8x32() Int8x32 -// Int32x8 converts from Int16x16 to Int32x8 -func (from Int16x16) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Int16x16) AsInt32x8() Int32x8 -// Int64x4 converts from Int16x16 to Int64x4 -func (from Int16x16) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Int16x16) AsInt64x4() Int64x4 -// Uint8x32 converts from Int16x16 to Uint8x32 -func (from Int16x16) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Int16x16) AsUint8x32() Uint8x32 -// Uint16x16 converts from Int16x16 to Uint16x16 -func (from Int16x16) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. +func (x Int16x16) AsUint16x16() Uint16x16 -// Uint32x8 converts from Int16x16 to Uint32x8 -func (from Int16x16) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Int16x16) AsUint32x8() Uint32x8 -// Uint64x4 converts from Int16x16 to Uint64x4 -func (from Int16x16) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Int16x16) AsUint64x4() Uint64x4 -// Float32x16 converts from Int16x32 to Float32x16 -func (from Int16x32) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Int16x32) AsFloat32x16() Float32x16 -// Float64x8 converts from Int16x32 to Float64x8 -func (from Int16x32) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Int16x32) AsFloat64x8() Float64x8 -// Int8x64 converts from Int16x32 to Int8x64 -func (from Int16x32) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Int16x32) AsInt8x64() Int8x64 -// Int32x16 converts from Int16x32 to Int32x16 -func (from Int16x32) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Int16x32) AsInt32x16() Int32x16 -// Int64x8 converts from Int16x32 to Int64x8 -func (from Int16x32) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Int16x32) AsInt64x8() Int64x8 -// Uint8x64 converts from Int16x32 to Uint8x64 -func (from Int16x32) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Int16x32) AsUint8x64() Uint8x64 -// Uint16x32 converts from Int16x32 to Uint16x32 -func (from Int16x32) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. +func (x Int16x32) AsUint16x32() Uint16x32 -// Uint32x16 converts from Int16x32 to Uint32x16 -func (from Int16x32) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Int16x32) AsUint32x16() Uint32x16 -// Uint64x8 converts from Int16x32 to Uint64x8 -func (from Int16x32) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Int16x32) AsUint64x8() Uint64x8 -// Float32x4 converts from Int32x4 to Float32x4 -func (from Int32x4) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Int32x4) AsFloat32x4() Float32x4 -// Float64x2 converts from Int32x4 to Float64x2 -func (from Int32x4) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Int32x4) AsFloat64x2() Float64x2 -// Int8x16 converts from Int32x4 to Int8x16 -func (from Int32x4) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Int32x4) AsInt8x16() Int8x16 -// Int16x8 converts from Int32x4 to Int16x8 -func (from Int32x4) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Int32x4) AsInt16x8() Int16x8 -// Int64x2 converts from Int32x4 to Int64x2 -func (from Int32x4) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Int32x4) AsInt64x2() Int64x2 -// Uint8x16 converts from Int32x4 to Uint8x16 -func (from Int32x4) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Int32x4) AsUint8x16() Uint8x16 -// Uint16x8 converts from Int32x4 to Uint16x8 -func (from Int32x4) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. +func (x Int32x4) AsUint16x8() Uint16x8 -// Uint32x4 converts from Int32x4 to Uint32x4 -func (from Int32x4) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Int32x4) AsUint32x4() Uint32x4 -// Uint64x2 converts from Int32x4 to Uint64x2 -func (from Int32x4) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Int32x4) AsUint64x2() Uint64x2 -// Float32x8 converts from Int32x8 to Float32x8 -func (from Int32x8) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Int32x8) AsFloat32x8() Float32x8 -// Float64x4 converts from Int32x8 to Float64x4 -func (from Int32x8) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Int32x8) AsFloat64x4() Float64x4 -// Int8x32 converts from Int32x8 to Int8x32 -func (from Int32x8) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Int32x8) AsInt8x32() Int8x32 -// Int16x16 converts from Int32x8 to Int16x16 -func (from Int32x8) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Int32x8) AsInt16x16() Int16x16 -// Int64x4 converts from Int32x8 to Int64x4 -func (from Int32x8) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Int32x8) AsInt64x4() Int64x4 -// Uint8x32 converts from Int32x8 to Uint8x32 -func (from Int32x8) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Int32x8) AsUint8x32() Uint8x32 -// Uint16x16 converts from Int32x8 to Uint16x16 -func (from Int32x8) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. +func (x Int32x8) AsUint16x16() Uint16x16 -// Uint32x8 converts from Int32x8 to Uint32x8 -func (from Int32x8) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Int32x8) AsUint32x8() Uint32x8 -// Uint64x4 converts from Int32x8 to Uint64x4 -func (from Int32x8) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Int32x8) AsUint64x4() Uint64x4 -// Float32x16 converts from Int32x16 to Float32x16 -func (from Int32x16) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Int32x16) AsFloat32x16() Float32x16 -// Float64x8 converts from Int32x16 to Float64x8 -func (from Int32x16) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Int32x16) AsFloat64x8() Float64x8 -// Int8x64 converts from Int32x16 to Int8x64 -func (from Int32x16) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Int32x16) AsInt8x64() Int8x64 -// Int16x32 converts from Int32x16 to Int16x32 -func (from Int32x16) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Int32x16) AsInt16x32() Int16x32 -// Int64x8 converts from Int32x16 to Int64x8 -func (from Int32x16) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Int32x16) AsInt64x8() Int64x8 -// Uint8x64 converts from Int32x16 to Uint8x64 -func (from Int32x16) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Int32x16) AsUint8x64() Uint8x64 -// Uint16x32 converts from Int32x16 to Uint16x32 -func (from Int32x16) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. +func (x Int32x16) AsUint16x32() Uint16x32 -// Uint32x16 converts from Int32x16 to Uint32x16 -func (from Int32x16) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Int32x16) AsUint32x16() Uint32x16 -// Uint64x8 converts from Int32x16 to Uint64x8 -func (from Int32x16) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Int32x16) AsUint64x8() Uint64x8 -// Float32x4 converts from Int64x2 to Float32x4 -func (from Int64x2) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Int64x2) AsFloat32x4() Float32x4 -// Float64x2 converts from Int64x2 to Float64x2 -func (from Int64x2) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Int64x2) AsFloat64x2() Float64x2 -// Int8x16 converts from Int64x2 to Int8x16 -func (from Int64x2) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Int64x2) AsInt8x16() Int8x16 -// Int16x8 converts from Int64x2 to Int16x8 -func (from Int64x2) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Int64x2) AsInt16x8() Int16x8 -// Int32x4 converts from Int64x2 to Int32x4 -func (from Int64x2) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Int64x2) AsInt32x4() Int32x4 -// Uint8x16 converts from Int64x2 to Uint8x16 -func (from Int64x2) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Int64x2) AsUint8x16() Uint8x16 -// Uint16x8 converts from Int64x2 to Uint16x8 -func (from Int64x2) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. +func (x Int64x2) AsUint16x8() Uint16x8 -// Uint32x4 converts from Int64x2 to Uint32x4 -func (from Int64x2) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Int64x2) AsUint32x4() Uint32x4 -// Uint64x2 converts from Int64x2 to Uint64x2 -func (from Int64x2) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Int64x2) AsUint64x2() Uint64x2 -// Float32x8 converts from Int64x4 to Float32x8 -func (from Int64x4) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Int64x4) AsFloat32x8() Float32x8 -// Float64x4 converts from Int64x4 to Float64x4 -func (from Int64x4) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Int64x4) AsFloat64x4() Float64x4 -// Int8x32 converts from Int64x4 to Int8x32 -func (from Int64x4) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Int64x4) AsInt8x32() Int8x32 -// Int16x16 converts from Int64x4 to Int16x16 -func (from Int64x4) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Int64x4) AsInt16x16() Int16x16 -// Int32x8 converts from Int64x4 to Int32x8 -func (from Int64x4) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Int64x4) AsInt32x8() Int32x8 -// Uint8x32 converts from Int64x4 to Uint8x32 -func (from Int64x4) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Int64x4) AsUint8x32() Uint8x32 -// Uint16x16 converts from Int64x4 to Uint16x16 -func (from Int64x4) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. +func (x Int64x4) AsUint16x16() Uint16x16 -// Uint32x8 converts from Int64x4 to Uint32x8 -func (from Int64x4) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Int64x4) AsUint32x8() Uint32x8 -// Uint64x4 converts from Int64x4 to Uint64x4 -func (from Int64x4) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Int64x4) AsUint64x4() Uint64x4 -// Float32x16 converts from Int64x8 to Float32x16 -func (from Int64x8) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Int64x8) AsFloat32x16() Float32x16 -// Float64x8 converts from Int64x8 to Float64x8 -func (from Int64x8) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Int64x8) AsFloat64x8() Float64x8 -// Int8x64 converts from Int64x8 to Int8x64 -func (from Int64x8) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Int64x8) AsInt8x64() Int8x64 -// Int16x32 converts from Int64x8 to Int16x32 -func (from Int64x8) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Int64x8) AsInt16x32() Int16x32 -// Int32x16 converts from Int64x8 to Int32x16 -func (from Int64x8) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Int64x8) AsInt32x16() Int32x16 -// Uint8x64 converts from Int64x8 to Uint8x64 -func (from Int64x8) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Int64x8) AsUint8x64() Uint8x64 -// Uint16x32 converts from Int64x8 to Uint16x32 -func (from Int64x8) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. +func (x Int64x8) AsUint16x32() Uint16x32 -// Uint32x16 converts from Int64x8 to Uint32x16 -func (from Int64x8) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Int64x8) AsUint32x16() Uint32x16 -// Uint64x8 converts from Int64x8 to Uint64x8 -func (from Int64x8) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Int64x8) AsUint64x8() Uint64x8 -// Float32x4 converts from Uint8x16 to Float32x4 -func (from Uint8x16) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Uint8x16) AsFloat32x4() Float32x4 -// Float64x2 converts from Uint8x16 to Float64x2 -func (from Uint8x16) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Uint8x16) AsFloat64x2() Float64x2 -// Int8x16 converts from Uint8x16 to Int8x16 -func (from Uint8x16) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Uint8x16) AsInt8x16() Int8x16 -// Int16x8 converts from Uint8x16 to Int16x8 -func (from Uint8x16) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Uint8x16) AsInt16x8() Int16x8 -// Int32x4 converts from Uint8x16 to Int32x4 -func (from Uint8x16) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Uint8x16) AsInt32x4() Int32x4 -// Int64x2 converts from Uint8x16 to Int64x2 -func (from Uint8x16) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Uint8x16) AsInt64x2() Int64x2 -// Uint16x8 converts from Uint8x16 to Uint16x8 -func (from Uint8x16) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. +func (x Uint8x16) AsUint16x8() Uint16x8 -// Uint32x4 converts from Uint8x16 to Uint32x4 -func (from Uint8x16) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Uint8x16) AsUint32x4() Uint32x4 -// Uint64x2 converts from Uint8x16 to Uint64x2 -func (from Uint8x16) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Uint8x16) AsUint64x2() Uint64x2 -// Float32x8 converts from Uint8x32 to Float32x8 -func (from Uint8x32) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Uint8x32) AsFloat32x8() Float32x8 -// Float64x4 converts from Uint8x32 to Float64x4 -func (from Uint8x32) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Uint8x32) AsFloat64x4() Float64x4 -// Int8x32 converts from Uint8x32 to Int8x32 -func (from Uint8x32) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Uint8x32) AsInt8x32() Int8x32 -// Int16x16 converts from Uint8x32 to Int16x16 -func (from Uint8x32) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Uint8x32) AsInt16x16() Int16x16 -// Int32x8 converts from Uint8x32 to Int32x8 -func (from Uint8x32) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Uint8x32) AsInt32x8() Int32x8 -// Int64x4 converts from Uint8x32 to Int64x4 -func (from Uint8x32) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Uint8x32) AsInt64x4() Int64x4 -// Uint16x16 converts from Uint8x32 to Uint16x16 -func (from Uint8x32) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. +func (x Uint8x32) AsUint16x16() Uint16x16 -// Uint32x8 converts from Uint8x32 to Uint32x8 -func (from Uint8x32) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Uint8x32) AsUint32x8() Uint32x8 -// Uint64x4 converts from Uint8x32 to Uint64x4 -func (from Uint8x32) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Uint8x32) AsUint64x4() Uint64x4 -// Float32x16 converts from Uint8x64 to Float32x16 -func (from Uint8x64) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Uint8x64) AsFloat32x16() Float32x16 -// Float64x8 converts from Uint8x64 to Float64x8 -func (from Uint8x64) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Uint8x64) AsFloat64x8() Float64x8 -// Int8x64 converts from Uint8x64 to Int8x64 -func (from Uint8x64) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Uint8x64) AsInt8x64() Int8x64 -// Int16x32 converts from Uint8x64 to Int16x32 -func (from Uint8x64) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Uint8x64) AsInt16x32() Int16x32 -// Int32x16 converts from Uint8x64 to Int32x16 -func (from Uint8x64) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Uint8x64) AsInt32x16() Int32x16 -// Int64x8 converts from Uint8x64 to Int64x8 -func (from Uint8x64) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Uint8x64) AsInt64x8() Int64x8 -// Uint16x32 converts from Uint8x64 to Uint16x32 -func (from Uint8x64) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. +func (x Uint8x64) AsUint16x32() Uint16x32 -// Uint32x16 converts from Uint8x64 to Uint32x16 -func (from Uint8x64) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Uint8x64) AsUint32x16() Uint32x16 -// Uint64x8 converts from Uint8x64 to Uint64x8 -func (from Uint8x64) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Uint8x64) AsUint64x8() Uint64x8 -// Float32x4 converts from Uint16x8 to Float32x4 -func (from Uint16x8) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Uint16x8) AsFloat32x4() Float32x4 -// Float64x2 converts from Uint16x8 to Float64x2 -func (from Uint16x8) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Uint16x8) AsFloat64x2() Float64x2 -// Int8x16 converts from Uint16x8 to Int8x16 -func (from Uint16x8) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Uint16x8) AsInt8x16() Int8x16 -// Int16x8 converts from Uint16x8 to Int16x8 -func (from Uint16x8) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Uint16x8) AsInt16x8() Int16x8 -// Int32x4 converts from Uint16x8 to Int32x4 -func (from Uint16x8) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Uint16x8) AsInt32x4() Int32x4 -// Int64x2 converts from Uint16x8 to Int64x2 -func (from Uint16x8) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Uint16x8) AsInt64x2() Int64x2 -// Uint8x16 converts from Uint16x8 to Uint8x16 -func (from Uint16x8) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Uint16x8) AsUint8x16() Uint8x16 -// Uint32x4 converts from Uint16x8 to Uint32x4 -func (from Uint16x8) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. 
+func (x Uint16x8) AsUint32x4() Uint32x4 -// Uint64x2 converts from Uint16x8 to Uint64x2 -func (from Uint16x8) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Uint16x8) AsUint64x2() Uint64x2 -// Float32x8 converts from Uint16x16 to Float32x8 -func (from Uint16x16) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Uint16x16) AsFloat32x8() Float32x8 -// Float64x4 converts from Uint16x16 to Float64x4 -func (from Uint16x16) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Uint16x16) AsFloat64x4() Float64x4 -// Int8x32 converts from Uint16x16 to Int8x32 -func (from Uint16x16) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Uint16x16) AsInt8x32() Int8x32 -// Int16x16 converts from Uint16x16 to Int16x16 -func (from Uint16x16) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Uint16x16) AsInt16x16() Int16x16 -// Int32x8 converts from Uint16x16 to Int32x8 -func (from Uint16x16) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Uint16x16) AsInt32x8() Int32x8 -// Int64x4 converts from Uint16x16 to Int64x4 -func (from Uint16x16) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Uint16x16) AsInt64x4() Int64x4 -// Uint8x32 converts from Uint16x16 to Uint8x32 -func (from Uint16x16) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Uint16x16) AsUint8x32() Uint8x32 -// Uint32x8 converts from Uint16x16 to Uint32x8 -func (from Uint16x16) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. 
+func (x Uint16x16) AsUint32x8() Uint32x8 -// Uint64x4 converts from Uint16x16 to Uint64x4 -func (from Uint16x16) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Uint16x16) AsUint64x4() Uint64x4 -// Float32x16 converts from Uint16x32 to Float32x16 -func (from Uint16x32) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Uint16x32) AsFloat32x16() Float32x16 -// Float64x8 converts from Uint16x32 to Float64x8 -func (from Uint16x32) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Uint16x32) AsFloat64x8() Float64x8 -// Int8x64 converts from Uint16x32 to Int8x64 -func (from Uint16x32) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Uint16x32) AsInt8x64() Int8x64 -// Int16x32 converts from Uint16x32 to Int16x32 -func (from Uint16x32) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Uint16x32) AsInt16x32() Int16x32 -// Int32x16 converts from Uint16x32 to Int32x16 -func (from Uint16x32) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Uint16x32) AsInt32x16() Int32x16 -// Int64x8 converts from Uint16x32 to Int64x8 -func (from Uint16x32) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Uint16x32) AsInt64x8() Int64x8 -// Uint8x64 converts from Uint16x32 to Uint8x64 -func (from Uint16x32) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Uint16x32) AsUint8x64() Uint8x64 -// Uint32x16 converts from Uint16x32 to Uint32x16 -func (from Uint16x32) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. 
+func (x Uint16x32) AsUint32x16() Uint32x16 -// Uint64x8 converts from Uint16x32 to Uint64x8 -func (from Uint16x32) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Uint16x32) AsUint64x8() Uint64x8 -// Float32x4 converts from Uint32x4 to Float32x4 -func (from Uint32x4) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Uint32x4) AsFloat32x4() Float32x4 -// Float64x2 converts from Uint32x4 to Float64x2 -func (from Uint32x4) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Uint32x4) AsFloat64x2() Float64x2 -// Int8x16 converts from Uint32x4 to Int8x16 -func (from Uint32x4) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Uint32x4) AsInt8x16() Int8x16 -// Int16x8 converts from Uint32x4 to Int16x8 -func (from Uint32x4) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Uint32x4) AsInt16x8() Int16x8 -// Int32x4 converts from Uint32x4 to Int32x4 -func (from Uint32x4) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Uint32x4) AsInt32x4() Int32x4 -// Int64x2 converts from Uint32x4 to Int64x2 -func (from Uint32x4) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Uint32x4) AsInt64x2() Int64x2 -// Uint8x16 converts from Uint32x4 to Uint8x16 -func (from Uint32x4) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Uint32x4) AsUint8x16() Uint8x16 -// Uint16x8 converts from Uint32x4 to Uint16x8 -func (from Uint32x4) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. 
+func (x Uint32x4) AsUint16x8() Uint16x8 -// Uint64x2 converts from Uint32x4 to Uint64x2 -func (from Uint32x4) AsUint64x2() (to Uint64x2) +// AsUint64x2 returns a Uint64x2 with the same bit representation as x. +func (x Uint32x4) AsUint64x2() Uint64x2 -// Float32x8 converts from Uint32x8 to Float32x8 -func (from Uint32x8) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Uint32x8) AsFloat32x8() Float32x8 -// Float64x4 converts from Uint32x8 to Float64x4 -func (from Uint32x8) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Uint32x8) AsFloat64x4() Float64x4 -// Int8x32 converts from Uint32x8 to Int8x32 -func (from Uint32x8) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Uint32x8) AsInt8x32() Int8x32 -// Int16x16 converts from Uint32x8 to Int16x16 -func (from Uint32x8) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Uint32x8) AsInt16x16() Int16x16 -// Int32x8 converts from Uint32x8 to Int32x8 -func (from Uint32x8) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Uint32x8) AsInt32x8() Int32x8 -// Int64x4 converts from Uint32x8 to Int64x4 -func (from Uint32x8) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Uint32x8) AsInt64x4() Int64x4 -// Uint8x32 converts from Uint32x8 to Uint8x32 -func (from Uint32x8) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Uint32x8) AsUint8x32() Uint8x32 -// Uint16x16 converts from Uint32x8 to Uint16x16 -func (from Uint32x8) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. 
+func (x Uint32x8) AsUint16x16() Uint16x16 -// Uint64x4 converts from Uint32x8 to Uint64x4 -func (from Uint32x8) AsUint64x4() (to Uint64x4) +// AsUint64x4 returns a Uint64x4 with the same bit representation as x. +func (x Uint32x8) AsUint64x4() Uint64x4 -// Float32x16 converts from Uint32x16 to Float32x16 -func (from Uint32x16) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Uint32x16) AsFloat32x16() Float32x16 -// Float64x8 converts from Uint32x16 to Float64x8 -func (from Uint32x16) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Uint32x16) AsFloat64x8() Float64x8 -// Int8x64 converts from Uint32x16 to Int8x64 -func (from Uint32x16) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Uint32x16) AsInt8x64() Int8x64 -// Int16x32 converts from Uint32x16 to Int16x32 -func (from Uint32x16) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Uint32x16) AsInt16x32() Int16x32 -// Int32x16 converts from Uint32x16 to Int32x16 -func (from Uint32x16) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Uint32x16) AsInt32x16() Int32x16 -// Int64x8 converts from Uint32x16 to Int64x8 -func (from Uint32x16) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Uint32x16) AsInt64x8() Int64x8 -// Uint8x64 converts from Uint32x16 to Uint8x64 -func (from Uint32x16) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Uint32x16) AsUint8x64() Uint8x64 -// Uint16x32 converts from Uint32x16 to Uint16x32 -func (from Uint32x16) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. 
+func (x Uint32x16) AsUint16x32() Uint16x32 -// Uint64x8 converts from Uint32x16 to Uint64x8 -func (from Uint32x16) AsUint64x8() (to Uint64x8) +// AsUint64x8 returns a Uint64x8 with the same bit representation as x. +func (x Uint32x16) AsUint64x8() Uint64x8 -// Float32x4 converts from Uint64x2 to Float32x4 -func (from Uint64x2) AsFloat32x4() (to Float32x4) +// AsFloat32x4 returns a Float32x4 with the same bit representation as x. +func (x Uint64x2) AsFloat32x4() Float32x4 -// Float64x2 converts from Uint64x2 to Float64x2 -func (from Uint64x2) AsFloat64x2() (to Float64x2) +// AsFloat64x2 returns a Float64x2 with the same bit representation as x. +func (x Uint64x2) AsFloat64x2() Float64x2 -// Int8x16 converts from Uint64x2 to Int8x16 -func (from Uint64x2) AsInt8x16() (to Int8x16) +// AsInt8x16 returns an Int8x16 with the same bit representation as x. +func (x Uint64x2) AsInt8x16() Int8x16 -// Int16x8 converts from Uint64x2 to Int16x8 -func (from Uint64x2) AsInt16x8() (to Int16x8) +// AsInt16x8 returns an Int16x8 with the same bit representation as x. +func (x Uint64x2) AsInt16x8() Int16x8 -// Int32x4 converts from Uint64x2 to Int32x4 -func (from Uint64x2) AsInt32x4() (to Int32x4) +// AsInt32x4 returns an Int32x4 with the same bit representation as x. +func (x Uint64x2) AsInt32x4() Int32x4 -// Int64x2 converts from Uint64x2 to Int64x2 -func (from Uint64x2) AsInt64x2() (to Int64x2) +// AsInt64x2 returns an Int64x2 with the same bit representation as x. +func (x Uint64x2) AsInt64x2() Int64x2 -// Uint8x16 converts from Uint64x2 to Uint8x16 -func (from Uint64x2) AsUint8x16() (to Uint8x16) +// AsUint8x16 returns a Uint8x16 with the same bit representation as x. +func (x Uint64x2) AsUint8x16() Uint8x16 -// Uint16x8 converts from Uint64x2 to Uint16x8 -func (from Uint64x2) AsUint16x8() (to Uint16x8) +// AsUint16x8 returns a Uint16x8 with the same bit representation as x. 
+func (x Uint64x2) AsUint16x8() Uint16x8 -// Uint32x4 converts from Uint64x2 to Uint32x4 -func (from Uint64x2) AsUint32x4() (to Uint32x4) +// AsUint32x4 returns a Uint32x4 with the same bit representation as x. +func (x Uint64x2) AsUint32x4() Uint32x4 -// Float32x8 converts from Uint64x4 to Float32x8 -func (from Uint64x4) AsFloat32x8() (to Float32x8) +// AsFloat32x8 returns a Float32x8 with the same bit representation as x. +func (x Uint64x4) AsFloat32x8() Float32x8 -// Float64x4 converts from Uint64x4 to Float64x4 -func (from Uint64x4) AsFloat64x4() (to Float64x4) +// AsFloat64x4 returns a Float64x4 with the same bit representation as x. +func (x Uint64x4) AsFloat64x4() Float64x4 -// Int8x32 converts from Uint64x4 to Int8x32 -func (from Uint64x4) AsInt8x32() (to Int8x32) +// AsInt8x32 returns an Int8x32 with the same bit representation as x. +func (x Uint64x4) AsInt8x32() Int8x32 -// Int16x16 converts from Uint64x4 to Int16x16 -func (from Uint64x4) AsInt16x16() (to Int16x16) +// AsInt16x16 returns an Int16x16 with the same bit representation as x. +func (x Uint64x4) AsInt16x16() Int16x16 -// Int32x8 converts from Uint64x4 to Int32x8 -func (from Uint64x4) AsInt32x8() (to Int32x8) +// AsInt32x8 returns an Int32x8 with the same bit representation as x. +func (x Uint64x4) AsInt32x8() Int32x8 -// Int64x4 converts from Uint64x4 to Int64x4 -func (from Uint64x4) AsInt64x4() (to Int64x4) +// AsInt64x4 returns an Int64x4 with the same bit representation as x. +func (x Uint64x4) AsInt64x4() Int64x4 -// Uint8x32 converts from Uint64x4 to Uint8x32 -func (from Uint64x4) AsUint8x32() (to Uint8x32) +// AsUint8x32 returns a Uint8x32 with the same bit representation as x. +func (x Uint64x4) AsUint8x32() Uint8x32 -// Uint16x16 converts from Uint64x4 to Uint16x16 -func (from Uint64x4) AsUint16x16() (to Uint16x16) +// AsUint16x16 returns a Uint16x16 with the same bit representation as x. 
+func (x Uint64x4) AsUint16x16() Uint16x16 -// Uint32x8 converts from Uint64x4 to Uint32x8 -func (from Uint64x4) AsUint32x8() (to Uint32x8) +// AsUint32x8 returns a Uint32x8 with the same bit representation as x. +func (x Uint64x4) AsUint32x8() Uint32x8 -// Float32x16 converts from Uint64x8 to Float32x16 -func (from Uint64x8) AsFloat32x16() (to Float32x16) +// AsFloat32x16 returns a Float32x16 with the same bit representation as x. +func (x Uint64x8) AsFloat32x16() Float32x16 -// Float64x8 converts from Uint64x8 to Float64x8 -func (from Uint64x8) AsFloat64x8() (to Float64x8) +// AsFloat64x8 returns a Float64x8 with the same bit representation as x. +func (x Uint64x8) AsFloat64x8() Float64x8 -// Int8x64 converts from Uint64x8 to Int8x64 -func (from Uint64x8) AsInt8x64() (to Int8x64) +// AsInt8x64 returns an Int8x64 with the same bit representation as x. +func (x Uint64x8) AsInt8x64() Int8x64 -// Int16x32 converts from Uint64x8 to Int16x32 -func (from Uint64x8) AsInt16x32() (to Int16x32) +// AsInt16x32 returns an Int16x32 with the same bit representation as x. +func (x Uint64x8) AsInt16x32() Int16x32 -// Int32x16 converts from Uint64x8 to Int32x16 -func (from Uint64x8) AsInt32x16() (to Int32x16) +// AsInt32x16 returns an Int32x16 with the same bit representation as x. +func (x Uint64x8) AsInt32x16() Int32x16 -// Int64x8 converts from Uint64x8 to Int64x8 -func (from Uint64x8) AsInt64x8() (to Int64x8) +// AsInt64x8 returns an Int64x8 with the same bit representation as x. +func (x Uint64x8) AsInt64x8() Int64x8 -// Uint8x64 converts from Uint64x8 to Uint8x64 -func (from Uint64x8) AsUint8x64() (to Uint8x64) +// AsUint8x64 returns a Uint8x64 with the same bit representation as x. +func (x Uint64x8) AsUint8x64() Uint8x64 -// Uint16x32 converts from Uint64x8 to Uint16x32 -func (from Uint64x8) AsUint16x32() (to Uint16x32) +// AsUint16x32 returns a Uint16x32 with the same bit representation as x. 
+func (x Uint64x8) AsUint16x32() Uint16x32 -// Uint32x16 converts from Uint64x8 to Uint32x16 -func (from Uint64x8) AsUint32x16() (to Uint32x16) +// AsUint32x16 returns a Uint32x16 with the same bit representation as x. +func (x Uint64x8) AsUint32x16() Uint32x16 -// ToInt8x16 converts from Mask8x16 to Int8x16 +// ToInt8x16 converts from Mask8x16 to Int8x16. func (from Mask8x16) ToInt8x16() (to Int8x16) -// asMask converts from Int8x16 to Mask8x16 +// asMask converts from Int8x16 to Mask8x16. func (from Int8x16) asMask() (to Mask8x16) func (x Mask8x16) And(y Mask8x16) Mask8x16 func (x Mask8x16) Or(y Mask8x16) Mask8x16 -// ToInt8x32 converts from Mask8x32 to Int8x32 +// ToInt8x32 converts from Mask8x32 to Int8x32. func (from Mask8x32) ToInt8x32() (to Int8x32) -// asMask converts from Int8x32 to Mask8x32 +// asMask converts from Int8x32 to Mask8x32. func (from Int8x32) asMask() (to Mask8x32) func (x Mask8x32) And(y Mask8x32) Mask8x32 func (x Mask8x32) Or(y Mask8x32) Mask8x32 -// ToInt8x64 converts from Mask8x64 to Int8x64 +// ToInt8x64 converts from Mask8x64 to Int8x64. func (from Mask8x64) ToInt8x64() (to Int8x64) -// asMask converts from Int8x64 to Mask8x64 +// asMask converts from Int8x64 to Mask8x64. func (from Int8x64) asMask() (to Mask8x64) func (x Mask8x64) And(y Mask8x64) Mask8x64 func (x Mask8x64) Or(y Mask8x64) Mask8x64 -// ToInt16x8 converts from Mask16x8 to Int16x8 +// ToInt16x8 converts from Mask16x8 to Int16x8. func (from Mask16x8) ToInt16x8() (to Int16x8) -// asMask converts from Int16x8 to Mask16x8 +// asMask converts from Int16x8 to Mask16x8. func (from Int16x8) asMask() (to Mask16x8) func (x Mask16x8) And(y Mask16x8) Mask16x8 func (x Mask16x8) Or(y Mask16x8) Mask16x8 -// ToInt16x16 converts from Mask16x16 to Int16x16 +// ToInt16x16 converts from Mask16x16 to Int16x16. func (from Mask16x16) ToInt16x16() (to Int16x16) -// asMask converts from Int16x16 to Mask16x16 +// asMask converts from Int16x16 to Mask16x16. 
func (from Int16x16) asMask() (to Mask16x16) func (x Mask16x16) And(y Mask16x16) Mask16x16 func (x Mask16x16) Or(y Mask16x16) Mask16x16 -// ToInt16x32 converts from Mask16x32 to Int16x32 +// ToInt16x32 converts from Mask16x32 to Int16x32. func (from Mask16x32) ToInt16x32() (to Int16x32) -// asMask converts from Int16x32 to Mask16x32 +// asMask converts from Int16x32 to Mask16x32. func (from Int16x32) asMask() (to Mask16x32) func (x Mask16x32) And(y Mask16x32) Mask16x32 func (x Mask16x32) Or(y Mask16x32) Mask16x32 -// ToInt32x4 converts from Mask32x4 to Int32x4 +// ToInt32x4 converts from Mask32x4 to Int32x4. func (from Mask32x4) ToInt32x4() (to Int32x4) -// asMask converts from Int32x4 to Mask32x4 +// asMask converts from Int32x4 to Mask32x4. func (from Int32x4) asMask() (to Mask32x4) func (x Mask32x4) And(y Mask32x4) Mask32x4 func (x Mask32x4) Or(y Mask32x4) Mask32x4 -// ToInt32x8 converts from Mask32x8 to Int32x8 +// ToInt32x8 converts from Mask32x8 to Int32x8. func (from Mask32x8) ToInt32x8() (to Int32x8) -// asMask converts from Int32x8 to Mask32x8 +// asMask converts from Int32x8 to Mask32x8. func (from Int32x8) asMask() (to Mask32x8) func (x Mask32x8) And(y Mask32x8) Mask32x8 func (x Mask32x8) Or(y Mask32x8) Mask32x8 -// ToInt32x16 converts from Mask32x16 to Int32x16 +// ToInt32x16 converts from Mask32x16 to Int32x16. func (from Mask32x16) ToInt32x16() (to Int32x16) -// asMask converts from Int32x16 to Mask32x16 +// asMask converts from Int32x16 to Mask32x16. func (from Int32x16) asMask() (to Mask32x16) func (x Mask32x16) And(y Mask32x16) Mask32x16 func (x Mask32x16) Or(y Mask32x16) Mask32x16 -// ToInt64x2 converts from Mask64x2 to Int64x2 +// ToInt64x2 converts from Mask64x2 to Int64x2. func (from Mask64x2) ToInt64x2() (to Int64x2) -// asMask converts from Int64x2 to Mask64x2 +// asMask converts from Int64x2 to Mask64x2. 
func (from Int64x2) asMask() (to Mask64x2) func (x Mask64x2) And(y Mask64x2) Mask64x2 func (x Mask64x2) Or(y Mask64x2) Mask64x2 -// ToInt64x4 converts from Mask64x4 to Int64x4 +// ToInt64x4 converts from Mask64x4 to Int64x4. func (from Mask64x4) ToInt64x4() (to Int64x4) -// asMask converts from Int64x4 to Mask64x4 +// asMask converts from Int64x4 to Mask64x4. func (from Int64x4) asMask() (to Mask64x4) func (x Mask64x4) And(y Mask64x4) Mask64x4 func (x Mask64x4) Or(y Mask64x4) Mask64x4 -// ToInt64x8 converts from Mask64x8 to Int64x8 +// ToInt64x8 converts from Mask64x8 to Int64x8. func (from Mask64x8) ToInt64x8() (to Int64x8) -// asMask converts from Int64x8 to Mask64x8 +// asMask converts from Int64x8 to Mask64x8. func (from Int64x8) asMask() (to Mask64x8) func (x Mask64x8) And(y Mask64x8) Mask64x8 diff --git a/src/simd/archsimd/ops_internal_amd64.go b/src/simd/archsimd/ops_internal_amd64.go index 566b88d510..8eae69a7ba 100644 --- a/src/simd/archsimd/ops_internal_amd64.go +++ b/src/simd/archsimd/ops_internal_amd64.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. //go:build goexperiment.simd @@ -382,7 +382,9 @@ func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x /* permuteScalars */ // permuteScalars performs a permutation of vector x using constant indices: -// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
@@ -391,7 +393,9 @@ func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x func (x Int32x4) permuteScalars(indices uint8) Int32x4 // permuteScalars performs a permutation of vector x using constant indices: -// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. @@ -402,7 +406,9 @@ func (x Uint32x4) permuteScalars(indices uint8) Uint32x4 /* permuteScalarsGrouped */ // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: -// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. // @@ -412,7 +418,9 @@ func (x Uint32x4) permuteScalars(indices uint8) Uint32x4 func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: -// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. 
// @@ -422,7 +430,9 @@ func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8 func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: -// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. // @@ -432,7 +442,9 @@ func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16 func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: -// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. // @@ -444,7 +456,9 @@ func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16 /* permuteScalarsHi */ // permuteScalarsHi performs a permutation of vector x using constant indices: -// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// +// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
@@ -453,7 +467,9 @@ func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16 func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8 // permuteScalarsHi performs a permutation of vector x using constant indices: -// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// +// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. @@ -522,7 +538,9 @@ func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32 /* permuteScalarsLo */ // permuteScalarsLo performs a permutation of vector x using constant indices: -// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. @@ -531,7 +549,9 @@ func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32 func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8 // permuteScalarsLo performs a permutation of vector x using constant indices: -// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
diff --git a/src/simd/archsimd/other_gen_amd64.go b/src/simd/archsimd/other_gen_amd64.go index 8d04409197..647001acce 100644 --- a/src/simd/archsimd/other_gen_amd64.go +++ b/src/simd/archsimd/other_gen_amd64.go @@ -1,4 +1,4 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. //go:build goexperiment.simd @@ -7,7 +7,7 @@ package archsimd // BroadcastInt8x16 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt8x16(x int8) Int8x16 { var z Int8x16 return z.SetElem(0, x).Broadcast128() @@ -16,7 +16,7 @@ func BroadcastInt8x16(x int8) Int8x16 { // BroadcastInt16x8 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt16x8(x int16) Int16x8 { var z Int16x8 return z.SetElem(0, x).Broadcast128() @@ -25,7 +25,7 @@ func BroadcastInt16x8(x int16) Int16x8 { // BroadcastInt32x4 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt32x4(x int32) Int32x4 { var z Int32x4 return z.SetElem(0, x).Broadcast128() @@ -34,7 +34,7 @@ func BroadcastInt32x4(x int32) Int32x4 { // BroadcastInt64x2 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt64x2(x int64) Int64x2 { var z Int64x2 return z.SetElem(0, x).Broadcast128() @@ -43,7 +43,7 @@ func BroadcastInt64x2(x int64) Int64x2 { // BroadcastUint8x16 returns a vector with the input // x assigned to all elements of the output. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint8x16(x uint8) Uint8x16 { var z Uint8x16 return z.SetElem(0, x).Broadcast128() @@ -52,7 +52,7 @@ func BroadcastUint8x16(x uint8) Uint8x16 { // BroadcastUint16x8 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint16x8(x uint16) Uint16x8 { var z Uint16x8 return z.SetElem(0, x).Broadcast128() @@ -61,7 +61,7 @@ func BroadcastUint16x8(x uint16) Uint16x8 { // BroadcastUint32x4 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint32x4(x uint32) Uint32x4 { var z Uint32x4 return z.SetElem(0, x).Broadcast128() @@ -70,7 +70,7 @@ func BroadcastUint32x4(x uint32) Uint32x4 { // BroadcastUint64x2 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint64x2(x uint64) Uint64x2 { var z Uint64x2 return z.SetElem(0, x).Broadcast128() @@ -79,7 +79,7 @@ func BroadcastUint64x2(x uint64) Uint64x2 { // BroadcastFloat32x4 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastFloat32x4(x float32) Float32x4 { var z Float32x4 return z.SetElem(0, x).Broadcast128() @@ -88,7 +88,7 @@ func BroadcastFloat32x4(x float32) Float32x4 { // BroadcastFloat64x2 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastFloat64x2(x float64) Float64x2 { var z Float64x2 return z.SetElem(0, x).Broadcast128() @@ -97,7 +97,7 @@ func BroadcastFloat64x2(x float64) Float64x2 { // BroadcastInt8x32 returns a vector with the input // x assigned to all elements of the output. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt8x32(x int8) Int8x32 { var z Int8x16 return z.SetElem(0, x).Broadcast256() @@ -106,7 +106,7 @@ func BroadcastInt8x32(x int8) Int8x32 { // BroadcastInt16x16 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt16x16(x int16) Int16x16 { var z Int16x8 return z.SetElem(0, x).Broadcast256() @@ -115,7 +115,7 @@ func BroadcastInt16x16(x int16) Int16x16 { // BroadcastInt32x8 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt32x8(x int32) Int32x8 { var z Int32x4 return z.SetElem(0, x).Broadcast256() @@ -124,7 +124,7 @@ func BroadcastInt32x8(x int32) Int32x8 { // BroadcastInt64x4 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastInt64x4(x int64) Int64x4 { var z Int64x2 return z.SetElem(0, x).Broadcast256() @@ -133,7 +133,7 @@ func BroadcastInt64x4(x int64) Int64x4 { // BroadcastUint8x32 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint8x32(x uint8) Uint8x32 { var z Uint8x16 return z.SetElem(0, x).Broadcast256() @@ -142,7 +142,7 @@ func BroadcastUint8x32(x uint8) Uint8x32 { // BroadcastUint16x16 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint16x16(x uint16) Uint16x16 { var z Uint16x8 return z.SetElem(0, x).Broadcast256() @@ -151,7 +151,7 @@ func BroadcastUint16x16(x uint16) Uint16x16 { // BroadcastUint32x8 returns a vector with the input // x assigned to all elements of the output. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint32x8(x uint32) Uint32x8 { var z Uint32x4 return z.SetElem(0, x).Broadcast256() @@ -160,7 +160,7 @@ func BroadcastUint32x8(x uint32) Uint32x8 { // BroadcastUint64x4 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastUint64x4(x uint64) Uint64x4 { var z Uint64x2 return z.SetElem(0, x).Broadcast256() @@ -169,7 +169,7 @@ func BroadcastUint64x4(x uint64) Uint64x4 { // BroadcastFloat32x8 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastFloat32x8(x float32) Float32x8 { var z Float32x4 return z.SetElem(0, x).Broadcast256() @@ -178,7 +178,7 @@ func BroadcastFloat32x8(x float32) Float32x8 { // BroadcastFloat64x4 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func BroadcastFloat64x4(x float64) Float64x4 { var z Float64x2 return z.SetElem(0, x).Broadcast256() @@ -187,7 +187,7 @@ func BroadcastFloat64x4(x float64) Float64x4 { // BroadcastInt8x64 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512BW +// Emulated, CPU Feature: AVX512BW func BroadcastInt8x64(x int8) Int8x64 { var z Int8x16 return z.SetElem(0, x).Broadcast512() @@ -196,7 +196,7 @@ func BroadcastInt8x64(x int8) Int8x64 { // BroadcastInt16x32 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512BW +// Emulated, CPU Feature: AVX512BW func BroadcastInt16x32(x int16) Int16x32 { var z Int16x8 return z.SetElem(0, x).Broadcast512() @@ -205,7 +205,7 @@ func BroadcastInt16x32(x int16) Int16x32 { // BroadcastInt32x16 returns a vector with the input // x assigned to all elements of the output. 
// -// Emulated, CPU Feature AVX512F +// Emulated, CPU Feature: AVX512F func BroadcastInt32x16(x int32) Int32x16 { var z Int32x4 return z.SetElem(0, x).Broadcast512() @@ -214,7 +214,7 @@ func BroadcastInt32x16(x int32) Int32x16 { // BroadcastInt64x8 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512F +// Emulated, CPU Feature: AVX512F func BroadcastInt64x8(x int64) Int64x8 { var z Int64x2 return z.SetElem(0, x).Broadcast512() @@ -223,7 +223,7 @@ func BroadcastInt64x8(x int64) Int64x8 { // BroadcastUint8x64 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512BW +// Emulated, CPU Feature: AVX512BW func BroadcastUint8x64(x uint8) Uint8x64 { var z Uint8x16 return z.SetElem(0, x).Broadcast512() @@ -232,7 +232,7 @@ func BroadcastUint8x64(x uint8) Uint8x64 { // BroadcastUint16x32 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512BW +// Emulated, CPU Feature: AVX512BW func BroadcastUint16x32(x uint16) Uint16x32 { var z Uint16x8 return z.SetElem(0, x).Broadcast512() @@ -241,7 +241,7 @@ func BroadcastUint16x32(x uint16) Uint16x32 { // BroadcastUint32x16 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512F +// Emulated, CPU Feature: AVX512F func BroadcastUint32x16(x uint32) Uint32x16 { var z Uint32x4 return z.SetElem(0, x).Broadcast512() @@ -250,7 +250,7 @@ func BroadcastUint32x16(x uint32) Uint32x16 { // BroadcastUint64x8 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512F +// Emulated, CPU Feature: AVX512F func BroadcastUint64x8(x uint64) Uint64x8 { var z Uint64x2 return z.SetElem(0, x).Broadcast512() @@ -259,7 +259,7 @@ func BroadcastUint64x8(x uint64) Uint64x8 { // BroadcastFloat32x16 returns a vector with the input // x assigned to all elements of the output. 
// -// Emulated, CPU Feature AVX512F +// Emulated, CPU Feature: AVX512F func BroadcastFloat32x16(x float32) Float32x16 { var z Float32x4 return z.SetElem(0, x).Broadcast512() @@ -268,7 +268,7 @@ func BroadcastFloat32x16(x float32) Float32x16 { // BroadcastFloat64x8 returns a vector with the input // x assigned to all elements of the output. // -// Emulated, CPU Feature AVX512F +// Emulated, CPU Feature: AVX512F func BroadcastFloat64x8(x float64) Float64x8 { var z Float64x2 return z.SetElem(0, x).Broadcast512() @@ -334,378 +334,378 @@ func (from Int64x8) ToMask() (to Mask64x8) { return from.NotEqual(Int64x8{}) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int8x16) Not() Int8x16 { return x.Xor(x.Equal(x).ToInt8x16()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int16x8) Not() Int16x8 { return x.Xor(x.Equal(x).ToInt16x8()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int32x4) Not() Int32x4 { return x.Xor(x.Equal(x).ToInt32x4()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Int64x2) Not() Int64x2 { return x.Xor(x.Equal(x).ToInt64x2()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int8x32) Not() Int8x32 { return x.Xor(x.Equal(x).ToInt8x32()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. 
// -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int16x16) Not() Int16x16 { return x.Xor(x.Equal(x).ToInt16x16()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int32x8) Not() Int32x8 { return x.Xor(x.Equal(x).ToInt32x8()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Int64x4) Not() Int64x4 { return x.Xor(x.Equal(x).ToInt64x4()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Int8x64) Not() Int8x64 { return x.Xor(x.Equal(x).ToInt8x64()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Int16x32) Not() Int16x32 { return x.Xor(x.Equal(x).ToInt16x32()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Int32x16) Not() Int32x16 { return x.Xor(x.Equal(x).ToInt32x16()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Int64x8) Not() Int64x8 { return x.Xor(x.Equal(x).ToInt64x8()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint8x16) Not() Uint8x16 { return x.Xor(x.Equal(x).ToInt8x16().AsUint8x16()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. 
// -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint16x8) Not() Uint16x8 { return x.Xor(x.Equal(x).ToInt16x8().AsUint16x8()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint32x4) Not() Uint32x4 { return x.Xor(x.Equal(x).ToInt32x4().AsUint32x4()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX +// Emulated, CPU Feature: AVX func (x Uint64x2) Not() Uint64x2 { return x.Xor(x.Equal(x).ToInt64x2().AsUint64x2()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint8x32) Not() Uint8x32 { return x.Xor(x.Equal(x).ToInt8x32().AsUint8x32()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint16x16) Not() Uint16x16 { return x.Xor(x.Equal(x).ToInt16x16().AsUint16x16()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint32x8) Not() Uint32x8 { return x.Xor(x.Equal(x).ToInt32x8().AsUint32x8()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX2 +// Emulated, CPU Feature: AVX2 func (x Uint64x4) Not() Uint64x4 { return x.Xor(x.Equal(x).ToInt64x4().AsUint64x4()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Uint8x64) Not() Uint8x64 { return x.Xor(x.Equal(x).ToInt8x64().AsUint8x64()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. 
// -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Uint16x32) Not() Uint16x32 { return x.Xor(x.Equal(x).ToInt16x32().AsUint16x32()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Uint32x16) Not() Uint32x16 { return x.Xor(x.Equal(x).ToInt32x16().AsUint32x16()) } -// Not returns the bitwise complement of x +// Not returns the bitwise complement of x. // -// Emulated, CPU Feature AVX512 +// Emulated, CPU Feature: AVX512 func (x Uint64x8) Not() Uint64x8 { return x.Xor(x.Equal(x).ToInt64x8().AsUint64x8()) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int8x16) String() string { var s [16]int8 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int16x8) String() string { var s [8]int16 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int32x4) String() string { var s [4]int32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int64x2) String() string { var s [2]int64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint8x16) String() string { var s [16]uint8 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. 
func (x Uint16x8) String() string { var s [8]uint16 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint32x4) String() string { var s [4]uint32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint64x2) String() string { var s [2]uint64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Float32x4) String() string { var s [4]float32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Float64x2) String() string { var s [2]float64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int8x32) String() string { var s [32]int8 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int16x16) String() string { var s [16]int16 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int32x8) String() string { var s [8]int32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int64x4) String() string { var s [4]int64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. 
func (x Uint8x32) String() string { var s [32]uint8 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint16x16) String() string { var s [16]uint16 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint32x8) String() string { var s [8]uint32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint64x4) String() string { var s [4]uint64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Float32x8) String() string { var s [8]float32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Float64x4) String() string { var s [4]float64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int8x64) String() string { var s [64]int8 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int16x32) String() string { var s [32]int16 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Int32x16) String() string { var s [16]int32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. 
func (x Int64x8) String() string { var s [8]int64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint8x64) String() string { var s [64]uint8 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint16x32) String() string { var s [32]uint16 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint32x16) String() string { var s [16]uint32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Uint64x8) String() string { var s [8]uint64 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Float32x16) String() string { var s [16]float32 x.Store(&s) return sliceToString(s[:]) } -// String returns a string representation of SIMD vector x +// String returns a string representation of SIMD vector x. func (x Float64x8) String() string { var s [8]float64 x.Store(&s) diff --git a/src/simd/archsimd/shuffles_amd64.go b/src/simd/archsimd/shuffles_amd64.go index 2bbd89c725..355634fcae 100644 --- a/src/simd/archsimd/shuffles_amd64.go +++ b/src/simd/archsimd/shuffles_amd64.go @@ -54,7 +54,10 @@ const ( // requires two. a is the source index of the least element in the // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th // elements in the output. For example, -// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81} +// +// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) +// +// returns {4,8,25,81}. 
// // If the selectors are not constant this will translate to a function // call. @@ -133,7 +136,10 @@ func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 { // it requires two. a is the source index of the least element in the // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th // elements in the output. For example, -// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81} +// +// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) +// +// returns {4,8,25,81}. // // If the selectors are not constant this will translate to a function // call. @@ -205,7 +211,10 @@ func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 { // it requires two. a is the source index of the least element in the // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th // elements in the output. For example, -// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81} +// +// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) +// +// returns {4,8,25,81}. // // If the selectors are not constant this will translate to a function // call. @@ -278,9 +287,10 @@ func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 { // it requires two. a is the source index of the least element in the // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th // elements in the output. For example, -// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) // -// returns {4,8,25,81,64,128,169,289} +// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) +// +// returns {4,8,25,81,64,128,169,289}. // // If the selectors are not constant this will translate to a function // call. @@ -353,9 +363,10 @@ func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 { // it requires two. a is the source index of the least element in the // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th // elements in the output. 
For example, -// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) // -// returns {4,8,25,81,64,128,169,289} +// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) +// +// returns {4,8,25,81,64,128,169,289}. // // If the selectors are not constant this will translate to a function // call. @@ -428,9 +439,10 @@ func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 { // it requires two. a is the source index of the least element in the // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th // elements in the output. For example, -// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) // -// returns {4,8,25,81,64,128,169,289} +// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) +// +// returns {4,8,25,81,64,128,169,289}. // // If the selectors are not constant this will translate to a function // call. @@ -1080,7 +1092,7 @@ func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 { // PermuteScalarsHi performs a permutation of vector x using the supplied indices: // -// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} +// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} // // Parameters a,b,c,d should have values between 0 and 3. // If a through d are constants, then an instruction will be inlined, otherwise @@ -1093,7 +1105,7 @@ func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 { // PermuteScalarsHi performs a permutation of vector x using the supplied indices: // -// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} +// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} // // Parameters a,b,c,d should have values between 0 and 3. 
// If a through d are constants, then an instruction will be inlined, otherwise @@ -1276,7 +1288,8 @@ func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 { // // A carryless multiplication uses bitwise XOR instead of // add-with-carry, for example (in base two): -// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 // // This also models multiplication of polynomials with coefficients // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = @@ -1300,7 +1313,8 @@ func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 { // // A carryless multiplication uses bitwise XOR instead of // add-with-carry, for example (in base two): -// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 // // This also models multiplication of polynomials with coefficients // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = @@ -1324,7 +1338,8 @@ func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 { // // A carryless multiplication uses bitwise XOR instead of // add-with-carry, for example (in base two): -// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 +// +// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 // // This also models multiplication of polynomials with coefficients // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = diff --git a/src/simd/archsimd/slice_gen_amd64.go b/src/simd/archsimd/slice_gen_amd64.go index c03e28206d..9e34f9ca56 100644 --- a/src/simd/archsimd/slice_gen_amd64.go +++ b/src/simd/archsimd/slice_gen_amd64.go @@ -1,4 +1,4 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. 
//go:build goexperiment.simd @@ -6,302 +6,302 @@ package archsimd import "unsafe" -// LoadInt8x16Slice loads an Int8x16 from a slice of at least 16 int8s +// LoadInt8x16Slice loads an Int8x16 from a slice of at least 16 int8s. func LoadInt8x16Slice(s []int8) Int8x16 { return LoadInt8x16((*[16]int8)(s)) } -// StoreSlice stores x into a slice of at least 16 int8s +// StoreSlice stores x into a slice of at least 16 int8s. func (x Int8x16) StoreSlice(s []int8) { x.Store((*[16]int8)(s)) } -// LoadInt16x8Slice loads an Int16x8 from a slice of at least 8 int16s +// LoadInt16x8Slice loads an Int16x8 from a slice of at least 8 int16s. func LoadInt16x8Slice(s []int16) Int16x8 { return LoadInt16x8((*[8]int16)(s)) } -// StoreSlice stores x into a slice of at least 8 int16s +// StoreSlice stores x into a slice of at least 8 int16s. func (x Int16x8) StoreSlice(s []int16) { x.Store((*[8]int16)(s)) } -// LoadInt32x4Slice loads an Int32x4 from a slice of at least 4 int32s +// LoadInt32x4Slice loads an Int32x4 from a slice of at least 4 int32s. func LoadInt32x4Slice(s []int32) Int32x4 { return LoadInt32x4((*[4]int32)(s)) } -// StoreSlice stores x into a slice of at least 4 int32s +// StoreSlice stores x into a slice of at least 4 int32s. func (x Int32x4) StoreSlice(s []int32) { x.Store((*[4]int32)(s)) } -// LoadInt64x2Slice loads an Int64x2 from a slice of at least 2 int64s +// LoadInt64x2Slice loads an Int64x2 from a slice of at least 2 int64s. func LoadInt64x2Slice(s []int64) Int64x2 { return LoadInt64x2((*[2]int64)(s)) } -// StoreSlice stores x into a slice of at least 2 int64s +// StoreSlice stores x into a slice of at least 2 int64s. func (x Int64x2) StoreSlice(s []int64) { x.Store((*[2]int64)(s)) } -// LoadUint8x16Slice loads an Uint8x16 from a slice of at least 16 uint8s +// LoadUint8x16Slice loads an Uint8x16 from a slice of at least 16 uint8s. 
func LoadUint8x16Slice(s []uint8) Uint8x16 { return LoadUint8x16((*[16]uint8)(s)) } -// StoreSlice stores x into a slice of at least 16 uint8s +// StoreSlice stores x into a slice of at least 16 uint8s. func (x Uint8x16) StoreSlice(s []uint8) { x.Store((*[16]uint8)(s)) } -// LoadUint16x8Slice loads an Uint16x8 from a slice of at least 8 uint16s +// LoadUint16x8Slice loads an Uint16x8 from a slice of at least 8 uint16s. func LoadUint16x8Slice(s []uint16) Uint16x8 { return LoadUint16x8((*[8]uint16)(s)) } -// StoreSlice stores x into a slice of at least 8 uint16s +// StoreSlice stores x into a slice of at least 8 uint16s. func (x Uint16x8) StoreSlice(s []uint16) { x.Store((*[8]uint16)(s)) } -// LoadUint32x4Slice loads an Uint32x4 from a slice of at least 4 uint32s +// LoadUint32x4Slice loads an Uint32x4 from a slice of at least 4 uint32s. func LoadUint32x4Slice(s []uint32) Uint32x4 { return LoadUint32x4((*[4]uint32)(s)) } -// StoreSlice stores x into a slice of at least 4 uint32s +// StoreSlice stores x into a slice of at least 4 uint32s. func (x Uint32x4) StoreSlice(s []uint32) { x.Store((*[4]uint32)(s)) } -// LoadUint64x2Slice loads an Uint64x2 from a slice of at least 2 uint64s +// LoadUint64x2Slice loads an Uint64x2 from a slice of at least 2 uint64s. func LoadUint64x2Slice(s []uint64) Uint64x2 { return LoadUint64x2((*[2]uint64)(s)) } -// StoreSlice stores x into a slice of at least 2 uint64s +// StoreSlice stores x into a slice of at least 2 uint64s. func (x Uint64x2) StoreSlice(s []uint64) { x.Store((*[2]uint64)(s)) } -// LoadFloat32x4Slice loads a Float32x4 from a slice of at least 4 float32s +// LoadFloat32x4Slice loads a Float32x4 from a slice of at least 4 float32s. func LoadFloat32x4Slice(s []float32) Float32x4 { return LoadFloat32x4((*[4]float32)(s)) } -// StoreSlice stores x into a slice of at least 4 float32s +// StoreSlice stores x into a slice of at least 4 float32s. 
func (x Float32x4) StoreSlice(s []float32) { x.Store((*[4]float32)(s)) } -// LoadFloat64x2Slice loads a Float64x2 from a slice of at least 2 float64s +// LoadFloat64x2Slice loads a Float64x2 from a slice of at least 2 float64s. func LoadFloat64x2Slice(s []float64) Float64x2 { return LoadFloat64x2((*[2]float64)(s)) } -// StoreSlice stores x into a slice of at least 2 float64s +// StoreSlice stores x into a slice of at least 2 float64s. func (x Float64x2) StoreSlice(s []float64) { x.Store((*[2]float64)(s)) } -// LoadInt8x32Slice loads an Int8x32 from a slice of at least 32 int8s +// LoadInt8x32Slice loads an Int8x32 from a slice of at least 32 int8s. func LoadInt8x32Slice(s []int8) Int8x32 { return LoadInt8x32((*[32]int8)(s)) } -// StoreSlice stores x into a slice of at least 32 int8s +// StoreSlice stores x into a slice of at least 32 int8s. func (x Int8x32) StoreSlice(s []int8) { x.Store((*[32]int8)(s)) } -// LoadInt16x16Slice loads an Int16x16 from a slice of at least 16 int16s +// LoadInt16x16Slice loads an Int16x16 from a slice of at least 16 int16s. func LoadInt16x16Slice(s []int16) Int16x16 { return LoadInt16x16((*[16]int16)(s)) } -// StoreSlice stores x into a slice of at least 16 int16s +// StoreSlice stores x into a slice of at least 16 int16s. func (x Int16x16) StoreSlice(s []int16) { x.Store((*[16]int16)(s)) } -// LoadInt32x8Slice loads an Int32x8 from a slice of at least 8 int32s +// LoadInt32x8Slice loads an Int32x8 from a slice of at least 8 int32s. func LoadInt32x8Slice(s []int32) Int32x8 { return LoadInt32x8((*[8]int32)(s)) } -// StoreSlice stores x into a slice of at least 8 int32s +// StoreSlice stores x into a slice of at least 8 int32s. func (x Int32x8) StoreSlice(s []int32) { x.Store((*[8]int32)(s)) } -// LoadInt64x4Slice loads an Int64x4 from a slice of at least 4 int64s +// LoadInt64x4Slice loads an Int64x4 from a slice of at least 4 int64s. 
func LoadInt64x4Slice(s []int64) Int64x4 { return LoadInt64x4((*[4]int64)(s)) } -// StoreSlice stores x into a slice of at least 4 int64s +// StoreSlice stores x into a slice of at least 4 int64s. func (x Int64x4) StoreSlice(s []int64) { x.Store((*[4]int64)(s)) } -// LoadUint8x32Slice loads an Uint8x32 from a slice of at least 32 uint8s +// LoadUint8x32Slice loads an Uint8x32 from a slice of at least 32 uint8s. func LoadUint8x32Slice(s []uint8) Uint8x32 { return LoadUint8x32((*[32]uint8)(s)) } -// StoreSlice stores x into a slice of at least 32 uint8s +// StoreSlice stores x into a slice of at least 32 uint8s. func (x Uint8x32) StoreSlice(s []uint8) { x.Store((*[32]uint8)(s)) } -// LoadUint16x16Slice loads an Uint16x16 from a slice of at least 16 uint16s +// LoadUint16x16Slice loads an Uint16x16 from a slice of at least 16 uint16s. func LoadUint16x16Slice(s []uint16) Uint16x16 { return LoadUint16x16((*[16]uint16)(s)) } -// StoreSlice stores x into a slice of at least 16 uint16s +// StoreSlice stores x into a slice of at least 16 uint16s. func (x Uint16x16) StoreSlice(s []uint16) { x.Store((*[16]uint16)(s)) } -// LoadUint32x8Slice loads an Uint32x8 from a slice of at least 8 uint32s +// LoadUint32x8Slice loads an Uint32x8 from a slice of at least 8 uint32s. func LoadUint32x8Slice(s []uint32) Uint32x8 { return LoadUint32x8((*[8]uint32)(s)) } -// StoreSlice stores x into a slice of at least 8 uint32s +// StoreSlice stores x into a slice of at least 8 uint32s. func (x Uint32x8) StoreSlice(s []uint32) { x.Store((*[8]uint32)(s)) } -// LoadUint64x4Slice loads an Uint64x4 from a slice of at least 4 uint64s +// LoadUint64x4Slice loads an Uint64x4 from a slice of at least 4 uint64s. func LoadUint64x4Slice(s []uint64) Uint64x4 { return LoadUint64x4((*[4]uint64)(s)) } -// StoreSlice stores x into a slice of at least 4 uint64s +// StoreSlice stores x into a slice of at least 4 uint64s. 
func (x Uint64x4) StoreSlice(s []uint64) { x.Store((*[4]uint64)(s)) } -// LoadFloat32x8Slice loads a Float32x8 from a slice of at least 8 float32s +// LoadFloat32x8Slice loads a Float32x8 from a slice of at least 8 float32s. func LoadFloat32x8Slice(s []float32) Float32x8 { return LoadFloat32x8((*[8]float32)(s)) } -// StoreSlice stores x into a slice of at least 8 float32s +// StoreSlice stores x into a slice of at least 8 float32s. func (x Float32x8) StoreSlice(s []float32) { x.Store((*[8]float32)(s)) } -// LoadFloat64x4Slice loads a Float64x4 from a slice of at least 4 float64s +// LoadFloat64x4Slice loads a Float64x4 from a slice of at least 4 float64s. func LoadFloat64x4Slice(s []float64) Float64x4 { return LoadFloat64x4((*[4]float64)(s)) } -// StoreSlice stores x into a slice of at least 4 float64s +// StoreSlice stores x into a slice of at least 4 float64s. func (x Float64x4) StoreSlice(s []float64) { x.Store((*[4]float64)(s)) } -// LoadInt8x64Slice loads an Int8x64 from a slice of at least 64 int8s +// LoadInt8x64Slice loads an Int8x64 from a slice of at least 64 int8s. func LoadInt8x64Slice(s []int8) Int8x64 { return LoadInt8x64((*[64]int8)(s)) } -// StoreSlice stores x into a slice of at least 64 int8s +// StoreSlice stores x into a slice of at least 64 int8s. func (x Int8x64) StoreSlice(s []int8) { x.Store((*[64]int8)(s)) } -// LoadInt16x32Slice loads an Int16x32 from a slice of at least 32 int16s +// LoadInt16x32Slice loads an Int16x32 from a slice of at least 32 int16s. func LoadInt16x32Slice(s []int16) Int16x32 { return LoadInt16x32((*[32]int16)(s)) } -// StoreSlice stores x into a slice of at least 32 int16s +// StoreSlice stores x into a slice of at least 32 int16s. func (x Int16x32) StoreSlice(s []int16) { x.Store((*[32]int16)(s)) } -// LoadInt32x16Slice loads an Int32x16 from a slice of at least 16 int32s +// LoadInt32x16Slice loads an Int32x16 from a slice of at least 16 int32s. 
func LoadInt32x16Slice(s []int32) Int32x16 { return LoadInt32x16((*[16]int32)(s)) } -// StoreSlice stores x into a slice of at least 16 int32s +// StoreSlice stores x into a slice of at least 16 int32s. func (x Int32x16) StoreSlice(s []int32) { x.Store((*[16]int32)(s)) } -// LoadInt64x8Slice loads an Int64x8 from a slice of at least 8 int64s +// LoadInt64x8Slice loads an Int64x8 from a slice of at least 8 int64s. func LoadInt64x8Slice(s []int64) Int64x8 { return LoadInt64x8((*[8]int64)(s)) } -// StoreSlice stores x into a slice of at least 8 int64s +// StoreSlice stores x into a slice of at least 8 int64s. func (x Int64x8) StoreSlice(s []int64) { x.Store((*[8]int64)(s)) } -// LoadUint8x64Slice loads an Uint8x64 from a slice of at least 64 uint8s +// LoadUint8x64Slice loads an Uint8x64 from a slice of at least 64 uint8s. func LoadUint8x64Slice(s []uint8) Uint8x64 { return LoadUint8x64((*[64]uint8)(s)) } -// StoreSlice stores x into a slice of at least 64 uint8s +// StoreSlice stores x into a slice of at least 64 uint8s. func (x Uint8x64) StoreSlice(s []uint8) { x.Store((*[64]uint8)(s)) } -// LoadUint16x32Slice loads an Uint16x32 from a slice of at least 32 uint16s +// LoadUint16x32Slice loads an Uint16x32 from a slice of at least 32 uint16s. func LoadUint16x32Slice(s []uint16) Uint16x32 { return LoadUint16x32((*[32]uint16)(s)) } -// StoreSlice stores x into a slice of at least 32 uint16s +// StoreSlice stores x into a slice of at least 32 uint16s. func (x Uint16x32) StoreSlice(s []uint16) { x.Store((*[32]uint16)(s)) } -// LoadUint32x16Slice loads an Uint32x16 from a slice of at least 16 uint32s +// LoadUint32x16Slice loads an Uint32x16 from a slice of at least 16 uint32s. func LoadUint32x16Slice(s []uint32) Uint32x16 { return LoadUint32x16((*[16]uint32)(s)) } -// StoreSlice stores x into a slice of at least 16 uint32s +// StoreSlice stores x into a slice of at least 16 uint32s. 
func (x Uint32x16) StoreSlice(s []uint32) { x.Store((*[16]uint32)(s)) } -// LoadUint64x8Slice loads an Uint64x8 from a slice of at least 8 uint64s +// LoadUint64x8Slice loads an Uint64x8 from a slice of at least 8 uint64s. func LoadUint64x8Slice(s []uint64) Uint64x8 { return LoadUint64x8((*[8]uint64)(s)) } -// StoreSlice stores x into a slice of at least 8 uint64s +// StoreSlice stores x into a slice of at least 8 uint64s. func (x Uint64x8) StoreSlice(s []uint64) { x.Store((*[8]uint64)(s)) } -// LoadFloat32x16Slice loads a Float32x16 from a slice of at least 16 float32s +// LoadFloat32x16Slice loads a Float32x16 from a slice of at least 16 float32s. func LoadFloat32x16Slice(s []float32) Float32x16 { return LoadFloat32x16((*[16]float32)(s)) } -// StoreSlice stores x into a slice of at least 16 float32s +// StoreSlice stores x into a slice of at least 16 float32s. func (x Float32x16) StoreSlice(s []float32) { x.Store((*[16]float32)(s)) } -// LoadFloat64x8Slice loads a Float64x8 from a slice of at least 8 float64s +// LoadFloat64x8Slice loads a Float64x8 from a slice of at least 8 float64s. func LoadFloat64x8Slice(s []float64) Float64x8 { return LoadFloat64x8((*[8]float64)(s)) } -// StoreSlice stores x into a slice of at least 8 float64s +// StoreSlice stores x into a slice of at least 8 float64s. func (x Float64x8) StoreSlice(s []float64) { x.Store((*[8]float64)(s)) } diff --git a/src/simd/archsimd/types_amd64.go b/src/simd/archsimd/types_amd64.go index 556383b380..3d0a49dc09 100644 --- a/src/simd/archsimd/types_amd64.go +++ b/src/simd/archsimd/types_amd64.go @@ -1,4 +1,4 @@ -// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. +// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT. 
//go:build goexperiment.simd @@ -9,27 +9,27 @@ type v128 struct { _128 [0]func() // uncomparable } -// Float32x4 is a 128-bit SIMD vector of 4 float32 +// Float32x4 is a 128-bit SIMD vector of 4 float32s. type Float32x4 struct { float32x4 v128 vals [4]float32 } -// Len returns the number of elements in a Float32x4 +// Len returns the number of elements in a Float32x4. func (x Float32x4) Len() int { return 4 } -// LoadFloat32x4 loads a Float32x4 from an array +// LoadFloat32x4 loads a Float32x4 from an array. // //go:noescape func LoadFloat32x4(y *[4]float32) Float32x4 -// Store stores a Float32x4 to an array +// Store stores a Float32x4 to an array. // //go:noescape func (x Float32x4) Store(y *[4]float32) // LoadMaskedFloat32x4 loads a Float32x4 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // @@ -37,34 +37,34 @@ func (x Float32x4) Store(y *[4]float32) func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4 // StoreMasked stores a Float32x4 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4) -// Float64x2 is a 128-bit SIMD vector of 2 float64 +// Float64x2 is a 128-bit SIMD vector of 2 float64s. type Float64x2 struct { float64x2 v128 vals [2]float64 } -// Len returns the number of elements in a Float64x2 +// Len returns the number of elements in a Float64x2. func (x Float64x2) Len() int { return 2 } -// LoadFloat64x2 loads a Float64x2 from an array +// LoadFloat64x2 loads a Float64x2 from an array. // //go:noescape func LoadFloat64x2(y *[2]float64) Float64x2 -// Store stores a Float64x2 to an array +// Store stores a Float64x2 to an array. 
// //go:noescape func (x Float64x2) Store(y *[2]float64) // LoadMaskedFloat64x2 loads a Float64x2 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // @@ -72,180 +72,180 @@ func (x Float64x2) Store(y *[2]float64) func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2 // StoreMasked stores a Float64x2 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2) -// Int8x16 is a 128-bit SIMD vector of 16 int8 +// Int8x16 is a 128-bit SIMD vector of 16 int8s. type Int8x16 struct { int8x16 v128 vals [16]int8 } -// Len returns the number of elements in a Int8x16 +// Len returns the number of elements in an Int8x16. func (x Int8x16) Len() int { return 16 } -// LoadInt8x16 loads a Int8x16 from an array +// LoadInt8x16 loads an Int8x16 from an array. // //go:noescape func LoadInt8x16(y *[16]int8) Int8x16 -// Store stores a Int8x16 to an array +// Store stores an Int8x16 to an array. // //go:noescape func (x Int8x16) Store(y *[16]int8) -// Int16x8 is a 128-bit SIMD vector of 8 int16 +// Int16x8 is a 128-bit SIMD vector of 8 int16s. type Int16x8 struct { int16x8 v128 vals [8]int16 } -// Len returns the number of elements in a Int16x8 +// Len returns the number of elements in an Int16x8. func (x Int16x8) Len() int { return 8 } -// LoadInt16x8 loads a Int16x8 from an array +// LoadInt16x8 loads an Int16x8 from an array. // //go:noescape func LoadInt16x8(y *[8]int16) Int16x8 -// Store stores a Int16x8 to an array +// Store stores an Int16x8 to an array. // //go:noescape func (x Int16x8) Store(y *[8]int16) -// Int32x4 is a 128-bit SIMD vector of 4 int32 +// Int32x4 is a 128-bit SIMD vector of 4 int32s. 
type Int32x4 struct { int32x4 v128 vals [4]int32 } -// Len returns the number of elements in a Int32x4 +// Len returns the number of elements in an Int32x4. func (x Int32x4) Len() int { return 4 } -// LoadInt32x4 loads a Int32x4 from an array +// LoadInt32x4 loads an Int32x4 from an array. // //go:noescape func LoadInt32x4(y *[4]int32) Int32x4 -// Store stores a Int32x4 to an array +// Store stores an Int32x4 to an array. // //go:noescape func (x Int32x4) Store(y *[4]int32) -// LoadMaskedInt32x4 loads a Int32x4 from an array, -// at those elements enabled by mask +// LoadMaskedInt32x4 loads an Int32x4 from an array, +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4 -// StoreMasked stores a Int32x4 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int32x4 to an array, +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4) -// Int64x2 is a 128-bit SIMD vector of 2 int64 +// Int64x2 is a 128-bit SIMD vector of 2 int64s. type Int64x2 struct { int64x2 v128 vals [2]int64 } -// Len returns the number of elements in a Int64x2 +// Len returns the number of elements in an Int64x2. func (x Int64x2) Len() int { return 2 } -// LoadInt64x2 loads a Int64x2 from an array +// LoadInt64x2 loads an Int64x2 from an array. // //go:noescape func LoadInt64x2(y *[2]int64) Int64x2 -// Store stores a Int64x2 to an array +// Store stores an Int64x2 to an array. // //go:noescape func (x Int64x2) Store(y *[2]int64) -// LoadMaskedInt64x2 loads a Int64x2 from an array, -// at those elements enabled by mask +// LoadMaskedInt64x2 loads an Int64x2 from an array, +// at those elements enabled by mask. 
// // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2 -// StoreMasked stores a Int64x2 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int64x2 to an array, +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2) -// Uint8x16 is a 128-bit SIMD vector of 16 uint8 +// Uint8x16 is a 128-bit SIMD vector of 16 uint8s. type Uint8x16 struct { uint8x16 v128 vals [16]uint8 } -// Len returns the number of elements in a Uint8x16 +// Len returns the number of elements in a Uint8x16. func (x Uint8x16) Len() int { return 16 } -// LoadUint8x16 loads a Uint8x16 from an array +// LoadUint8x16 loads a Uint8x16 from an array. // //go:noescape func LoadUint8x16(y *[16]uint8) Uint8x16 -// Store stores a Uint8x16 to an array +// Store stores a Uint8x16 to an array. // //go:noescape func (x Uint8x16) Store(y *[16]uint8) -// Uint16x8 is a 128-bit SIMD vector of 8 uint16 +// Uint16x8 is a 128-bit SIMD vector of 8 uint16s. type Uint16x8 struct { uint16x8 v128 vals [8]uint16 } -// Len returns the number of elements in a Uint16x8 +// Len returns the number of elements in a Uint16x8. func (x Uint16x8) Len() int { return 8 } -// LoadUint16x8 loads a Uint16x8 from an array +// LoadUint16x8 loads a Uint16x8 from an array. // //go:noescape func LoadUint16x8(y *[8]uint16) Uint16x8 -// Store stores a Uint16x8 to an array +// Store stores a Uint16x8 to an array. // //go:noescape func (x Uint16x8) Store(y *[8]uint16) -// Uint32x4 is a 128-bit SIMD vector of 4 uint32 +// Uint32x4 is a 128-bit SIMD vector of 4 uint32s. type Uint32x4 struct { uint32x4 v128 vals [4]uint32 } -// Len returns the number of elements in a Uint32x4 +// Len returns the number of elements in a Uint32x4. func (x Uint32x4) Len() int { return 4 } -// LoadUint32x4 loads a Uint32x4 from an array +// LoadUint32x4 loads a Uint32x4 from an array. 
// //go:noescape func LoadUint32x4(y *[4]uint32) Uint32x4 -// Store stores a Uint32x4 to an array +// Store stores a Uint32x4 to an array. // //go:noescape func (x Uint32x4) Store(y *[4]uint32) // LoadMaskedUint32x4 loads a Uint32x4 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // @@ -253,34 +253,34 @@ func (x Uint32x4) Store(y *[4]uint32) func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4 // StoreMasked stores a Uint32x4 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4) -// Uint64x2 is a 128-bit SIMD vector of 2 uint64 +// Uint64x2 is a 128-bit SIMD vector of 2 uint64s. type Uint64x2 struct { uint64x2 v128 vals [2]uint64 } -// Len returns the number of elements in a Uint64x2 +// Len returns the number of elements in a Uint64x2. func (x Uint64x2) Len() int { return 2 } -// LoadUint64x2 loads a Uint64x2 from an array +// LoadUint64x2 loads a Uint64x2 from an array. // //go:noescape func LoadUint64x2(y *[2]uint64) Uint64x2 -// Store stores a Uint64x2 to an array +// Store stores a Uint64x2 to an array. // //go:noescape func (x Uint64x2) Store(y *[2]uint64) // LoadMaskedUint64x2 loads a Uint64x2 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // @@ -288,14 +288,14 @@ func (x Uint64x2) Store(y *[2]uint64) func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2 // StoreMasked stores a Uint64x2 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2) -// Mask8x16 is a 128-bit SIMD vector of 16 int8 +// Mask8x16 is a mask for a SIMD vector of 16 8-bit elements. 
type Mask8x16 struct { int8x16 v128 vals [16]int8 @@ -308,10 +308,10 @@ func Mask8x16FromBits(y uint16) Mask8x16 // ToBits constructs a bitmap from a Mask8x16, where 1 means set for the indexed element, 0 means unset. // -// Asm: KMOVB, CPU Features: AVX512 +// Asm: VPMOVMSKB, CPU Features: AVX func (x Mask8x16) ToBits() uint16 -// Mask16x8 is a 128-bit SIMD vector of 8 int16 +// Mask16x8 is a mask for a SIMD vector of 8 16-bit elements. type Mask16x8 struct { int16x8 v128 vals [8]int16 @@ -327,7 +327,7 @@ func Mask16x8FromBits(y uint8) Mask16x8 // Asm: KMOVW, CPU Features: AVX512 func (x Mask16x8) ToBits() uint8 -// Mask32x4 is a 128-bit SIMD vector of 4 int32 +// Mask32x4 is a mask for a SIMD vector of 4 32-bit elements. type Mask32x4 struct { int32x4 v128 vals [4]int32 @@ -342,10 +342,10 @@ func Mask32x4FromBits(y uint8) Mask32x4 // ToBits constructs a bitmap from a Mask32x4, where 1 means set for the indexed element, 0 means unset. // Only the lower 4 bits of y are used. // -// Asm: KMOVD, CPU Features: AVX512 +// Asm: VMOVMSKPS, CPU Features: AVX func (x Mask32x4) ToBits() uint8 -// Mask64x2 is a 128-bit SIMD vector of 2 int64 +// Mask64x2 is a mask for a SIMD vector of 2 64-bit elements. type Mask64x2 struct { int64x2 v128 vals [2]int64 @@ -360,7 +360,7 @@ func Mask64x2FromBits(y uint8) Mask64x2 // ToBits constructs a bitmap from a Mask64x2, where 1 means set for the indexed element, 0 means unset. // Only the lower 2 bits of y are used. // -// Asm: KMOVQ, CPU Features: AVX512 +// Asm: VMOVMSKPD, CPU Features: AVX func (x Mask64x2) ToBits() uint8 // v256 is a tag type that tells the compiler that this is really 256-bit SIMD @@ -368,27 +368,27 @@ type v256 struct { _256 [0]func() // uncomparable } -// Float32x8 is a 256-bit SIMD vector of 8 float32 +// Float32x8 is a 256-bit SIMD vector of 8 float32s. 
type Float32x8 struct { float32x8 v256 vals [8]float32 } -// Len returns the number of elements in a Float32x8 +// Len returns the number of elements in a Float32x8. func (x Float32x8) Len() int { return 8 } -// LoadFloat32x8 loads a Float32x8 from an array +// LoadFloat32x8 loads a Float32x8 from an array. // //go:noescape func LoadFloat32x8(y *[8]float32) Float32x8 -// Store stores a Float32x8 to an array +// Store stores a Float32x8 to an array. // //go:noescape func (x Float32x8) Store(y *[8]float32) // LoadMaskedFloat32x8 loads a Float32x8 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // @@ -396,34 +396,34 @@ func (x Float32x8) Store(y *[8]float32) func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8 // StoreMasked stores a Float32x8 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8) -// Float64x4 is a 256-bit SIMD vector of 4 float64 +// Float64x4 is a 256-bit SIMD vector of 4 float64s. type Float64x4 struct { float64x4 v256 vals [4]float64 } -// Len returns the number of elements in a Float64x4 +// Len returns the number of elements in a Float64x4. func (x Float64x4) Len() int { return 4 } -// LoadFloat64x4 loads a Float64x4 from an array +// LoadFloat64x4 loads a Float64x4 from an array. // //go:noescape func LoadFloat64x4(y *[4]float64) Float64x4 -// Store stores a Float64x4 to an array +// Store stores a Float64x4 to an array. // //go:noescape func (x Float64x4) Store(y *[4]float64) // LoadMaskedFloat64x4 loads a Float64x4 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. 
// // Asm: VMASKMOVQ, CPU Feature: AVX2 // @@ -431,180 +431,180 @@ func (x Float64x4) Store(y *[4]float64) func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4 // StoreMasked stores a Float64x4 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4) -// Int8x32 is a 256-bit SIMD vector of 32 int8 +// Int8x32 is a 256-bit SIMD vector of 32 int8s. type Int8x32 struct { int8x32 v256 vals [32]int8 } -// Len returns the number of elements in a Int8x32 +// Len returns the number of elements in an Int8x32. func (x Int8x32) Len() int { return 32 } -// LoadInt8x32 loads a Int8x32 from an array +// LoadInt8x32 loads an Int8x32 from an array. // //go:noescape func LoadInt8x32(y *[32]int8) Int8x32 -// Store stores a Int8x32 to an array +// Store stores an Int8x32 to an array. // //go:noescape func (x Int8x32) Store(y *[32]int8) -// Int16x16 is a 256-bit SIMD vector of 16 int16 +// Int16x16 is a 256-bit SIMD vector of 16 int16s. type Int16x16 struct { int16x16 v256 vals [16]int16 } -// Len returns the number of elements in a Int16x16 +// Len returns the number of elements in an Int16x16. func (x Int16x16) Len() int { return 16 } -// LoadInt16x16 loads a Int16x16 from an array +// LoadInt16x16 loads an Int16x16 from an array. // //go:noescape func LoadInt16x16(y *[16]int16) Int16x16 -// Store stores a Int16x16 to an array +// Store stores an Int16x16 to an array. // //go:noescape func (x Int16x16) Store(y *[16]int16) -// Int32x8 is a 256-bit SIMD vector of 8 int32 +// Int32x8 is a 256-bit SIMD vector of 8 int32s. type Int32x8 struct { int32x8 v256 vals [8]int32 } -// Len returns the number of elements in a Int32x8 +// Len returns the number of elements in an Int32x8. func (x Int32x8) Len() int { return 8 } -// LoadInt32x8 loads a Int32x8 from an array +// LoadInt32x8 loads an Int32x8 from an array. 
// //go:noescape func LoadInt32x8(y *[8]int32) Int32x8 -// Store stores a Int32x8 to an array +// Store stores an Int32x8 to an array. // //go:noescape func (x Int32x8) Store(y *[8]int32) -// LoadMaskedInt32x8 loads a Int32x8 from an array, -// at those elements enabled by mask +// LoadMaskedInt32x8 loads an Int32x8 from an array, +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8 -// StoreMasked stores a Int32x8 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int32x8 to an array, +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8) -// Int64x4 is a 256-bit SIMD vector of 4 int64 +// Int64x4 is a 256-bit SIMD vector of 4 int64s. type Int64x4 struct { int64x4 v256 vals [4]int64 } -// Len returns the number of elements in a Int64x4 +// Len returns the number of elements in an Int64x4. func (x Int64x4) Len() int { return 4 } -// LoadInt64x4 loads a Int64x4 from an array +// LoadInt64x4 loads an Int64x4 from an array. // //go:noescape func LoadInt64x4(y *[4]int64) Int64x4 -// Store stores a Int64x4 to an array +// Store stores an Int64x4 to an array. // //go:noescape func (x Int64x4) Store(y *[4]int64) -// LoadMaskedInt64x4 loads a Int64x4 from an array, -// at those elements enabled by mask +// LoadMaskedInt64x4 loads an Int64x4 from an array, +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4 -// StoreMasked stores a Int64x4 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int64x4 to an array, +// at those elements enabled by mask. 
// // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4) -// Uint8x32 is a 256-bit SIMD vector of 32 uint8 +// Uint8x32 is a 256-bit SIMD vector of 32 uint8s. type Uint8x32 struct { uint8x32 v256 vals [32]uint8 } -// Len returns the number of elements in a Uint8x32 +// Len returns the number of elements in a Uint8x32. func (x Uint8x32) Len() int { return 32 } -// LoadUint8x32 loads a Uint8x32 from an array +// LoadUint8x32 loads a Uint8x32 from an array. // //go:noescape func LoadUint8x32(y *[32]uint8) Uint8x32 -// Store stores a Uint8x32 to an array +// Store stores a Uint8x32 to an array. // //go:noescape func (x Uint8x32) Store(y *[32]uint8) -// Uint16x16 is a 256-bit SIMD vector of 16 uint16 +// Uint16x16 is a 256-bit SIMD vector of 16 uint16s. type Uint16x16 struct { uint16x16 v256 vals [16]uint16 } -// Len returns the number of elements in a Uint16x16 +// Len returns the number of elements in a Uint16x16. func (x Uint16x16) Len() int { return 16 } -// LoadUint16x16 loads a Uint16x16 from an array +// LoadUint16x16 loads a Uint16x16 from an array. // //go:noescape func LoadUint16x16(y *[16]uint16) Uint16x16 -// Store stores a Uint16x16 to an array +// Store stores a Uint16x16 to an array. // //go:noescape func (x Uint16x16) Store(y *[16]uint16) -// Uint32x8 is a 256-bit SIMD vector of 8 uint32 +// Uint32x8 is a 256-bit SIMD vector of 8 uint32s. type Uint32x8 struct { uint32x8 v256 vals [8]uint32 } -// Len returns the number of elements in a Uint32x8 +// Len returns the number of elements in a Uint32x8. func (x Uint32x8) Len() int { return 8 } -// LoadUint32x8 loads a Uint32x8 from an array +// LoadUint32x8 loads a Uint32x8 from an array. // //go:noescape func LoadUint32x8(y *[8]uint32) Uint32x8 -// Store stores a Uint32x8 to an array +// Store stores a Uint32x8 to an array. 
// //go:noescape func (x Uint32x8) Store(y *[8]uint32) // LoadMaskedUint32x8 loads a Uint32x8 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // @@ -612,34 +612,34 @@ func (x Uint32x8) Store(y *[8]uint32) func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8 // StoreMasked stores a Uint32x8 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVD, CPU Feature: AVX2 // //go:noescape func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8) -// Uint64x4 is a 256-bit SIMD vector of 4 uint64 +// Uint64x4 is a 256-bit SIMD vector of 4 uint64s. type Uint64x4 struct { uint64x4 v256 vals [4]uint64 } -// Len returns the number of elements in a Uint64x4 +// Len returns the number of elements in a Uint64x4. func (x Uint64x4) Len() int { return 4 } -// LoadUint64x4 loads a Uint64x4 from an array +// LoadUint64x4 loads a Uint64x4 from an array. // //go:noescape func LoadUint64x4(y *[4]uint64) Uint64x4 -// Store stores a Uint64x4 to an array +// Store stores a Uint64x4 to an array. // //go:noescape func (x Uint64x4) Store(y *[4]uint64) // LoadMaskedUint64x4 loads a Uint64x4 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // @@ -647,14 +647,14 @@ func (x Uint64x4) Store(y *[4]uint64) func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4 // StoreMasked stores a Uint64x4 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMASKMOVQ, CPU Feature: AVX2 // //go:noescape func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4) -// Mask8x32 is a 256-bit SIMD vector of 32 int8 +// Mask8x32 is a mask for a SIMD vector of 32 8-bit elements. 
type Mask8x32 struct { int8x32 v256 vals [32]int8 @@ -667,10 +667,10 @@ func Mask8x32FromBits(y uint32) Mask8x32 // ToBits constructs a bitmap from a Mask8x32, where 1 means set for the indexed element, 0 means unset. // -// Asm: KMOVB, CPU Features: AVX512 +// Asm: VPMOVMSKB, CPU Features: AVX2 func (x Mask8x32) ToBits() uint32 -// Mask16x16 is a 256-bit SIMD vector of 16 int16 +// Mask16x16 is a mask for a SIMD vector of 16 16-bit elements. type Mask16x16 struct { int16x16 v256 vals [16]int16 @@ -686,7 +686,7 @@ func Mask16x16FromBits(y uint16) Mask16x16 // Asm: KMOVW, CPU Features: AVX512 func (x Mask16x16) ToBits() uint16 -// Mask32x8 is a 256-bit SIMD vector of 8 int32 +// Mask32x8 is a mask for a SIMD vector of 8 32-bit elements. type Mask32x8 struct { int32x8 v256 vals [8]int32 @@ -699,10 +699,10 @@ func Mask32x8FromBits(y uint8) Mask32x8 // ToBits constructs a bitmap from a Mask32x8, where 1 means set for the indexed element, 0 means unset. // -// Asm: KMOVD, CPU Features: AVX512 +// Asm: VMOVMSKPS, CPU Features: AVX func (x Mask32x8) ToBits() uint8 -// Mask64x4 is a 256-bit SIMD vector of 4 int64 +// Mask64x4 is a mask for a SIMD vector of 4 64-bit elements. type Mask64x4 struct { int64x4 v256 vals [4]int64 @@ -717,7 +717,7 @@ func Mask64x4FromBits(y uint8) Mask64x4 // ToBits constructs a bitmap from a Mask64x4, where 1 means set for the indexed element, 0 means unset. // Only the lower 4 bits of y are used. // -// Asm: KMOVQ, CPU Features: AVX512 +// Asm: VMOVMSKPD, CPU Features: AVX func (x Mask64x4) ToBits() uint8 // v512 is a tag type that tells the compiler that this is really 512-bit SIMD @@ -725,27 +725,27 @@ type v512 struct { _512 [0]func() // uncomparable } -// Float32x16 is a 512-bit SIMD vector of 16 float32 +// Float32x16 is a 512-bit SIMD vector of 16 float32s. type Float32x16 struct { float32x16 v512 vals [16]float32 } -// Len returns the number of elements in a Float32x16 +// Len returns the number of elements in a Float32x16. 
func (x Float32x16) Len() int { return 16 } -// LoadFloat32x16 loads a Float32x16 from an array +// LoadFloat32x16 loads a Float32x16 from an array. // //go:noescape func LoadFloat32x16(y *[16]float32) Float32x16 -// Store stores a Float32x16 to an array +// Store stores a Float32x16 to an array. // //go:noescape func (x Float32x16) Store(y *[16]float32) // LoadMaskedFloat32x16 loads a Float32x16 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU32.Z, CPU Feature: AVX512 // @@ -753,34 +753,34 @@ func (x Float32x16) Store(y *[16]float32) func LoadMaskedFloat32x16(y *[16]float32, mask Mask32x16) Float32x16 // StoreMasked stores a Float32x16 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU32, CPU Feature: AVX512 // //go:noescape func (x Float32x16) StoreMasked(y *[16]float32, mask Mask32x16) -// Float64x8 is a 512-bit SIMD vector of 8 float64 +// Float64x8 is a 512-bit SIMD vector of 8 float64s. type Float64x8 struct { float64x8 v512 vals [8]float64 } -// Len returns the number of elements in a Float64x8 +// Len returns the number of elements in a Float64x8. func (x Float64x8) Len() int { return 8 } -// LoadFloat64x8 loads a Float64x8 from an array +// LoadFloat64x8 loads a Float64x8 from an array. // //go:noescape func LoadFloat64x8(y *[8]float64) Float64x8 -// Store stores a Float64x8 to an array +// Store stores a Float64x8 to an array. // //go:noescape func (x Float64x8) Store(y *[8]float64) // LoadMaskedFloat64x8 loads a Float64x8 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU64.Z, CPU Feature: AVX512 // @@ -788,174 +788,174 @@ func (x Float64x8) Store(y *[8]float64) func LoadMaskedFloat64x8(y *[8]float64, mask Mask64x8) Float64x8 // StoreMasked stores a Float64x8 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. 
// // Asm: VMOVDQU64, CPU Feature: AVX512 // //go:noescape func (x Float64x8) StoreMasked(y *[8]float64, mask Mask64x8) -// Int8x64 is a 512-bit SIMD vector of 64 int8 +// Int8x64 is a 512-bit SIMD vector of 64 int8s. type Int8x64 struct { int8x64 v512 vals [64]int8 } -// Len returns the number of elements in a Int8x64 +// Len returns the number of elements in an Int8x64. func (x Int8x64) Len() int { return 64 } -// LoadInt8x64 loads a Int8x64 from an array +// LoadInt8x64 loads an Int8x64 from an array. // //go:noescape func LoadInt8x64(y *[64]int8) Int8x64 -// Store stores a Int8x64 to an array +// Store stores an Int8x64 to an array. // //go:noescape func (x Int8x64) Store(y *[64]int8) -// LoadMaskedInt8x64 loads a Int8x64 from an array, -// at those elements enabled by mask +// LoadMaskedInt8x64 loads an Int8x64 from an array, +// at those elements enabled by mask. // // Asm: VMOVDQU8.Z, CPU Feature: AVX512 // //go:noescape func LoadMaskedInt8x64(y *[64]int8, mask Mask8x64) Int8x64 -// StoreMasked stores a Int8x64 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int8x64 to an array, +// at those elements enabled by mask. // // Asm: VMOVDQU8, CPU Feature: AVX512 // //go:noescape func (x Int8x64) StoreMasked(y *[64]int8, mask Mask8x64) -// Int16x32 is a 512-bit SIMD vector of 32 int16 +// Int16x32 is a 512-bit SIMD vector of 32 int16s. type Int16x32 struct { int16x32 v512 vals [32]int16 } -// Len returns the number of elements in a Int16x32 +// Len returns the number of elements in an Int16x32. func (x Int16x32) Len() int { return 32 } -// LoadInt16x32 loads a Int16x32 from an array +// LoadInt16x32 loads an Int16x32 from an array. // //go:noescape func LoadInt16x32(y *[32]int16) Int16x32 -// Store stores a Int16x32 to an array +// Store stores an Int16x32 to an array. 
// //go:noescape func (x Int16x32) Store(y *[32]int16) -// LoadMaskedInt16x32 loads a Int16x32 from an array, -// at those elements enabled by mask +// LoadMaskedInt16x32 loads an Int16x32 from an array, +// at those elements enabled by mask. // // Asm: VMOVDQU16.Z, CPU Feature: AVX512 // //go:noescape func LoadMaskedInt16x32(y *[32]int16, mask Mask16x32) Int16x32 -// StoreMasked stores a Int16x32 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int16x32 to an array, +// at those elements enabled by mask. // // Asm: VMOVDQU16, CPU Feature: AVX512 // //go:noescape func (x Int16x32) StoreMasked(y *[32]int16, mask Mask16x32) -// Int32x16 is a 512-bit SIMD vector of 16 int32 +// Int32x16 is a 512-bit SIMD vector of 16 int32s. type Int32x16 struct { int32x16 v512 vals [16]int32 } -// Len returns the number of elements in a Int32x16 +// Len returns the number of elements in an Int32x16. func (x Int32x16) Len() int { return 16 } -// LoadInt32x16 loads a Int32x16 from an array +// LoadInt32x16 loads an Int32x16 from an array. // //go:noescape func LoadInt32x16(y *[16]int32) Int32x16 -// Store stores a Int32x16 to an array +// Store stores an Int32x16 to an array. // //go:noescape func (x Int32x16) Store(y *[16]int32) -// LoadMaskedInt32x16 loads a Int32x16 from an array, -// at those elements enabled by mask +// LoadMaskedInt32x16 loads an Int32x16 from an array, +// at those elements enabled by mask. // // Asm: VMOVDQU32.Z, CPU Feature: AVX512 // //go:noescape func LoadMaskedInt32x16(y *[16]int32, mask Mask32x16) Int32x16 -// StoreMasked stores a Int32x16 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int32x16 to an array, +// at those elements enabled by mask. // // Asm: VMOVDQU32, CPU Feature: AVX512 // //go:noescape func (x Int32x16) StoreMasked(y *[16]int32, mask Mask32x16) -// Int64x8 is a 512-bit SIMD vector of 8 int64 +// Int64x8 is a 512-bit SIMD vector of 8 int64s. 
type Int64x8 struct { int64x8 v512 vals [8]int64 } -// Len returns the number of elements in a Int64x8 +// Len returns the number of elements in an Int64x8. func (x Int64x8) Len() int { return 8 } -// LoadInt64x8 loads a Int64x8 from an array +// LoadInt64x8 loads an Int64x8 from an array. // //go:noescape func LoadInt64x8(y *[8]int64) Int64x8 -// Store stores a Int64x8 to an array +// Store stores an Int64x8 to an array. // //go:noescape func (x Int64x8) Store(y *[8]int64) -// LoadMaskedInt64x8 loads a Int64x8 from an array, -// at those elements enabled by mask +// LoadMaskedInt64x8 loads an Int64x8 from an array, +// at those elements enabled by mask. // // Asm: VMOVDQU64.Z, CPU Feature: AVX512 // //go:noescape func LoadMaskedInt64x8(y *[8]int64, mask Mask64x8) Int64x8 -// StoreMasked stores a Int64x8 to an array, -// at those elements enabled by mask +// StoreMasked stores an Int64x8 to an array, +// at those elements enabled by mask. // // Asm: VMOVDQU64, CPU Feature: AVX512 // //go:noescape func (x Int64x8) StoreMasked(y *[8]int64, mask Mask64x8) -// Uint8x64 is a 512-bit SIMD vector of 64 uint8 +// Uint8x64 is a 512-bit SIMD vector of 64 uint8s. type Uint8x64 struct { uint8x64 v512 vals [64]uint8 } -// Len returns the number of elements in a Uint8x64 +// Len returns the number of elements in a Uint8x64. func (x Uint8x64) Len() int { return 64 } -// LoadUint8x64 loads a Uint8x64 from an array +// LoadUint8x64 loads a Uint8x64 from an array. // //go:noescape func LoadUint8x64(y *[64]uint8) Uint8x64 -// Store stores a Uint8x64 to an array +// Store stores a Uint8x64 to an array. // //go:noescape func (x Uint8x64) Store(y *[64]uint8) // LoadMaskedUint8x64 loads a Uint8x64 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. 
// // Asm: VMOVDQU8.Z, CPU Feature: AVX512 // @@ -963,34 +963,34 @@ func (x Uint8x64) Store(y *[64]uint8) func LoadMaskedUint8x64(y *[64]uint8, mask Mask8x64) Uint8x64 // StoreMasked stores a Uint8x64 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU8, CPU Feature: AVX512 // //go:noescape func (x Uint8x64) StoreMasked(y *[64]uint8, mask Mask8x64) -// Uint16x32 is a 512-bit SIMD vector of 32 uint16 +// Uint16x32 is a 512-bit SIMD vector of 32 uint16s. type Uint16x32 struct { uint16x32 v512 vals [32]uint16 } -// Len returns the number of elements in a Uint16x32 +// Len returns the number of elements in a Uint16x32. func (x Uint16x32) Len() int { return 32 } -// LoadUint16x32 loads a Uint16x32 from an array +// LoadUint16x32 loads a Uint16x32 from an array. // //go:noescape func LoadUint16x32(y *[32]uint16) Uint16x32 -// Store stores a Uint16x32 to an array +// Store stores a Uint16x32 to an array. // //go:noescape func (x Uint16x32) Store(y *[32]uint16) // LoadMaskedUint16x32 loads a Uint16x32 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU16.Z, CPU Feature: AVX512 // @@ -998,34 +998,34 @@ func (x Uint16x32) Store(y *[32]uint16) func LoadMaskedUint16x32(y *[32]uint16, mask Mask16x32) Uint16x32 // StoreMasked stores a Uint16x32 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU16, CPU Feature: AVX512 // //go:noescape func (x Uint16x32) StoreMasked(y *[32]uint16, mask Mask16x32) -// Uint32x16 is a 512-bit SIMD vector of 16 uint32 +// Uint32x16 is a 512-bit SIMD vector of 16 uint32s. type Uint32x16 struct { uint32x16 v512 vals [16]uint32 } -// Len returns the number of elements in a Uint32x16 +// Len returns the number of elements in a Uint32x16. func (x Uint32x16) Len() int { return 16 } -// LoadUint32x16 loads a Uint32x16 from an array +// LoadUint32x16 loads a Uint32x16 from an array. 
// //go:noescape func LoadUint32x16(y *[16]uint32) Uint32x16 -// Store stores a Uint32x16 to an array +// Store stores a Uint32x16 to an array. // //go:noescape func (x Uint32x16) Store(y *[16]uint32) // LoadMaskedUint32x16 loads a Uint32x16 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU32.Z, CPU Feature: AVX512 // @@ -1033,34 +1033,34 @@ func (x Uint32x16) Store(y *[16]uint32) func LoadMaskedUint32x16(y *[16]uint32, mask Mask32x16) Uint32x16 // StoreMasked stores a Uint32x16 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU32, CPU Feature: AVX512 // //go:noescape func (x Uint32x16) StoreMasked(y *[16]uint32, mask Mask32x16) -// Uint64x8 is a 512-bit SIMD vector of 8 uint64 +// Uint64x8 is a 512-bit SIMD vector of 8 uint64s. type Uint64x8 struct { uint64x8 v512 vals [8]uint64 } -// Len returns the number of elements in a Uint64x8 +// Len returns the number of elements in a Uint64x8. func (x Uint64x8) Len() int { return 8 } -// LoadUint64x8 loads a Uint64x8 from an array +// LoadUint64x8 loads a Uint64x8 from an array. // //go:noescape func LoadUint64x8(y *[8]uint64) Uint64x8 -// Store stores a Uint64x8 to an array +// Store stores a Uint64x8 to an array. // //go:noescape func (x Uint64x8) Store(y *[8]uint64) // LoadMaskedUint64x8 loads a Uint64x8 from an array, -// at those elements enabled by mask +// at those elements enabled by mask. // // Asm: VMOVDQU64.Z, CPU Feature: AVX512 // @@ -1068,14 +1068,14 @@ func (x Uint64x8) Store(y *[8]uint64) func LoadMaskedUint64x8(y *[8]uint64, mask Mask64x8) Uint64x8 // StoreMasked stores a Uint64x8 to an array, -// at those elements enabled by mask +// at those elements enabled by mask. 
// // Asm: VMOVDQU64, CPU Feature: AVX512 // //go:noescape func (x Uint64x8) StoreMasked(y *[8]uint64, mask Mask64x8) -// Mask8x64 is a 512-bit SIMD vector of 64 int8 +// Mask8x64 is a mask for a SIMD vector of 64 8-bit elements. type Mask8x64 struct { int8x64 v512 vals [64]int8 @@ -1091,7 +1091,7 @@ func Mask8x64FromBits(y uint64) Mask8x64 // Asm: KMOVB, CPU Features: AVX512 func (x Mask8x64) ToBits() uint64 -// Mask16x32 is a 512-bit SIMD vector of 32 int16 +// Mask16x32 is a mask for a SIMD vector of 32 16-bit elements. type Mask16x32 struct { int16x32 v512 vals [32]int16 @@ -1107,7 +1107,7 @@ func Mask16x32FromBits(y uint32) Mask16x32 // Asm: KMOVW, CPU Features: AVX512 func (x Mask16x32) ToBits() uint32 -// Mask32x16 is a 512-bit SIMD vector of 16 int32 +// Mask32x16 is a mask for a SIMD vector of 16 32-bit elements. type Mask32x16 struct { int32x16 v512 vals [16]int32 @@ -1123,7 +1123,7 @@ func Mask32x16FromBits(y uint16) Mask32x16 // Asm: KMOVD, CPU Features: AVX512 func (x Mask32x16) ToBits() uint16 -// Mask64x8 is a 512-bit SIMD vector of 8 int64 +// Mask64x8 is a mask for a SIMD vector of 8 64-bit elements. type Mask64x8 struct { int64x8 v512 vals [8]int64 diff --git a/src/simd/archsimd/unsafe_helpers.go b/src/simd/archsimd/unsafe_helpers.go index 0123ad77c5..7b98053e70 100644 --- a/src/simd/archsimd/unsafe_helpers.go +++ b/src/simd/archsimd/unsafe_helpers.go @@ -1,4 +1,4 @@ -// Code generated by 'go run genfiles.go'; DO NOT EDIT. +// Code generated by 'tmplgen'; DO NOT EDIT. 
//go:build goexperiment.simd diff --git a/src/time/export_test.go b/src/time/export_test.go index a4940d12f9..78ce2ad00d 100644 --- a/src/time/export_test.go +++ b/src/time/export_test.go @@ -40,6 +40,7 @@ var ( Tzset = tzset TzsetName = tzsetName TzsetOffset = tzsetOffset + AsynctimerChan = asynctimerchan ) func LoadFromEmbeddedTZData(zone string) (string, error) { diff --git a/src/time/tick_test.go b/src/time/tick_test.go index dcbbcdb145..9b39d28143 100644 --- a/src/time/tick_test.go +++ b/src/time/tick_test.go @@ -266,6 +266,10 @@ func BenchmarkTickerResetNaive(b *testing.B) { } func TestTimerGC(t *testing.T) { + if AsynctimerChan.Value() == "1" { + t.Skip("skipping TestTimerGC with asynctimerchan=1") + } + run := func(t *testing.T, what string, f func()) { t.Helper() t.Run(what, func(t *testing.T) { diff --git a/test/cmplxdivide.go b/test/cmplxdivide.go index 49cd5bf582..4b8d549fc4 100644 --- a/test/cmplxdivide.go +++ b/test/cmplxdivide.go @@ -35,7 +35,7 @@ func main() { fmt.Printf("BUG\n") bad = true } - fmt.Printf("%v/%v: expected %v error; got %v\n", t.f, t.g, t.out, x) + fmt.Printf("%v/%v: got %v, want %v\n", t.f, t.g, x, t.out) } } if bad { diff --git a/test/codegen/bits.go b/test/codegen/bits.go index 39969dcdb2..d9c567b078 100644 --- a/test/codegen/bits.go +++ b/test/codegen/bits.go @@ -8,274 +8,387 @@ package codegen import "math/bits" -/************************************ - * 64-bit instructions - ************************************/ +// +// 64 bit instructions +// -func bitcheck64_constleft(a uint64) (n int) { - // amd64:"BTQ [$]63" +func bitsCheckConstLeftShiftU64(a uint64) (n int) { + // amd64:"BTQ [$]63," + // arm64:"TBNZ [$]63," + // riscv64:"MOV [$]" "AND" "BNEZ" if a&(1<<63) != 0 { return 1 } - // amd64:"BTQ [$]60" + // amd64:"BTQ [$]60," + // arm64:"TBNZ [$]60," + // riscv64:"MOV [$]" "AND" "BNEZ" if a&(1<<60) != 0 { return 1 } - // amd64:"BTL [$]0" + // amd64:"BTL [$]0," + // arm64:"TBZ [$]0," + // riscv64:"ANDI" "BEQZ" if a&(1<<0) != 0 { 
return 1 } return 0 } -func bitcheck64_constright(a [8]uint64) (n int) { - // amd64:"BTQ [$]63" +func bitsCheckConstRightShiftU64(a [8]uint64) (n int) { + // amd64:"BTQ [$]63," + // arm64:"LSR [$]63," "TBNZ [$]0," + // riscv64:"SRLI" "ANDI" "BNEZ" if (a[0]>>63)&1 != 0 { return 1 } - // amd64:"BTQ [$]63" + // amd64:"BTQ [$]63," + // arm64:"LSR [$]63," "CBNZ" + // riscv64:"SRLI" "BNEZ" if a[1]>>63 != 0 { return 1 } - // amd64:"BTQ [$]63" + // amd64:"BTQ [$]63," + // arm64:"LSR [$]63," "CBZ" + // riscv64:"SRLI" "BEQZ" if a[2]>>63 == 0 { return 1 } - // amd64:"BTQ [$]60" + // amd64:"BTQ [$]60," + // arm64:"LSR [$]60," "TBZ [$]0," + // riscv64:"SRLI", "ANDI" "BEQZ" if (a[3]>>60)&1 == 0 { return 1 } - // amd64:"BTL [$]1" + // amd64:"BTL [$]1," + // arm64:"LSR [$]1," "TBZ [$]0," + // riscv64:"SRLI" "ANDI" "BEQZ" if (a[4]>>1)&1 == 0 { return 1 } - // amd64:"BTL [$]0" + // amd64:"BTL [$]0," + // arm64:"TBZ [$]0," -"LSR" + // riscv64:"ANDI" "BEQZ" -"SRLI" if (a[5]>>0)&1 == 0 { return 1 } - // amd64:"BTL [$]7" + // amd64:"BTL [$]7," + // arm64:"LSR [$]5," "TBNZ [$]2," + // riscv64:"SRLI" "ANDI" "BNEZ" if (a[6]>>5)&4 == 0 { return 1 } return 0 } -func bitcheck64_var(a, b uint64) (n int) { +func bitsCheckVarU64(a, b uint64) (n int) { // amd64:"BTQ" + // arm64:"MOVD [$]1," "LSL" "TST" + // riscv64:"ANDI [$]63," "SLL " "AND " if a&(1<<(b&63)) != 0 { return 1 } - // amd64:"BTQ" -"BT. [$]0" + // amd64:"BTQ" -"BT. 
[$]0," + // arm64:"LSR" "TBZ [$]0," + // riscv64:"ANDI [$]63," "SRL" "ANDI [$]1," if (b>>(a&63))&1 != 0 { return 1 } return 0 } -func bitcheck64_mask(a uint64) (n int) { - // amd64:"BTQ [$]63" +func bitsCheckMaskU64(a uint64) (n int) { + // amd64:"BTQ [$]63," + // arm64:"TBNZ [$]63," + // riscv64:"MOV [$]" "AND" "BNEZ" if a&0x8000000000000000 != 0 { return 1 } - // amd64:"BTQ [$]59" + // amd64:"BTQ [$]59," + // arm64:"TBNZ [$]59," + // riscv64:"MOV [$]" "AND" "BNEZ" if a&0x800000000000000 != 0 { return 1 } - // amd64:"BTL [$]0" + // amd64:"BTL [$]0," + // arm64:"TBZ [$]0," + // riscv64:"ANDI" "BEQZ" if a&0x1 != 0 { return 1 } return 0 } -func biton64(a, b uint64) (n uint64) { +func bitsSetU64(a, b uint64) (n uint64) { // amd64:"BTSQ" + // arm64:"MOVD [$]1," "LSL" "ORR" + // riscv64:"ANDI" "SLL" "OR" n += b | (1 << (a & 63)) - // amd64:"BTSQ [$]63" + // amd64:"BTSQ [$]63," + // arm64:"ORR [$]-9223372036854775808," + // riscv64:"MOV [$]" "OR " n += a | (1 << 63) - // amd64:"BTSQ [$]60" + // amd64:"BTSQ [$]60," + // arm64:"ORR [$]1152921504606846976," + // riscv64:"MOV [$]" "OR " n += a | (1 << 60) - // amd64:"ORQ [$]1" + // amd64:"ORQ [$]1," + // arm64:"ORR [$]1," + // riscv64:"ORI" n += a | (1 << 0) return n } -func bitoff64(a, b uint64) (n uint64) { +func bitsClearU64(a, b uint64) (n uint64) { // amd64:"BTRQ" + // arm64:"MOVD [$]1," "LSL" "BIC" + // riscv64:"ANDI" "SLL" "ANDN" n += b &^ (1 << (a & 63)) - // amd64:"BTRQ [$]63" + // amd64:"BTRQ [$]63," + // arm64:"AND [$]9223372036854775807," + // riscv64:"MOV [$]" "AND " n += a &^ (1 << 63) - // amd64:"BTRQ [$]60" + // amd64:"BTRQ [$]60," + // arm64:"AND [$]-1152921504606846977," + // riscv64:"MOV [$]" "AND " n += a &^ (1 << 60) // amd64:"ANDQ [$]-2" + // arm64:"AND [$]-2" + // riscv64:"ANDI [$]-2" n += a &^ (1 << 0) return n } -func clearLastBit(x int64, y int32) (int64, int32) { - // amd64:"ANDQ [$]-2" +func bitsClearLowest(x int64, y int32) (int64, int32) { + // amd64:"ANDQ [$]-2," + // arm64:"AND [$]-2," + // 
riscv64:"ANDI [$]-2," a := (x >> 1) << 1 - // amd64:"ANDL [$]-2" + // amd64:"ANDL [$]-2," + // arm64:"AND [$]-2," + // riscv64:"ANDI [$]-2," b := (y >> 1) << 1 return a, b } -func bitcompl64(a, b uint64) (n uint64) { +func bitsFlipU64(a, b uint64) (n uint64) { // amd64:"BTCQ" + // arm64:"MOVD [$]1," "LSL" "EOR" + // riscv64:"ANDI" "SLL" "XOR " n += b ^ (1 << (a & 63)) - // amd64:"BTCQ [$]63" + // amd64:"BTCQ [$]63," + // arm64:"EOR [$]-9223372036854775808," + // riscv64:"MOV [$]" "XOR " n += a ^ (1 << 63) - // amd64:"BTCQ [$]60" + // amd64:"BTCQ [$]60," + // arm64:"EOR [$]1152921504606846976," + // riscv64:"MOV [$]" "XOR " n += a ^ (1 << 60) - // amd64:"XORQ [$]1" + // amd64:"XORQ [$]1," + // arm64:"EOR [$]1," + // riscv64:"XORI [$]1," n += a ^ (1 << 0) return n } -/************************************ - * 32-bit instructions - ************************************/ +// +// 32 bit instructions +// -func bitcheck32_constleft(a uint32) (n int) { - // amd64:"BTL [$]31" +func bitsCheckConstShiftLeftU32(a uint32) (n int) { + // amd64:"BTL [$]31," + // arm64:"TBNZ [$]31," + // riscv64:"MOV [$]" "AND" "BNEZ" if a&(1<<31) != 0 { return 1 } - // amd64:"BTL [$]28" + // amd64:"BTL [$]28," + // arm64:"TBNZ [$]28," + // riscv64:"ANDI" "BNEZ" if a&(1<<28) != 0 { return 1 } - // amd64:"BTL [$]0" + // amd64:"BTL [$]0," + // arm64:"TBZ [$]0," + // riscv64:"ANDI" "BEQZ" if a&(1<<0) != 0 { return 1 } return 0 } -func bitcheck32_constright(a [8]uint32) (n int) { - // amd64:"BTL [$]31" +func bitsCheckConstRightShiftU32(a [8]uint32) (n int) { + // amd64:"BTL [$]31," + // arm64:"UBFX [$]31," "CBNZW" + // riscv64:"SRLI" "ANDI" "BNEZ" if (a[0]>>31)&1 != 0 { return 1 } - // amd64:"BTL [$]31" + // amd64:"BTL [$]31," + // arm64:"UBFX [$]31," "CBNZW" + // riscv64:"SRLI" "BNEZ" if a[1]>>31 != 0 { return 1 } - // amd64:"BTL [$]31" + // amd64:"BTL [$]31," + // arm64:"UBFX [$]31," "CBZW" + // riscv64:"SRLI" "BEQZ" if a[2]>>31 == 0 { return 1 } - // amd64:"BTL [$]28" + // amd64:"BTL [$]28," + // 
arm64:"UBFX [$]28," "TBZ" + // riscv64:"SRLI" "ANDI" "BEQZ" if (a[3]>>28)&1 == 0 { return 1 } - // amd64:"BTL [$]1" + // amd64:"BTL [$]1," + // arm64:"UBFX [$]1," "TBZ" + // riscv64:"SRLI" "ANDI" "BEQZ" if (a[4]>>1)&1 == 0 { return 1 } - // amd64:"BTL [$]0" + // amd64:"BTL [$]0," + // arm64:"TBZ" -"UBFX" -"SRL" + // riscv64:"ANDI" "BEQZ" -"SRLI " if (a[5]>>0)&1 == 0 { return 1 } - // amd64:"BTL [$]7" + // amd64:"BTL [$]7," + // arm64:"UBFX [$]5," "TBNZ" + // riscv64:"SRLI" "ANDI" "BNEZ" if (a[6]>>5)&4 == 0 { return 1 } return 0 } -func bitcheck32_var(a, b uint32) (n int) { +func bitsCheckVarU32(a, b uint32) (n int) { // amd64:"BTL" + // arm64:"AND [$]31," "MOVD [$]1," "LSL" "TSTW" + // riscv64:"ANDI [$]31," "SLL " "AND " if a&(1<<(b&31)) != 0 { return 1 } // amd64:"BTL" -"BT. [$]0" + // arm64:"AND [$]31," "LSR" "TBZ" + // riscv64:"ANDI [$]31," "SRLW " "ANDI [$]1," if (b>>(a&31))&1 != 0 { return 1 } return 0 } -func bitcheck32_mask(a uint32) (n int) { - // amd64:"BTL [$]31" +func bitsCheckMaskU32(a uint32) (n int) { + // amd64:"BTL [$]31," + // arm64:"TBNZ [$]31," + // riscv64:"MOV [$]" "AND" "BNEZ" if a&0x80000000 != 0 { return 1 } - // amd64:"BTL [$]27" + // amd64:"BTL [$]27," + // arm64:"TBNZ [$]27," + // riscv64:"ANDI" "BNEZ" if a&0x8000000 != 0 { return 1 } - // amd64:"BTL [$]0" + // amd64:"BTL [$]0," + // arm64:"TBZ [$]0," + // riscv64:"ANDI" "BEQZ" if a&0x1 != 0 { return 1 } return 0 } -func biton32(a, b uint32) (n uint32) { +func bitsSetU32(a, b uint32) (n uint32) { // amd64:"BTSL" + // arm64:"AND [$]31," "MOVD [$]1," "LSL" "ORR" + // riscv64:"ANDI" "SLL" "OR" n += b | (1 << (a & 31)) - // amd64:"ORL [$]-2147483648" + // amd64:"ORL [$]-2147483648," + // arm64:"ORR [$]-2147483648," + // riscv64:"ORI [$]-2147483648," n += a | (1 << 31) - // amd64:"ORL [$]268435456" + // amd64:"ORL [$]268435456," + // arm64:"ORR [$]268435456," + // riscv64:"ORI [$]268435456," n += a | (1 << 28) - // amd64:"ORL [$]1" + // amd64:"ORL [$]1," + // arm64:"ORR [$]1," + // 
riscv64:"ORI [$]1," n += a | (1 << 0) return n } -func bitoff32(a, b uint32) (n uint32) { +func bitsClearU32(a, b uint32) (n uint32) { // amd64:"BTRL" + // arm64:"AND [$]31," "MOVD [$]1," "LSL" "BIC" + // riscv64:"ANDI" "SLL" "ANDN" n += b &^ (1 << (a & 31)) - // amd64:"ANDL [$]2147483647" + // amd64:"ANDL [$]2147483647," + // arm64:"AND [$]2147483647," + // riscv64:"ANDI [$]2147483647," n += a &^ (1 << 31) - // amd64:"ANDL [$]-268435457" + // amd64:"ANDL [$]-268435457," + // arm64:"AND [$]-268435457," + // riscv64:"ANDI [$]-268435457," n += a &^ (1 << 28) - // amd64:"ANDL [$]-2" + // amd64:"ANDL [$]-2," + // arm64:"AND [$]-2," + // riscv64:"ANDI [$]-2," n += a &^ (1 << 0) return n } -func bitcompl32(a, b uint32) (n uint32) { +func bitsFlipU32(a, b uint32) (n uint32) { // amd64:"BTCL" + // arm64:"AND [$]31," "MOVD [$]1," "LSL" "EOR" + // riscv64:"ANDI" "SLL" "XOR " n += b ^ (1 << (a & 31)) - // amd64:"XORL [$]-2147483648" + // amd64:"XORL [$]-2147483648," + // arm64:"EOR [$]-2147483648," + // riscv64:"XORI [$]-2147483648," n += a ^ (1 << 31) - // amd64:"XORL [$]268435456" + // amd64:"XORL [$]268435456," + // arm64:"EOR [$]268435456," + // riscv64:"XORI [$]268435456," n += a ^ (1 << 28) - // amd64:"XORL [$]1" + // amd64:"XORL [$]1," + // arm64:"EOR [$]1," + // riscv64:"XORI [$]1," n += a ^ (1 << 0) return n } -// check direct operation on memory with constant and shifted constant sources -func bitOpOnMem(a []uint32, b, c, d uint32) { +func bitsOpOnMem(a []uint32, b, c, d uint32) { + // check direct operation on memory with constant + // amd64:`ANDL\s[$]200,\s\([A-Z][A-Z0-9]+\)` a[0] &= 200 // amd64:`ORL\s[$]220,\s4\([A-Z][A-Z0-9]+\)` @@ -284,24 +397,24 @@ func bitOpOnMem(a []uint32, b, c, d uint32) { a[2] ^= 240 } -func bitcheckMostNegative(b uint8) bool { +func bitsCheckMostNegative(b uint8) bool { // amd64:"TESTB" + // arm64:"TSTW" "CSET" + // riscv64:"ANDI [$]128," "SNEZ" -"ADDI" return b&0x80 == 0x80 } -// Check AND masking on arm64 (Issue #19857) - -func 
and_mask_1(a uint64) uint64 { +func bitsIssue19857a(a uint64) uint64 { // arm64:`AND ` return a & ((1 << 63) - 1) } -func and_mask_2(a uint64) uint64 { +func bitsIssue19857b(a uint64) uint64 { // arm64:`AND ` return a & (1 << 63) } -func and_mask_3(a, b uint32) (uint32, uint32) { +func bitsIssue19857c(a, b uint32) (uint32, uint32) { // arm/7:`BIC`,-`AND` a &= 0xffffaaaa // arm/7:`BFC`,-`AND`,-`BIC` @@ -309,34 +422,39 @@ func and_mask_3(a, b uint32) (uint32, uint32) { return a, b } -// Check generation of arm64 BIC/EON/ORN instructions - -func op_bic(x, y uint32) uint32 { +func bitsAndNot(x, y uint32) uint32 { // arm64:`BIC `,-`AND` + // loong64:"ANDN " -"AND " + // riscv64:"ANDN" -"AND " return x &^ y } -func op_eon(x, y, z uint32, a []uint32, n, m uint64) uint64 { +func bitsXorNot(x, y, z uint32, a []uint32, n, m uint64) uint64 { // arm64:`EON `,-`EOR`,-`MVN` + // riscv64:"XNOR " -"MOV [$]" -"XOR" a[0] = x ^ (y ^ 0xffffffff) // arm64:`EON `,-`EOR`,-`MVN` + // riscv64:"XNOR" -"XOR" a[1] = ^(y ^ z) // arm64:`EON `,-`XOR` + // riscv64:"XNOR" -"XOR" -"NOT" a[2] = x ^ ^z // arm64:`EON `,-`EOR`,-`MVN` + // riscv64:"XNOR" -"MOV [$]" -"XOR" return n ^ (m ^ 0xffffffffffffffff) } -func op_orn(x, y uint32) uint32 { - // arm64:`ORN `,-`ORR` - // loong64:"ORN" ,-"OR " +func bitsOrNot(x, y uint32) uint32 { + // arm64:"ORN " -"ORR" + // loong64:"ORN" -"OR " + // riscv64:"ORN" -"OR " return x | ^y } -func op_nor(x int64, a []int64) { +func bitsNotOr(x int64, a []int64) { // loong64: "MOVV [$]0" "NOR R" a[0] = ^(0x1234 | x) // loong64:"NOR" -"XOR" @@ -345,64 +463,60 @@ func op_nor(x int64, a []int64) { a[2] = ^(0x12 | 0x34) } -func op_andn(x, y uint32) uint32 { - // loong64:"ANDN " -"AND " - return x &^ y -} - -// check bitsets -func bitSetPowerOf2Test(x int) bool { +func bitsSetPowerOf2Test(x int) bool { // amd64:"BTL [$]3" + // riscv64:"ANDI [$]8," "SNEZ" -"ADDI" return x&8 == 8 } -func bitSetTest(x int) bool { +func bitsSetTest(x int) bool { // amd64:"ANDL [$]9, AX" // 
amd64:"CMPQ AX, [$]9" + // riscv64:"ANDI [$]9," "ADDI [$]-9," "SEQZ" return x&9 == 9 } -// mask contiguous one bits -func cont1Mask64U(x uint64) uint64 { +func bitsMaskContiguousOnes64U(x uint64) uint64 { // s390x:"RISBGZ [$]16, [$]47, [$]0," return x & 0x0000ffffffff0000 } -// mask contiguous zero bits -func cont0Mask64U(x uint64) uint64 { +func bitsMaskContiguousZeroes64U(x uint64) uint64 { // s390x:"RISBGZ [$]48, [$]15, [$]0," return x & 0xffff00000000ffff } -func issue44228a(a []int64, i int) bool { +func bitsIssue44228a(a []int64, i int) bool { // amd64: "BTQ", -"SHL" return a[i>>6]&(1<<(i&63)) != 0 } -func issue44228b(a []int32, i int) bool { + +func bitsIssue44228b(a []int32, i int) bool { // amd64: "BTL", -"SHL" return a[i>>5]&(1<<(i&31)) != 0 } -func issue48467(x, y uint64) uint64 { +func bitsIssue48467(x, y uint64) uint64 { // arm64: -"NEG" d, borrow := bits.Sub64(x, y, 0) return x - d&(-borrow) } -func foldConst(x, y uint64) uint64 { +func bitsFoldConst(x, y uint64) uint64 { // arm64: "ADDS [$]7" -"MOVD [$]7" // ppc64x: "ADDC [$]7," d, b := bits.Add64(x, 7, 0) return b & d } -func foldConstOutOfRange(a uint64) uint64 { +func bitsFoldConstOutOfRange(a uint64) uint64 { // arm64: "MOVD [$]19088744" -"ADD [$]19088744" return a + 0x1234568 } -// Verify sign-extended values are not zero-extended under a bit mask (#61297) -func signextendAndMask8to64(a int8) (s, z uint64) { +func bitsSignExtendAndMask8to64U(a int8) (s, z uint64) { + // Verify sign-extended values are not zero-extended under a bit mask (#61297) + // ppc64x: "MOVB", "ANDCC [$]1015," s = uint64(a) & 0x3F7 // ppc64x: -"MOVB", "ANDCC [$]247," @@ -410,8 +524,9 @@ func signextendAndMask8to64(a int8) (s, z uint64) { return } -// Verify zero-extended values are not sign-extended under a bit mask (#61297) -func zeroextendAndMask8to64(a int8, b int16) (x, y uint64) { +func bitsZeroExtendAndMask8toU64(a int8, b int16) (x, y uint64) { + // Verify zero-extended values are not sign-extended under a bit mask 
(#61297) + // ppc64x: -"MOVB ", -"ANDCC", "MOVBZ" x = uint64(a) & 0xFF // ppc64x: -"MOVH ", -"ANDCC", "MOVHZ" @@ -419,8 +534,9 @@ func zeroextendAndMask8to64(a int8, b int16) (x, y uint64) { return } -// Verify rotate and mask instructions, and further simplified instructions for small types -func bitRotateAndMask(io64 [8]uint64, io32 [4]uint32, io16 [4]uint16, io8 [4]uint8) { +func bitsRotateAndMask(io64 [8]uint64, io32 [4]uint32, io16 [4]uint16, io8 [4]uint8) { + // Verify rotate and mask instructions, and further simplified instructions for small types + // ppc64x: "RLDICR [$]0, R[0-9]*, [$]47, R" io64[0] = io64[0] & 0xFFFFFFFFFFFF0000 // ppc64x: "RLDICL [$]0, R[0-9]*, [$]16, R" diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go index bcce21e404..0b550adc05 100644 --- a/test/codegen/comparisons.go +++ b/test/codegen/comparisons.go @@ -660,13 +660,13 @@ func equalVarString8(a string) bool { return a[:8] == b } -func equalVarStringNoSpill(a,b string) bool { +func equalVarStringNoSpill(a, b string) bool { s := string("ZZZZZZZZZ") // arm64:".*memequal" memeq1 := a[:9] == s // arm64:-".*" memeq2 := s == a[:9] - // arm64:-"MOVB\tR0,.*SP",".*memequal" + // arm64:-"MOVB R0,.*SP",".*memequal" memeq3 := s == b[:9] return memeq1 && memeq2 && memeq3 } diff --git a/test/codegen/simd.go b/test/codegen/simd.go index 8f3a1a9f46..04e01944de 100644 --- a/test/codegen/simd.go +++ b/test/codegen/simd.go @@ -6,11 +6,14 @@ // These tests check code generation of simd peephole optimizations. 
-//go:build goexperiment.simd +//go:build goexperiment.simd && amd64 package codegen -import "simd/archsimd" +import ( + "math" + "simd/archsimd" +) func vptest1() bool { v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1}) @@ -77,3 +80,27 @@ func simdMaskedMerge() archsimd.Int16x16 { mask := archsimd.Mask16x16FromBits(5) return x.Add(y).Merge(x, mask) // amd64:`VPBLENDVB\s.*$` } + +var nan = math.NaN() +var floats64s = []float64{0, 1, 2, nan, 4, nan, 6, 7, 8, 9, 10, 11, nan, 13, 14, 15} +var sinkInt64s = make([]int64, 100) + +func simdIsNaN() { + x := archsimd.LoadFloat64x4Slice(floats64s) + y := archsimd.LoadFloat64x4Slice(floats64s[4:]) + a := x.IsNaN() + b := y.IsNaN() + // amd64:"VCMPPD [$]3," -"VPOR" + c := a.Or(b) + c.ToInt64x4().StoreSlice(sinkInt64s) +} + +func simdIsNaN512() { + x := archsimd.LoadFloat64x8Slice(floats64s) + y := archsimd.LoadFloat64x8Slice(floats64s[8:]) + a := x.IsNaN() + b := y.IsNaN() + // amd64:"VCMPPD [$]3," -"VPOR" + c := a.Or(b) + c.ToInt64x8().StoreSlice(sinkInt64s) +} diff --git a/test/fixedbugs/issue76950.go b/test/fixedbugs/issue76950.go new file mode 100644 index 0000000000..b5716e0fc6 --- /dev/null +++ b/test/fixedbugs/issue76950.go @@ -0,0 +1,67 @@ +// compile + +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package p + +func MatchLog(input string) bool { + pos := 0 + n := len(input) + matchState := -1 + var c byte + + goto State12 + +State8: + goto State65 + +State12: + if pos >= n { + goto End + } + c = input[pos] + switch { + case c >= 0x09 && c <= 0x0A || c >= 0x0C && c <= 0x0D || c == ' ': + case c >= '0' && c <= '9': + case c >= 'A' && c <= 'Z' || c == '_' || c >= 'b' && c <= 'z': + case c == '[': + goto State8 + case c == 'a': + default: + goto End + } + +State64: + matchState = 179 + if pos >= n { + goto End + } + pos = n + goto State64 + +State65: + +State66: + matchState = 181 + if pos >= n { + goto End + } + pos = n + goto State66 + +End: + if matchState != -1 { + switch matchState { + case 178: + case 156: + case 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175: + case 176, 177, 181, 182, 183: + case 179, 184: + case 180: + } + return true + } + return false +} diff --git a/test/map.go b/test/map.go index 2c1cf8a140..b72fe59bd7 100644 --- a/test/map.go +++ b/test/map.go @@ -431,7 +431,7 @@ func testbasic() { { _, b := mpTi[apT[i]] if b { - panic(fmt.Sprintf("tuple nonexistence decl: mpTi[apt[%d]]", i)) + panic(fmt.Sprintf("tuple nonexistence decl: mpTi[apT[%d]]", i)) } _, b = mpTi[apT[i]] if b { diff --git a/test/stringrange.go b/test/stringrange.go index 99e5edb5a4..d98013b876 100644 --- a/test/stringrange.go +++ b/test/stringrange.go @@ -59,7 +59,7 @@ func main() { for _, c := range "a\xed\xa0\x80a" { if c != 'a' && c != utf8.RuneError { - fmt.Printf("surrogate UTF-8 does not error: %U\n", c) + fmt.Printf("surrogate UTF-8 does not produce an error: %U\n", c) ok = false } } diff --git a/test/typeparam/typelist.go b/test/typeparam/typelist.go index cd8ef7d6e7..b3226301fb 100644 --- a/test/typeparam/typelist.go +++ b/test/typeparam/typelist.go @@ -32,7 +32,7 @@ func _[T interface{ ~int }](x T) { var _ T = T(myint(42)) } -// Indexing a generic type which has a an array as core type. 
+// Indexing a generic type which has an array as core type. func _[T interface{ ~[10]int }](x T) { _ = x[9] // ok } diff --git a/test/uintptrescapes.dir/main.go b/test/uintptrescapes.dir/main.go index afda6218ad..0ccb18f9ff 100644 --- a/test/uintptrescapes.dir/main.go +++ b/test/uintptrescapes.dir/main.go @@ -49,7 +49,7 @@ func main() { defer wg.Done() b := F1() if b != 42 { - fmt.Printf("F1: got %d, expected 42\n", b) + fmt.Printf("F1: got %d, want 42\n", b) c <- false } }() @@ -58,7 +58,7 @@ func main() { defer wg.Done() b := F2() if b != 42 { - fmt.Printf("F2: got %d, expected 42\n", b) + fmt.Printf("F2: got %d, want 42\n", b) c <- false } }() @@ -67,7 +67,7 @@ func main() { defer wg.Done() b := M1() if b != 42 { - fmt.Printf("M1: got %d, expected 42\n", b) + fmt.Printf("M1: got %d, want 42\n", b) c <- false } }() @@ -76,7 +76,7 @@ func main() { defer wg.Done() b := M2() if b != 42 { - fmt.Printf("M2: got %d, expected 42\n", b) + fmt.Printf("M2: got %d, want 42\n", b) c <- false } }() |
